HuuHuy227 committed
Commit ad57a01 · 1 Parent(s): 8a5400a

init commit
Files changed (4)
  1. Dockerfile +48 -0
  2. app.py +246 -0
  3. requirements.txt +5 -0
  4. utils.py +133 -0
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ # Use Python 3.9 slim image
+ FROM python:3.9-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies for cairosvg
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     python3-dev \
+     python3-pip \
+     python3-setuptools \
+     libcairo2-dev \
+     pkg-config \
+     libcairo2 \
+     libcairo-gobject2 \
+     python3-cairo \
+     libpango1.0-dev \
+     shared-mime-info \
+     mime-support \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt .
+
+ # Install Python packages
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Download spaCy language model
+ RUN python -m spacy download en_core_web_md
+
+ # Copy application files
+ COPY app.py .
+ COPY utils.py .
+
+ # Create and configure streamlit directory
+ RUN mkdir -p /root/.streamlit
+ RUN echo "\
+ [server]\n\
+ enableCORS = false\n\
+ enableXsrfProtection = false\n\
+ " > /root/.streamlit/config.toml
+
+ # Expose port for Streamlit
+ EXPOSE 8501
+
+ # Set entry command
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
app.py ADDED
@@ -0,0 +1,246 @@
+ import streamlit as st
+ import spacy
+ from spacy import displacy
+ import pandas as pd
+ from collections import Counter
+ import plotly.express as px
+ import streamlit.components.v1 as components
+ from utils import analyze_text, svg_to_png
+ import base64
+
+ # Set page to wide mode for better visualization
+ st.set_page_config(layout="wide")
+
+ # Load English language model
+ @st.cache_resource
+ def load_model():
+     return spacy.load('en_core_web_md')
+
+ nlp = load_model()
+
+ # Streamlit UI
+ st.markdown("<h1 style='text-align: center; color: white;'>English Sentence Analyzer</h1>", unsafe_allow_html=True)
+
+ # Text Input and Help side by side
+ col1, col2 = st.columns([3, 1])
+ with col1:
+     text_input = st.text_area(
+         "Enter English text:",
+         "The ambitious startup in Silicon Valley developed an innovative AI system last year. " +
+         "Google and Microsoft showed interest in acquiring the technology for $50 million.",
+         height=200
+     )
+     analyze_button = st.button("Analyze Text")
+
+ with col2:
+     with st.expander("Quick Guide", expanded=True):
+         st.markdown("""
+ 1. Enter your text in the input box
+ 2. Click "Analyze Text" to see:
+    - Sentence structure visualization
+    - Detailed token analysis
+    - Additional analysis in expandable sections
+ 3. Use the mouse wheel to zoom the visualization
+ 4. Click and drag to pan around
+ """)
+
+ if analyze_button:
+     if text_input:
+         tokens, entities, noun_chunks, stats, doc = analyze_text(nlp, text_input)
+
+         # 1. Dependency Parse with improved visualization
+         st.header("Sentence Structure Analysis")
+
+         # Generate sentence visualizations
+         sentences = list(doc.sents)
+         sentence_htmls = []
+         for sent in sentences:
+             sent_html = displacy.render(sent, style="dep", options={
+                 "distance": 120,
+                 "arrow_stroke": 2,
+                 "arrow_width": 8,
+                 "font": "Arial",
+                 "bg": "#ffffff",
+             })
+             # Ensure proper SVG structure
+             if not sent_html.startswith('<?xml'):
+                 sent_html = '<?xml version="1.0" encoding="UTF-8"?>' + sent_html
+             sentence_htmls.append(sent_html)
+
+         doc_html = "<br><br>".join(sentence_htmls)
+
+         # Convert SVG to PNG with error handling
+         png_bytes = svg_to_png(doc_html)
+         if png_bytes is None:
+             st.error("Failed to generate visualization")
+         else:
+             png_b64 = base64.b64encode(png_bytes).decode()
+
+             # CSS for image container
+             css_code = """
+             <style>
+             .image-container {
+                 position: relative;
+                 overflow: hidden;
+                 background: #b4b4b4;
+                 border: 1px solid #ddd;
+                 border-radius: 5px;
+                 margin: 10px 0;
+             }
+             .zoomable-image {
+                 transform-origin: 0 0;
+                 transition: transform 0.1s;
+             }
+             .download-btn {
+                 position: absolute;
+                 right: 10px;
+                 top: 10px;
+                 background: rgba(255, 255, 255, 0.8);
+                 border: 1px solid #ddd;
+                 border-radius: 4px;
+                 padding: 5px 10px;
+                 cursor: pointer;
+             }
+             .download-btn:hover {
+                 background: white;
+             }
+             </style>
+             """
+
+             # HTML and JavaScript for zoom and pan functionality
+             js_code = f"""
+             <div class="image-container" id="imageContainer">
+                 <img src="data:image/png;base64,{png_b64}"
+                      class="zoomable-image"
+                      id="zoomableImage"
+                      style="max-width: 100%;">
+                 <a class="download-btn"
+                    href="data:image/png;base64,{png_b64}"
+                    download="sentence_structure.png">
+                     📥 Download
+                 </a>
+             </div>
+             <script>
+             const container = document.getElementById('imageContainer');
+             const img = document.getElementById('zoomableImage');
+             let scale = 1;
+             let isPanning = false;
+             let startX, startY, translateX = 0, translateY = 0;
+
+             // Zoom functionality
+             container.addEventListener('wheel', (e) => {{
+                 e.preventDefault();
+                 const rect = container.getBoundingClientRect();
+                 const mouseX = e.clientX - rect.left;
+                 const mouseY = e.clientY - rect.top;
+
+                 const delta = e.deltaY * -0.01;
+                 const newScale = Math.max(1, Math.min(scale + delta, 4));
+                 const scaleChange = newScale / scale;
+
+                 translateX = mouseX - (mouseX - translateX) * scaleChange;
+                 translateY = mouseY - (mouseY - translateY) * scaleChange;
+
+                 scale = newScale;
+                 updateTransform();
+             }});
+
+             // Pan functionality
+             container.addEventListener('mousedown', (e) => {{
+                 isPanning = true;
+                 startX = e.clientX - translateX;
+                 startY = e.clientY - translateY;
+                 container.style.cursor = 'grabbing';
+             }});
+
+             container.addEventListener('mousemove', (e) => {{
+                 if (!isPanning) return;
+                 translateX = e.clientX - startX;
+                 translateY = e.clientY - startY;
+                 updateTransform();
+             }});
+
+             container.addEventListener('mouseup', () => {{
+                 isPanning = false;
+                 container.style.cursor = 'grab';
+             }});
+
+             container.addEventListener('mouseleave', () => {{
+                 isPanning = false;
+                 container.style.cursor = 'grab';
+             }});
+
+             function updateTransform() {{
+                 img.style.transform = `translate(${{translateX}}px, ${{translateY}}px) scale(${{scale}})`;
+             }}
+
+             // Initialize
+             container.style.cursor = 'grab';
+             container.style.height = '500px';
+             </script>
+             """
+
+             # Render inside an HTML component so the embedded <script> actually executes
+             components.html(css_code + js_code, height=540)
+
+             # Add caption
+             col1, col2 = st.columns([3, 1])
+             with col1:
+                 st.caption("💡 Tip: Use mouse wheel to zoom, click and drag to pan around")
+
+         # 2. Detailed Token Analysis
+         st.header("Token Analysis")
+         token_df = pd.DataFrame(tokens)
+
+         # Create two columns for token distribution and token details
+         col1, col2 = st.columns([1, 2])
+
+         with col1:
+             # Token distribution visualization
+             pos_counts = Counter([token['POS'] for token in tokens])
+             fig = px.pie(
+                 values=list(pos_counts.values()),
+                 names=list(pos_counts.keys()),
+                 title="Parts of Speech Distribution"
+             )
+             fig.update_layout(height=400)
+             st.plotly_chart(fig, use_container_width=True)
+
+         with col2:
+             st.dataframe(token_df, use_container_width=True)
+
+         # Additional Analysis in Expanders
+         with st.expander("Named Entities"):
+             if entities:
+                 ent_df = pd.DataFrame(entities)
+
+                 # Visualization of entity distribution
+                 entity_counts = Counter([ent['Label'] for ent in entities])
+                 fig = px.bar(
+                     x=list(entity_counts.keys()),
+                     y=list(entity_counts.values()),
+                     title="Distribution of Named Entities",
+                     labels={'x': 'Entity Type', 'y': 'Count'}
+                 )
+                 st.plotly_chart(fig)
+
+                 st.table(ent_df)
+             else:
+                 st.info("No named entities found in the text.")
+
+         with st.expander("Noun Chunks (Phrases)"):
+             if noun_chunks:
+                 st.table(pd.DataFrame(noun_chunks))
+             else:
+                 st.info("No noun chunks found in the text.")
+
+         with st.expander("Text Statistics"):
+             col1, col2, col3 = st.columns(3)
+             with col1:
+                 st.metric("Word Count", stats['Word Count'])
+             with col2:
+                 st.metric("Sentence Count", stats['Sentence Count'])
+             with col3:
+                 st.metric("Unique Words", stats['Unique Words'])
+
+             st.metric("Average Words per Sentence", stats['Average Words per Sentence'])
+             st.metric("Stop Words Percentage", f"{stats['Stop Words %']}%")
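For a quick sanity check of the rendering path above (spaCy parse → per-sentence displacy SVG → `svg_to_png` → PNG), a minimal standalone sketch is shown below. It assumes the `en_core_web_md` model is installed and `utils.py` is importable from the working directory; the output filename is arbitrary and chosen only for illustration.

```python
# Standalone sketch of the dependency-parse rendering pipeline used in app.py:
# spaCy parse -> one displacy SVG per sentence -> utils.svg_to_png -> PNG file.
import spacy
from spacy import displacy

from utils import svg_to_png  # helper defined in utils.py above

nlp = spacy.load("en_core_web_md")  # assumes the model from the Dockerfile step is installed
doc = nlp("The ambitious startup in Silicon Valley developed an innovative AI system last year.")

# Render each sentence as SVG and join the parts the same way app.py does.
svgs = [displacy.render(sent, style="dep", options={"distance": 120}) for sent in doc.sents]
png_bytes = svg_to_png("<br><br>".join(svgs))

if png_bytes is not None:
    with open("sentence_structure.png", "wb") as f:  # arbitrary output path for this sketch
        f.write(png_bytes)
```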
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ spacy
+ pandas
+ plotly
+ cairosvg
utils.py ADDED
@@ -0,0 +1,133 @@
+ import io
+ from cairosvg import svg2png
+ from PIL import Image
+ import streamlit as st  # used by st.error below to surface conversion failures
+
+ def get_entity_explanation(label):
+     """Return explanation for named entity labels"""
+     explanations = {
+         'PERSON': 'People, including fictional',
+         'NORP': 'Nationalities, religious or political groups',
+         'FAC': 'Buildings, airports, highways, bridges, etc.',
+         'ORG': 'Companies, agencies, institutions, etc.',
+         'GPE': 'Countries, cities, states',
+         'LOC': 'Non-GPE locations, mountain ranges, water bodies',
+         'PRODUCT': 'Objects, vehicles, foods, etc.',
+         'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
+         'WORK_OF_ART': 'Titles of books, songs, etc.',
+         'DATE': 'Absolute or relative dates or periods',
+         'TIME': 'Times smaller than a day',
+         'MONEY': 'Monetary values, including unit',
+         'QUANTITY': 'Measurements, as of weight or distance'
+     }
+     return explanations.get(label, 'Other type of entity')
+
+ def analyze_text(nlp, text):
+     doc = nlp(text)
+
+     # Basic tokenization and POS analysis
+     tokens = [{
+         'Text': token.text,
+         'Lemma': token.lemma_,
+         'POS': token.pos_,
+         'Tag': token.tag_,
+         'Dependency': token.dep_,
+         'Shape': token.shape_,
+         'Is Alpha': token.is_alpha,
+         'Is Stop': token.is_stop
+     } for token in doc]
+
+     # Named Entity Recognition
+     entities = [{
+         'Text': ent.text,
+         'Label': ent.label_,
+         'Explanation': get_entity_explanation(ent.label_),
+         'Start': ent.start_char,
+         'End': ent.end_char
+     } for ent in doc.ents]
+
+     # Noun Chunks (phrases)
+     noun_chunks = [{
+         'Text': chunk.text,
+         'Root Text': chunk.root.text,
+         'Root Dep': chunk.root.dep_,
+         'Root Head Text': chunk.root.head.text
+     } for chunk in doc.noun_chunks]
+
+     # Text Statistics
+     stats = {
+         'Word Count': len([token for token in doc if not token.is_punct]),
+         'Sentence Count': len(list(doc.sents)),
+         'Average Words per Sentence': round(len([token for token in doc if not token.is_punct]) / len(list(doc.sents)), 2),
+         'Unique Words': len(set([token.text.lower() for token in doc if token.is_alpha])),
+         'Stop Words %': round(len([token for token in doc if token.is_stop]) / len(doc) * 100, 2)
+     }
+
+     return tokens, entities, noun_chunks, stats, doc
+
+ def svg_to_png(svg_content, background_color='white'):
+     """Convert SVG to PNG with specified background color"""
+     # Split multiple SVGs if present
+     svg_parts = svg_content.split('<br><br>')
+     images = []
+
+     for svg in svg_parts:
+         # Add SVG namespace if missing
+         if 'xmlns="http://www.w3.org/2000/svg"' not in svg:
+             svg = svg.replace('<svg', '<svg xmlns="http://www.w3.org/2000/svg"')
+
+         try:
+             # Convert SVG to PNG bytes
+             png_bytes = svg2png(bytestring=svg.encode('utf-8'),
+                                 background_color=background_color,
+                                 scale=1)
+
+             # Create PIL Image from PNG bytes
+             img = Image.open(io.BytesIO(png_bytes))
+
+             # Convert RGBA to RGB with white background
+             if img.mode == 'RGBA':
+                 background = Image.new('RGB', img.size, background_color)
+                 background.paste(img, mask=img.split()[3])  # Use alpha channel as mask
+                 img = background
+
+             # Add some padding
+             padding = 20  # pixels
+             img_with_padding = Image.new('RGB',
+                                          (img.width, img.height + padding * 2),
+                                          background_color)
+             img_with_padding.paste(img, (0, padding))
+             images.append(img_with_padding)
+
+         except Exception as e:
+             st.error(f"Error converting SVG to PNG: {str(e)}")
+             continue
+
+     if not images:
+         return None
+
+     # Combine images vertically if there are multiple
+     if len(images) > 1:
+         # Calculate total height and max width
+         total_height = sum(img.height for img in images)
+         max_width = max(img.width for img in images)
+
+         # Create new image to hold all sentences
+         combined = Image.new('RGB', (max_width, total_height), background_color)
+
+         # Paste each image
+         y_offset = 0
+         for img in images:
+             # Center image horizontally
+             x_offset = (max_width - img.width) // 2
+             combined.paste(img, (x_offset, y_offset))
+             y_offset += img.height
+     else:
+         combined = images[0]
+
+     # Convert to bytes for Streamlit
+     img_byte_arr = io.BytesIO()
+     combined.save(img_byte_arr, format='PNG')
+     img_byte_arr.seek(0)
+
+     return img_byte_arr.getvalue()
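`analyze_text` can also be exercised outside the Streamlit UI; below is a minimal sketch, assuming the `en_core_web_md` model is installed and `utils.py` is on the import path. It only prints the four tabular outputs and the summary statistics that the app displays.

```python
# Quick, UI-free exercise of utils.analyze_text and its return values.
import spacy
import pandas as pd

from utils import analyze_text

nlp = spacy.load("en_core_web_md")  # assumes the model from the Dockerfile step is installed
tokens, entities, noun_chunks, stats, doc = analyze_text(
    nlp, "Google and Microsoft showed interest in acquiring the technology for $50 million."
)

print(pd.DataFrame(tokens).head())  # per-token POS, tag, dependency, shape, stop-word flags
print(pd.DataFrame(entities))       # named entities with plain-language explanations
print(pd.DataFrame(noun_chunks))    # noun phrases with their syntactic roots
print(stats)                        # word/sentence counts, unique words, stop-word percentage
```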