HuuHuy227 committed
Commit ad57a01 · 1 Parent(s): 8a5400a

init commit
Files changed (4)
  1. Dockerfile +48 -0
  2. app.py +246 -0
  3. requirements.txt +5 -0
  4. utils.py +133 -0
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ # Use Python 3.9 slim image
+ FROM python:3.9-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies for cairosvg
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     python3-dev \
+     python3-pip \
+     python3-setuptools \
+     libcairo2-dev \
+     pkg-config \
+     libcairo2 \
+     libcairo-gobject2 \
+     python3-cairo \
+     libpango1.0-dev \
+     shared-mime-info \
+     mime-support \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt .
+
+ # Install Python packages
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Download spaCy language model
+ RUN python -m spacy download en_core_web_md
+
+ # Copy application files
+ COPY app.py .
+ COPY utils.py .
+
+ # Create and configure streamlit directory
+ RUN mkdir -p /root/.streamlit
+ RUN echo "\
+ [server]\n\
+ enableCORS = false\n\
+ enableXsrfProtection = false\n\
+ " > /root/.streamlit/config.toml
+
+ # Expose port for Streamlit
+ EXPOSE 8501
+
+ # Set entry command
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
app.py ADDED
@@ -0,0 +1,246 @@
+ import streamlit as st
+ import spacy
+ from spacy import displacy
+ import pandas as pd
+ from collections import Counter
+ import plotly.express as px
+ import streamlit.components.v1 as components
+ from utils import analyze_text, svg_to_png
+ import base64
+
+ # Set page to wide mode for better visualization
+ st.set_page_config(layout="wide")
+
+ # Load English language model
+ @st.cache_resource
+ def load_model():
+     return spacy.load('en_core_web_md')
+
+ nlp = load_model()
+
+ # Streamlit UI
+ st.markdown("<h1 style='text-align: center; color: white;'>English Sentence Analyzer</h1>", unsafe_allow_html=True)
+
+ # Text Input and Help side by side
+ col1, col2 = st.columns([3, 1])
+ with col1:
+     text_input = st.text_area(
+         "Enter English text:",
+         "The ambitious startup in Silicon Valley developed an innovative AI system last year. " +
+         "Google and Microsoft showed interest in acquiring the technology for $50 million.",
+         height=200
+     )
+     analyze_button = st.button("Analyze Text")
+
+ with col2:
+     with st.expander("Quick Guide", expanded=True):
+         st.markdown("""
+ 1. Enter your text in the input box
+ 2. Click "Analyze Text" to see:
+    - Sentence structure visualization
+    - Detailed token analysis
+    - Additional analysis in expandable sections
+ 3. Use the mouse wheel to zoom the visualization
+ 4. Click and drag to pan around
+ """)
+
+ if analyze_button:
+     if text_input:
+         tokens, entities, noun_chunks, stats, doc = analyze_text(nlp, text_input)
+
+         # 1. Dependency Parse with improved visualization
+         st.header("Sentence Structure Analysis")
+
+         # Generate sentence visualizations
+         sentences = list(doc.sents)
+         sentence_htmls = []
+         for sent in sentences:
+             sent_html = displacy.render(sent, style="dep", options={
+                 "distance": 120,
+                 "arrow_stroke": 2,
+                 "arrow_width": 8,
+                 "font": "Arial",
+                 "bg": "#ffffff",
+             })
+             # Ensure proper SVG structure
+             if not sent_html.startswith('<?xml'):
+                 sent_html = '<?xml version="1.0" encoding="UTF-8"?>' + sent_html
+             sentence_htmls.append(sent_html)
+
+         doc_html = "<br><br>".join(sentence_htmls)
+
+         # Convert SVG to PNG with error handling
+         png_bytes = svg_to_png(doc_html)
+         if png_bytes is None:
+             st.error("Failed to generate visualization")
+         else:
+             png_b64 = base64.b64encode(png_bytes).decode()
+
+             # CSS for image container
+             css_code = """
+             <style>
+             .image-container {
+                 position: relative;
+                 overflow: hidden;
+                 background: #b4b4b4;
+                 border: 1px solid #ddd;
+                 border-radius: 5px;
+                 margin: 10px 0;
+             }
+             .zoomable-image {
+                 transform-origin: 0 0;
+                 transition: transform 0.1s;
+             }
+             .download-btn {
+                 position: absolute;
+                 right: 10px;
+                 top: 10px;
+                 background: rgba(255, 255, 255, 0.8);
+                 border: 1px solid #ddd;
+                 border-radius: 4px;
+                 padding: 5px 10px;
+                 cursor: pointer;
+             }
+             .download-btn:hover {
+                 background: white;
+             }
+             </style>
+             """
+
+             # HTML and JavaScript for zoom and pan functionality
+             js_code = f"""
+             <div class="image-container" id="imageContainer">
+                 <img src="data:image/png;base64,{png_b64}"
+                      class="zoomable-image"
+                      id="zoomableImage"
+                      style="max-width: 100%;">
+                 <a class="download-btn"
+                    href="data:image/png;base64,{png_b64}"
+                    download="sentence_structure.png">
+                     📥 Download
+                 </a>
+             </div>
+             <script>
+             const container = document.getElementById('imageContainer');
+             const img = document.getElementById('zoomableImage');
+             let scale = 1;
+             let isPanning = false;
+             let startX, startY, translateX = 0, translateY = 0;
+
+             // Zoom functionality
+             container.addEventListener('wheel', (e) => {{
+                 e.preventDefault();
+                 const rect = container.getBoundingClientRect();
+                 const mouseX = e.clientX - rect.left;
+                 const mouseY = e.clientY - rect.top;
+
+                 const delta = e.deltaY * -0.01;
+                 const newScale = Math.max(1, Math.min(scale + delta, 4));
+                 const scaleChange = newScale / scale;
+
+                 translateX = mouseX - (mouseX - translateX) * scaleChange;
+                 translateY = mouseY - (mouseY - translateY) * scaleChange;
+
+                 scale = newScale;
+                 updateTransform();
+             }});
+
+             // Pan functionality
+             container.addEventListener('mousedown', (e) => {{
+                 isPanning = true;
+                 startX = e.clientX - translateX;
+                 startY = e.clientY - translateY;
+                 container.style.cursor = 'grabbing';
+             }});
+
+             container.addEventListener('mousemove', (e) => {{
+                 if (!isPanning) return;
+                 translateX = e.clientX - startX;
+                 translateY = e.clientY - startY;
+                 updateTransform();
+             }});
+
+             container.addEventListener('mouseup', () => {{
+                 isPanning = false;
+                 container.style.cursor = 'grab';
+             }});
+
+             container.addEventListener('mouseleave', () => {{
+                 isPanning = false;
+                 container.style.cursor = 'grab';
+             }});
+
+             function updateTransform() {{
+                 img.style.transform = `translate(${{translateX}}px, ${{translateY}}px) scale(${{scale}})`;
+             }}
+
+             // Initialize
+             container.style.cursor = 'grab';
+             container.style.height = '500px';
+             </script>
+             """
+
+             # Render inside an HTML component so the embedded <script> actually executes
+             components.html(css_code + js_code, height=540)
+
+             # Add caption
+             col1, col2 = st.columns([3, 1])
+             with col1:
+                 st.caption("💡 Tip: Use mouse wheel to zoom, click and drag to pan around")
+
+         # 2. Detailed Token Analysis
+         st.header("Token Analysis")
+         token_df = pd.DataFrame(tokens)
+
+         # Create two columns for token distribution and token details
+         col1, col2 = st.columns([1, 2])
+
+         with col1:
+             # Token distribution visualization
+             pos_counts = Counter([token['POS'] for token in tokens])
+             fig = px.pie(
+                 values=list(pos_counts.values()),
+                 names=list(pos_counts.keys()),
+                 title="Parts of Speech Distribution"
+             )
+             fig.update_layout(height=400)
+             st.plotly_chart(fig, use_container_width=True)
+
+         with col2:
+             st.dataframe(token_df, use_container_width=True)
+
+         # Additional Analysis in Expanders
+         with st.expander("Named Entities"):
+             if entities:
+                 ent_df = pd.DataFrame(entities)
+
+                 # Visualization of entity distribution
+                 entity_counts = Counter([ent['Label'] for ent in entities])
+                 fig = px.bar(
+                     x=list(entity_counts.keys()),
+                     y=list(entity_counts.values()),
+                     title="Distribution of Named Entities",
+                     labels={'x': 'Entity Type', 'y': 'Count'}
+                 )
+                 st.plotly_chart(fig)
+
+                 st.table(ent_df)
+             else:
+                 st.info("No named entities found in the text.")
+
+         with st.expander("Noun Chunks (Phrases)"):
+             if noun_chunks:
+                 st.table(pd.DataFrame(noun_chunks))
+             else:
+                 st.info("No noun chunks found in the text.")
+
+         with st.expander("Text Statistics"):
+             col1, col2, col3 = st.columns(3)
+             with col1:
+                 st.metric("Word Count", stats['Word Count'])
+             with col2:
+                 st.metric("Sentence Count", stats['Sentence Count'])
+             with col3:
+                 st.metric("Unique Words", stats['Unique Words'])
+
+             st.metric("Average Words per Sentence", stats['Average Words per Sentence'])
+             st.metric("Stop Words Percentage", f"{stats['Stop Words %']}%")
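For a quick sanity check of the rendering path above (spaCy parse → per-sentence displacy SVG → `svg_to_png` → PNG), a minimal standalone sketch is shown below. It assumes the `en_core_web_md` model is installed and `utils.py` is importable from the working directory; the output filename is arbitrary and chosen only for illustration.

```python
# Standalone sketch of the dependency-parse rendering pipeline used in app.py:
# spaCy parse -> one displacy SVG per sentence -> utils.svg_to_png -> PNG file.
import spacy
from spacy import displacy

from utils import svg_to_png  # helper defined in utils.py above

nlp = spacy.load("en_core_web_md")  # assumes the model from the Dockerfile step is installed
doc = nlp("The ambitious startup in Silicon Valley developed an innovative AI system last year.")

# Render each sentence as SVG and join the parts the same way app.py does.
svgs = [displacy.render(sent, style="dep", options={"distance": 120}) for sent in doc.sents]
png_bytes = svg_to_png("<br><br>".join(svgs))

if png_bytes is not None:
    with open("sentence_structure.png", "wb") as f:  # arbitrary output path for this sketch
        f.write(png_bytes)
```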
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ spacy
+ pandas
+ plotly
+ cairosvg
utils.py ADDED
@@ -0,0 +1,133 @@
+ import io
+ from cairosvg import svg2png
+ from PIL import Image
+ import streamlit as st  # used by st.error below to surface conversion failures
+
+ def get_entity_explanation(label):
+     """Return explanation for named entity labels"""
+     explanations = {
+         'PERSON': 'People, including fictional',
+         'NORP': 'Nationalities, religious or political groups',
+         'FAC': 'Buildings, airports, highways, bridges, etc.',
+         'ORG': 'Companies, agencies, institutions, etc.',
+         'GPE': 'Countries, cities, states',
+         'LOC': 'Non-GPE locations, mountain ranges, water bodies',
+         'PRODUCT': 'Objects, vehicles, foods, etc.',
+         'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
+         'WORK_OF_ART': 'Titles of books, songs, etc.',
+         'DATE': 'Absolute or relative dates or periods',
+         'TIME': 'Times smaller than a day',
+         'MONEY': 'Monetary values, including unit',
+         'QUANTITY': 'Measurements, as of weight or distance'
+     }
+     return explanations.get(label, 'Other type of entity')
+
+ def analyze_text(nlp, text):
+     doc = nlp(text)
+
+     # Basic tokenization and POS analysis
+     tokens = [{
+         'Text': token.text,
+         'Lemma': token.lemma_,
+         'POS': token.pos_,
+         'Tag': token.tag_,
+         'Dependency': token.dep_,
+         'Shape': token.shape_,
+         'Is Alpha': token.is_alpha,
+         'Is Stop': token.is_stop
+     } for token in doc]
+
+     # Named Entity Recognition
+     entities = [{
+         'Text': ent.text,
+         'Label': ent.label_,
+         'Explanation': get_entity_explanation(ent.label_),
+         'Start': ent.start_char,
+         'End': ent.end_char
+     } for ent in doc.ents]
+
+     # Noun Chunks (phrases)
+     noun_chunks = [{
+         'Text': chunk.text,
+         'Root Text': chunk.root.text,
+         'Root Dep': chunk.root.dep_,
+         'Root Head Text': chunk.root.head.text
+     } for chunk in doc.noun_chunks]
+
+     # Text Statistics
+     stats = {
+         'Word Count': len([token for token in doc if not token.is_punct]),
+         'Sentence Count': len(list(doc.sents)),
+         'Average Words per Sentence': round(len([token for token in doc if not token.is_punct]) / len(list(doc.sents)), 2),
+         'Unique Words': len(set([token.text.lower() for token in doc if token.is_alpha])),
+         'Stop Words %': round(len([token for token in doc if token.is_stop]) / len(doc) * 100, 2)
+     }
+
+     return tokens, entities, noun_chunks, stats, doc
+
+ def svg_to_png(svg_content, background_color='white'):
+     """Convert SVG to PNG with specified background color"""
+     # Split multiple SVGs if present
+     svg_parts = svg_content.split('<br><br>')
+     images = []
+
+     for svg in svg_parts:
+         # Add SVG namespace if missing
+         if 'xmlns="http://www.w3.org/2000/svg"' not in svg:
+             svg = svg.replace('<svg', '<svg xmlns="http://www.w3.org/2000/svg"')
+
+         try:
+             # Convert SVG to PNG bytes
+             png_bytes = svg2png(bytestring=svg.encode('utf-8'),
+                                 background_color=background_color,
+                                 scale=1)
+
+             # Create PIL Image from PNG bytes
+             img = Image.open(io.BytesIO(png_bytes))
+
+             # Convert RGBA to RGB with white background
+             if img.mode == 'RGBA':
+                 background = Image.new('RGB', img.size, background_color)
+                 background.paste(img, mask=img.split()[3])  # Use alpha channel as mask
+                 img = background
+
+             # Add some padding
+             padding = 20  # pixels
+             img_with_padding = Image.new('RGB',
+                                          (img.width, img.height + padding * 2),
+                                          background_color)
+             img_with_padding.paste(img, (0, padding))
+             images.append(img_with_padding)
+
+         except Exception as e:
+             st.error(f"Error converting SVG to PNG: {str(e)}")
+             continue
+
+     if not images:
+         return None
+
+     # Combine images vertically if there are multiple
+     if len(images) > 1:
+         # Calculate total height and max width
+         total_height = sum(img.height for img in images)
+         max_width = max(img.width for img in images)
+
+         # Create new image to hold all sentences
+         combined = Image.new('RGB', (max_width, total_height), background_color)
+
+         # Paste each image
+         y_offset = 0
+         for img in images:
+             # Center image horizontally
+             x_offset = (max_width - img.width) // 2
+             combined.paste(img, (x_offset, y_offset))
+             y_offset += img.height
+     else:
+         combined = images[0]
+
+     # Convert to bytes for Streamlit
+     img_byte_arr = io.BytesIO()
+     combined.save(img_byte_arr, format='PNG')
+     img_byte_arr.seek(0)
+
+     return img_byte_arr.getvalue()
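`analyze_text` can also be exercised outside the Streamlit UI; below is a minimal sketch, assuming the `en_core_web_md` model is installed and `utils.py` is on the import path. It only prints the four tabular outputs and the summary statistics that the app displays.

```python
# Quick, UI-free exercise of utils.analyze_text and its return values.
import spacy
import pandas as pd

from utils import analyze_text

nlp = spacy.load("en_core_web_md")  # assumes the model from the Dockerfile step is installed
tokens, entities, noun_chunks, stats, doc = analyze_text(
    nlp, "Google and Microsoft showed interest in acquiring the technology for $50 million."
)

print(pd.DataFrame(tokens).head())  # per-token POS, tag, dependency, shape, stop-word flags
print(pd.DataFrame(entities))       # named entities with plain-language explanations
print(pd.DataFrame(noun_chunks))    # noun phrases with their syntactic roots
print(stats)                        # word/sentence counts, unique words, stop-word percentage
```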