"""Helpers for spaCy-based text analysis and displaCy SVG-to-PNG conversion."""
import io
import sys

from cairosvg import svg2png
from PIL import Image

# import base64
# Human-readable descriptions for spaCy named-entity labels.
# Hoisted to module level so the dict is built once, not on every call.
_ENTITY_EXPLANATIONS = {
    'PERSON': 'People, including fictional',
    'NORP': 'Nationalities, religious or political groups',
    'FAC': 'Buildings, airports, highways, bridges, etc.',
    'ORG': 'Companies, agencies, institutions, etc.',
    'GPE': 'Countries, cities, states',
    'LOC': 'Non-GPE locations, mountain ranges, water bodies',
    'PRODUCT': 'Objects, vehicles, foods, etc.',
    'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
    'WORK_OF_ART': 'Titles of books, songs, etc.',
    'DATE': 'Absolute or relative dates or periods',
    'TIME': 'Times smaller than a day',
    'MONEY': 'Monetary values, including unit',
    'QUANTITY': 'Measurements, as of weight or distance',
}


def get_entity_explanation(label):
    """Return a human-readable explanation for a named-entity *label*.

    Unknown labels fall back to the generic 'Other type of entity'.
    """
    return _ENTITY_EXPLANATIONS.get(label, 'Other type of entity')
def analyze_text(nlp, text):
    """Run a spaCy pipeline over *text* and collect analysis tables.

    Parameters
    ----------
    nlp : a loaded spaCy ``Language`` pipeline (callable on text).
    text : str
        The raw text to analyze.

    Returns
    -------
    tuple of (tokens, entities, noun_chunks, stats, doc) where the first
    three are lists of dicts, ``stats`` is a summary dict, and ``doc`` is
    the spaCy ``Doc`` object.
    """
    doc = nlp(text)

    # Per-token attributes (tokenization / POS / dependency analysis).
    tokens = [{
        'Text': token.text,
        'Lemma': token.lemma_,
        'POS': token.pos_,
        'Tag': token.tag_,
        'Dependency': token.dep_,
        'Shape': token.shape_,
        'Is Alpha': token.is_alpha,
        'Is Stop': token.is_stop
    } for token in doc]

    # Named Entity Recognition with human-readable label explanations.
    entities = [{
        'Text': ent.text,
        'Label': ent.label_,
        'Explanation': get_entity_explanation(ent.label_),
        'Start': ent.start_char,
        'End': ent.end_char
    } for ent in doc.ents]

    # Noun chunks (base noun phrases) and their syntactic anchors.
    noun_chunks = [{
        'Text': chunk.text,
        'Root Text': chunk.root.text,
        'Root Dep': chunk.root.dep_,
        'Root Head Text': chunk.root.head.text
    } for chunk in doc.noun_chunks]

    # Text statistics. Compute each intermediate once (the original
    # rebuilt the non-punct list and list(doc.sents) twice each) and
    # guard the divisions: empty input would otherwise raise
    # ZeroDivisionError on 'Average Words per Sentence' / 'Stop Words %'.
    sentences = list(doc.sents)
    word_count = sum(1 for token in doc if not token.is_punct)
    stop_count = sum(1 for token in doc if token.is_stop)
    stats = {
        'Word Count': word_count,
        'Sentence Count': len(sentences),
        'Average Words per Sentence': (
            round(word_count / len(sentences), 2) if sentences else 0.0
        ),
        'Unique Words': len({token.text.lower() for token in doc if token.is_alpha}),
        'Stop Words %': (
            round(stop_count / len(doc) * 100, 2) if len(doc) else 0.0
        ),
    }

    return tokens, entities, noun_chunks, stats, doc
def svg_to_png(svg_content, background_color='white'):
    """Convert displaCy SVG markup to a single PNG image.

    Parameters
    ----------
    svg_content : str
        One or more SVG documents separated by '<br><br>'.
    background_color : str
        Fill color for the background and padding (default 'white').

    Returns
    -------
    bytes or None
        PNG-encoded image data, or None if no SVG part could be converted.
    """
    images = []
    for svg in svg_content.split('<br><br>'):
        # cairosvg requires the SVG namespace; displaCy output may omit it.
        if 'xmlns="http://www.w3.org/2000/svg"' not in svg:
            svg = svg.replace('<svg', '<svg xmlns="http://www.w3.org/2000/svg"')
        try:
            png_bytes = svg2png(bytestring=svg.encode('utf-8'),
                                background_color=background_color,
                                scale=1)
            img = Image.open(io.BytesIO(png_bytes))
            # Flatten transparency onto the requested background color.
            if img.mode == 'RGBA':
                background = Image.new('RGB', img.size, background_color)
                background.paste(img, mask=img.split()[3])  # alpha channel as mask
                img = background
            # Add vertical padding around each rendered sentence.
            padding = 20  # pixels
            padded = Image.new('RGB',
                               (img.width, img.height + padding * 2),
                               background_color)
            padded.paste(img, (0, padding))
            images.append(padded)
        except Exception as exc:
            # BUG FIX: the original handler called st.error(), but ``st``
            # (streamlit) is never imported in this module, so any
            # conversion failure raised NameError instead of being
            # reported. Log to stderr and skip this SVG part.
            print(f"Error converting SVG to PNG: {exc}", file=sys.stderr)
            continue

    if not images:
        return None

    if len(images) > 1:
        # Stack all rendered sentences vertically, centered horizontally.
        total_height = sum(img.height for img in images)
        max_width = max(img.width for img in images)
        combined = Image.new('RGB', (max_width, total_height), background_color)
        y_offset = 0
        for img in images:
            x_offset = (max_width - img.width) // 2
            combined.paste(img, (x_offset, y_offset))
            y_offset += img.height
    else:
        combined = images[0]

    # Serialize the combined image to PNG bytes for the caller.
    buffer = io.BytesIO()
    combined.save(buffer, format='PNG')
    return buffer.getvalue()