import io

import streamlit as st
from cairosvg import svg2png
from PIL import Image


def get_entity_explanation(label):
"""Return explanation for named entity labels"""
explanations = {
'PERSON': 'People, including fictional',
'NORP': 'Nationalities, religious or political groups',
'FAC': 'Buildings, airports, highways, bridges, etc.',
'ORG': 'Companies, agencies, institutions, etc.',
'GPE': 'Countries, cities, states',
'LOC': 'Non-GPE locations, mountain ranges, water bodies',
'PRODUCT': 'Objects, vehicles, foods, etc.',
'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
'WORK_OF_ART': 'Titles of books, songs, etc.',
'DATE': 'Absolute or relative dates or periods',
'TIME': 'Times smaller than a day',
'MONEY': 'Monetary values, including unit',
        'QUANTITY': 'Measurements, as of weight or distance',
        'PERCENT': 'Percentage, including "%"',
        'ORDINAL': '"first", "second", etc.',
        'CARDINAL': 'Numerals that do not fall under another type',
        'LAW': 'Named documents made into laws',
        'LANGUAGE': 'Any named language'
    }
    return explanations.get(label, 'Other type of entity')


def analyze_text(nlp, text):
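    """Run the spaCy pipeline over `text` and collect token attributes,
    named entities, noun chunks, and summary statistics."""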
doc = nlp(text)
# Basic tokenization and POS analysis
tokens = [{
'Text': token.text,
'Lemma': token.lemma_,
'POS': token.pos_,
'Tag': token.tag_,
'Dependency': token.dep_,
'Shape': token.shape_,
'Is Alpha': token.is_alpha,
'Is Stop': token.is_stop
} for token in doc]
# Named Entity Recognition
entities = [{
'Text': ent.text,
'Label': ent.label_,
'Explanation': get_entity_explanation(ent.label_),
'Start': ent.start_char,
'End': ent.end_char
} for ent in doc.ents]
# Noun Chunks (phrases)
noun_chunks = [{
'Text': chunk.text,
'Root Text': chunk.root.text,
'Root Dep': chunk.root.dep_,
'Root Head Text': chunk.root.head.text
} for chunk in doc.noun_chunks]
    # Text statistics; precompute counts and guard against empty input
    word_count = len([token for token in doc if not token.is_punct])
    sentence_count = len(list(doc.sents))
    stats = {
        'Word Count': word_count,
        'Sentence Count': sentence_count,
        'Average Words per Sentence': round(word_count / sentence_count, 2) if sentence_count else 0.0,
        'Unique Words': len({token.text.lower() for token in doc if token.is_alpha}),
        'Stop Words %': round(len([token for token in doc if token.is_stop]) / len(doc) * 100, 2) if len(doc) else 0.0
    }
    return tokens, entities, noun_chunks, stats, doc


def svg_to_png(svg_content, background_color='white'):
"""Convert SVG to PNG with specified background color"""
    # Split multiple SVGs if present (the caller is expected to join
    # per-sentence SVGs with '<br><br>'; see the usage sketch below)
    svg_parts = svg_content.split('<br><br>')
images = []
for svg in svg_parts:
# Add SVG namespace if missing
        if 'xmlns="http://www.w3.org/2000/svg"' not in svg:
            svg = svg.replace('<svg', '<svg xmlns="http://www.w3.org/2000/svg"')
try:
# Convert SVG to PNG bytes
png_bytes = svg2png(bytestring=svg.encode('utf-8'),
background_color=background_color,
scale=1)
# Create PIL Image from PNG bytes
img = Image.open(io.BytesIO(png_bytes))
            # Flatten RGBA onto the requested background color
            if img.mode == 'RGBA':
                background = Image.new('RGB', img.size, background_color)
                background.paste(img, mask=img.split()[3])  # alpha channel as paste mask
                img = background
            # Add vertical padding above and below the rendered image
            padding = 20  # pixels
img_with_padding = Image.new('RGB',
(img.width, img.height + padding * 2),
background_color)
img_with_padding.paste(img, (0, padding))
images.append(img_with_padding)
        except Exception as e:
            st.error(f"Error converting SVG to PNG: {e}")
            continue
if not images:
return None
# Combine images vertically if there are multiple
if len(images) > 1:
# Calculate total height and max width
total_height = sum(img.height for img in images)
max_width = max(img.width for img in images)
# Create new image to hold all sentences
combined = Image.new('RGB', (max_width, total_height), background_color)
# Paste each image
y_offset = 0
for img in images:
# Center image horizontally
x_offset = (max_width - img.width) // 2
combined.paste(img, (x_offset, y_offset))
y_offset += img.height
else:
combined = images[0]
# Convert to bytes for Streamlit
img_byte_arr = io.BytesIO()
combined.save(img_byte_arr, format='PNG')
img_byte_arr.seek(0)
return img_byte_arr.getvalue()
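

# --- Usage sketch (illustrative, not part of the app) ---
# A minimal sketch of how these helpers fit together, assuming the
# en_core_web_sm model is installed and displacy supplies the
# dependency-parse SVGs. Rendering each sentence separately and joining
# with '<br><br>' matches the delimiter svg_to_png splits on above.
if __name__ == "__main__":
    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")
    text = "Apple is looking at buying U.K. startup for $1 billion. It opened a new office in London."
    tokens, entities, noun_chunks, stats, doc = analyze_text(nlp, text)
    print(stats)

    # Render one SVG per sentence, then rasterize them into a single PNG
    svg = '<br><br>'.join(displacy.render(sent, style='dep') for sent in doc.sents)
    png_bytes = svg_to_png(svg)
    if png_bytes:
        with open('dependency_parse.png', 'wb') as f:
            f.write(png_bytes)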