# English Clause Analyzer — Streamlit app (spaCy parsing + Graphviz syntax trees)
import streamlit as st
import spacy
import graphviz
import pandas as pd
import base64
import shutil
import subprocess
# Load English language model for spaCy
# NOTE(review): loads at import time and requires the `en_core_web_md` model
# to be installed beforehand (`python -m spacy download en_core_web_md`);
# spacy.load raises OSError if the model package is missing.
nlp = spacy.load('en_core_web_md')
def check_graphviz_installation():
    """
    Return True when the Graphviz `dot` executable is present and runnable.
    """
    # Cheap PATH lookup first — avoids spawning a process when absent.
    if shutil.which('dot') is None:
        return False
    # Confirm the binary actually executes (`dot -V` prints its version).
    try:
        subprocess.run(['dot', '-V'], capture_output=True, check=True)
    except (subprocess.SubprocessError, OSError):
        return False
    return True
def identify_clauses(doc):
    """
    Identify clauses in a parsed sentence, separating the independent
    (main) clause from dependent (subordinate) clauses.

    Args:
        doc: a spaCy ``Doc`` for a single sentence.

    Returns:
        list[dict]: dicts with "Type" and "Text" keys — the independent
        clause first, followed by each subordinate clause in document order.
    """
    clause_labels = {
        "ccomp": "Complement Clause",
        "xcomp": "Open Complement Clause",
        "advcl": "Adverbial Clause",
        "relcl": "Adjective Clause",
    }
    clauses = []

    # Collect every subordinate clause as the contiguous span covering the
    # full subtree of its head token (left_edge..right_edge).
    subordinate_spans = []
    for token in doc:
        if token.dep_ in clause_labels:
            span = doc[token.left_edge.i:token.right_edge.i + 1]
            subordinate_spans.append({
                "span": span,
                "type": clause_labels[token.dep_],
            })

    # The main clause is the ROOT's subtree minus all subordinate tokens.
    root = next((token for token in doc if token.dep_ == "ROOT"), None)
    if root is not None:
        main_clause_tokens = set(root.subtree)
        for sub_clause in subordinate_spans:
            main_clause_tokens.difference_update(sub_clause["span"])
        # Sort remaining tokens by their document position (token.i).
        # BUGFIX: the previous version sorted by matching token *text*,
        # which mapped every duplicate word to the first occurrence's
        # index and misordered sentences containing repeated words.
        ordered = sorted(main_clause_tokens, key=lambda t: t.i)
        main_clause_text = " ".join(t.text for t in ordered)
        main_clause_text = main_clause_text.strip().replace(",", "").replace(".", "")
        clauses.append({"Type": "Independent Clause", "Text": main_clause_text})

    # Append the subordinate clauses after the main clause.
    for sub_clause in subordinate_spans:
        clauses.append({
            "Type": sub_clause["type"],
            "Text": sub_clause["span"].text,
        })
    return clauses
def analyze_clause_functions(doc):
    """
    Describe the grammatical function of each clause in the sentence.

    Args:
        doc: an iterable of parsed tokens (a spaCy ``Doc``); only each
            token's ``dep_`` label is consulted.

    Returns:
        list[dict]: {"Type", "Function"} entries, one per clause-heading
        token, in document order.
    """
    # Dependency label -> (clause type, functional description).
    descriptions = {
        "ROOT": ("Independent Clause", "Express the primary action or state"),
        "ccomp": ("Complement Clause", "Acts as object of the main verb"),
        "xcomp": ("Open Complement Clause", "Predicate complement without its own subject"),
        "advcl": ("Adverbial Clause", "Modifies the verb like an adverb"),
        "relcl": ("Adjective Clause", "Modifies a noun like an adjective"),
    }
    functions = []
    for token in doc:
        entry = descriptions.get(token.dep_)
        if entry is not None:
            clause_type, role = entry
            functions.append({"Type": clause_type, "Function": role})
    return functions
def create_dependency_graph(doc):
    """
    Build a Graphviz digraph of the sentence's dependency tree.

    Args:
        doc: a parsed spaCy ``Doc``.

    Returns:
        graphviz.Digraph with one node per token (word + POS tag) and one
        labelled edge per head->child arc, or None when Graphviz is not
        installed on the host.
    """
    if not check_graphviz_installation():
        return None
    tree = graphviz.Digraph(comment='Dependency Tree')
    # One node per token, keyed by document index; label shows the surface
    # form and the coarse POS tag.
    for tok in doc:
        tree.node(str(tok.i), f"{tok.text}\n({tok.pos_})")
    # One labelled edge per dependency arc. The ROOT token is its own head
    # and therefore gets no incoming edge.
    for tok in doc:
        if tok.head is not tok:
            tree.edge(str(tok.head.i), str(tok.i), tok.dep_)
    return tree
def get_graph_download_link(dot):
    """
    Render the graph to PDF and wrap it in a base64 data-URI download link.

    Args:
        dot: a graphviz.Digraph (anything exposing ``pipe(format=...)``).

    Returns:
        str: an HTML anchor element embedding the PDF, or an error message
        string when rendering fails.
    """
    try:
        # Render to PDF bytes in memory, then base64-encode for a data URI.
        pdf_bytes = dot.pipe(format='pdf')
        encoded = base64.b64encode(pdf_bytes).decode()
        return (
            f'<a href="data:application/pdf;base64,{encoded}" '
            f'download="syntax_tree.pdf">Download Syntax Tree (PDF)</a>'
        )
    except Exception as e:
        # Surface the failure as text; the caller renders it in the UI.
        return f"Error generating download link: {str(e)}"
def main():
    """
    Streamlit entry point: render the clause-analysis UI.

    Flow: read a sentence from a text area, parse it with the module-level
    spaCy model, then display clause tables (left column) and the
    dependency-tree visualization plus a POS table (right column).
    """
    # Set page to wide mode for better visualization
    st.set_page_config(layout="wide")
    st.markdown("<h1 style='text-align: center; color: white;'>English Clause Analyzer</h1>", unsafe_allow_html=True)
    st.write("Enter an English sentence to analyze its clauses, their functions, and syntax tree.")
    # Input text
    text = st.text_area("Enter your sentence:", "When I arrived at the station, the train had already left.", height=100)
    if st.button("Analyze"):
        if text:
            # Process the text
            doc = nlp(text)
            # Create two columns for layout
            col1, col2 = st.columns(2)
            with col1:
                # Identify clauses
                clauses = identify_clauses(doc)
                st.subheader(f"Clauses Analysis")
                # Convert clauses to DataFrame for better presentation
                df_clauses = pd.DataFrame(clauses)
                st.table(df_clauses.style.set_properties(**{
                    'background-color': 'rgba(0,0,0,0.1)',
                    'color': 'white'
                }))
                # Display clause functions
                functions = analyze_clause_functions(doc)
                st.subheader("Clause Functions")
                df_functions = pd.DataFrame(functions)
                st.table(df_functions.style.set_properties(**{
                    'background-color': 'rgba(0,0,0,0.1)',
                    'color': 'white'
                }))
            with col2:
                # Display dependency visualization
                st.subheader("Syntax Tree Visualization")
                if not check_graphviz_installation():
                    # Graphviz binary missing: show install instructions
                    # instead of the tree.
                    st.error("Graphviz is not installed. Please install it using:")
                    st.code("sudo apt-get install graphviz")
                    st.markdown("After installation, restart the application.")
                else:
                    dot = create_dependency_graph(doc)
                    st.graphviz_chart(dot)
                    # Add download button for the graph
                    st.markdown(get_graph_download_link(dot), unsafe_allow_html=True)
                # Display part-of-speech tags in a table
                st.subheader("Part-of-Speech Analysis")
                pos_data = [{"Word": token.text, "Part of Speech": token.pos_,
                             "Description": spacy.explain(token.pos_)} for token in doc]
                df_pos = pd.DataFrame(pos_data)
                st.table(df_pos.style.set_properties(**{
                    'background-color': 'rgba(0,0,0,0.1)',
                    'color': 'white'
                }))
# Run the app only when executed directly (not when imported).
# Fix: removed a stray trailing "|" after main() that made the file
# a syntax error.
if __name__ == "__main__":
    main()