File size: 7,295 Bytes
ad57a01
 
d6d5bda
ad57a01
 
d6d5bda
 
ad57a01
d6d5bda
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
ad57a01
d6d5bda
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
ad57a01
d6d5bda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import streamlit as st
import spacy
import graphviz
import pandas as pd
import base64
import shutil
import subprocess

# Load English language model for spaCy
# NOTE(review): requires the model to be installed beforehand
# (`python -m spacy download en_core_web_md`); spacy.load raises OSError at
# import time otherwise — presumably intentional fail-fast at startup, confirm.
nlp = spacy.load('en_core_web_md')

def check_graphviz_installation():
    """Return True when the Graphviz ``dot`` executable is present and runnable.

    The binary is first located on PATH; if found, it is actually invoked
    (``dot -V``) to confirm it executes, since a stale PATH entry could
    point at a broken install.
    """
    dot_path = shutil.which('dot')
    if dot_path is None:
        return False
    try:
        subprocess.run(['dot', '-V'], capture_output=True, check=True)
    except (subprocess.SubprocessError, OSError):
        return False
    return True

def identify_clauses(doc):
    """
    Identify clauses in the sentence using spaCy, correctly separating
    dependent and independent clauses.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        A parsed document (assumed to contain one sentence — TODO confirm
        with callers; multiple ROOTs would only report the first).

    Returns
    -------
    list[dict]
        Dicts with ``"Type"`` and ``"Text"`` keys. The independent clause
        (root subtree minus subordinate spans, commas/periods stripped)
        comes first when a ROOT exists, followed by subordinate clauses in
        document order.
    """
    # Map spaCy dependency labels to human-readable clause types.
    clause_labels = {
        "ccomp": "Complement Clause",
        "xcomp": "Open Complement Clause",
        "advcl": "Adverbial Clause",
        "relcl": "Adjective Clause",
    }

    clauses = []

    # First identify all subordinate clauses and their spans.
    subordinate_spans = []
    for token in doc:
        if token.dep_ in clause_labels:
            span = doc[token.left_edge.i:token.right_edge.i + 1]
            subordinate_spans.append({
                "span": span,
                "type": clause_labels[token.dep_],
            })

    # Find the root of the dependency tree.
    root = next((token for token in doc if token.dep_ == "ROOT"), None)

    if root is not None:
        # Main clause = the root's subtree minus every subordinate-clause token.
        main_clause_tokens = set(root.subtree)
        for sub_clause in subordinate_spans:
            main_clause_tokens.difference_update(sub_clause["span"])

        # BUG FIX: the previous version reordered the surviving words by
        # sorting their *texts* with the index of the first doc token having
        # the same text as the key. Any sentence with a repeated word was
        # misordered (all duplicates shared one key), and the lookup was
        # O(n^2). Sorting the token objects by their document index is
        # correct and cheap.
        ordered_tokens = sorted(main_clause_tokens, key=lambda t: t.i)
        main_clause_text = " ".join(t.text for t in ordered_tokens)
        main_clause_text = main_clause_text.strip().replace(",", "").replace(".", "")
        clauses.append({"Type": "Independent Clause", "Text": main_clause_text})

    # Add the subordinate clauses after the main clause.
    for sub_clause in subordinate_spans:
        clauses.append({
            "Type": sub_clause["type"],
            "Text": sub_clause["span"].text,
        })

    return clauses

def analyze_clause_functions(doc):
    """
    Analyze the function of each clause.

    Emits one ``{"Type", "Function"}`` dict per clause-bearing token
    (ROOT / ccomp / xcomp / advcl / relcl), in document order.
    """
    # Dependency label -> (clause type, grammatical function) lookup table.
    role_by_dep = {
        "ROOT": ("Independent Clause", "Express the primary action or state"),
        "ccomp": ("Complement Clause", "Acts as object of the main verb"),
        "xcomp": ("Open Complement Clause", "Predicate complement without its own subject"),
        "advcl": ("Adverbial Clause", "Modifies the verb like an adverb"),
        "relcl": ("Adjective Clause", "Modifies a noun like an adjective"),
    }

    functions = []
    for token in doc:
        role = role_by_dep.get(token.dep_)
        if role is not None:
            clause_type, description = role
            functions.append({"Type": clause_type, "Function": description})
    return functions

def create_dependency_graph(doc):
    """
    Build a graphviz Digraph of the document's dependency tree.

    Returns ``None`` when the Graphviz binaries are not installed. Each
    token becomes a node labelled "<text>\\n(<POS>)"; every non-root token
    gets an edge from its syntactic head labelled with the dependency
    relation.
    """
    if not check_graphviz_installation():
        return None

    graph = graphviz.Digraph(comment='Dependency Tree')

    # One node per token (node ids are the token's document index).
    for token in doc:
        graph.node(str(token.i), f"{token.text}\n({token.pos_})")

    # One edge per head->dependent relation; the root is its own head
    # and therefore gets no incoming edge.
    for token in doc:
        if token.head is not token:
            graph.edge(str(token.head.i), str(token.i), token.dep_)

    return graph

def get_graph_download_link(dot):
    """
    Return an HTML anchor that downloads the rendered graph as a PDF.

    The PDF bytes are produced in memory by Graphviz and embedded in a
    base64 data URI, so no temp file is written. On failure a plain
    error-message string is returned instead of the link.
    """
    try:
        # Render to PDF entirely in memory.
        pdf_bytes = dot.pipe(format='pdf')
        encoded = base64.b64encode(pdf_bytes).decode()
        return (
            f'<a href="data:application/pdf;base64,{encoded}" '
            f'download="syntax_tree.pdf">Download Syntax Tree (PDF)</a>'
        )
    except Exception as e:
        return f"Error generating download link: {str(e)}"

def main():
    """Streamlit entry point: render the clause-analysis UI and, on demand,
    the clause tables, syntax tree, and POS breakdown for the input text."""

    def show_table(df):
        # Shared dark styling used by every table in the app.
        st.table(df.style.set_properties(**{
            'background-color': 'rgba(0,0,0,0.1)',
            'color': 'white'
        }))

    # Wide layout gives the syntax tree more horizontal room.
    st.set_page_config(layout="wide")
    st.markdown("<h1 style='text-align: center; color: white;'>English Clause Analyzer</h1>", unsafe_allow_html=True)
    st.write("Enter an English sentence to analyze its clauses, their functions, and syntax tree.")

    text = st.text_area("Enter your sentence:", "When I arrived at the station, the train had already left.", height=100)

    # Nothing to do until the button is pressed with non-empty input.
    if not st.button("Analyze") or not text:
        return

    doc = nlp(text)
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Clauses Analysis")
        show_table(pd.DataFrame(identify_clauses(doc)))

        st.subheader("Clause Functions")
        show_table(pd.DataFrame(analyze_clause_functions(doc)))

    with col2:
        st.subheader("Syntax Tree Visualization")
        if not check_graphviz_installation():
            st.error("Graphviz is not installed. Please install it using:")
            st.code("sudo apt-get install graphviz")
            st.markdown("After installation, restart the application.")
        else:
            dot = create_dependency_graph(doc)
            st.graphviz_chart(dot)

            # Add download button for the graph
            st.markdown(get_graph_download_link(dot), unsafe_allow_html=True)

            st.subheader("Part-of-Speech Analysis")
            pos_rows = [{"Word": token.text,
                         "Part of Speech": token.pos_,
                         "Description": spacy.explain(token.pos_)} for token in doc]
            show_table(pd.DataFrame(pos_rows))

if __name__ == "__main__":
    main()