yinuozhang committed on
Commit
45d6af3
1 Parent(s): c5d7b26
Files changed (2) hide show
  1. app.py +279 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ import pandas as pd
4
+ from io import StringIO
5
+
6
def remove_nested_branches(smiles):
    """Return *smiles* with every parenthesised branch removed.

    Only characters that sit outside all ``(...)`` groups are kept; the
    parentheses themselves are dropped as well.

    Args:
        smiles: SMILES string (or any string) to strip.

    Returns:
        The characters of *smiles* found at nesting depth zero, in order.
    """
    kept = []
    depth = 0
    for char in smiles:
        if char == '(':
            depth += 1
        elif char == ')':
            # Clamp at zero: an unmatched ')' must not push the depth
            # negative, otherwise every later top-level character would be
            # silently discarded.
            depth = max(depth - 1, 0)
        elif depth == 0:
            kept.append(char)
    return ''.join(kept)
18
def identify_linkage_type(segment):
    """Classify the backbone linkage found in a SMILES fragment.

    The checks run in a fixed order and the first hit wins: an ester
    (``OC(=O)``) is tested before an N-methylated amide (``N(C)C(=O)``),
    which is tested before a plain amide (``NC(=O)``).

    Args:
        segment: SMILES fragment to inspect.

    Returns:
        Tuple ``(linkage_type, is_n_methylated)`` where ``linkage_type`` is
        ``'ester'``, ``'peptide'`` or ``None`` when no pattern is present.
    """
    if 'OC(=O)' in segment:
        return ('ester', False)
    if 'N(C)C(=O)' in segment:
        return ('peptide', True)  # N-methylated peptide bond
    if 'NC(=O)' in segment:
        return ('peptide', False)  # Regular peptide bond
    return (None, False)
30
def identify_residue(segment, next_segment=None, prev_segment=None):
    """Guess the amino-acid residue encoded by a SMILES fragment.

    Matching is purely substring based and order dependent: the first rule
    that fires decides the residue, so the sequence of checks below must not
    be reordered.

    Args:
        segment: SMILES fragment for the residue being identified.
        next_segment: Fragment that follows *segment*, if any. It is scanned
            for ``N(C)C(=O)`` (adds ``'N-Me'``) and ``OC(=O)`` (adds
            ``'O-linked'``).
        prev_segment: Fragment that precedes *segment*, if any. Only used by
            the contextual Proline check.

    Returns:
        Tuple ``(residue, modifications)``. ``residue`` is a three-letter
        code or ``None`` when nothing matched; ``modifications`` is a list
        of modification labels (possibly empty).
    """
    modifications = []

    # Modifications are read off the linkage that follows this residue.
    if next_segment:
        if 'N(C)C(=O)' in next_segment:
            modifications.append('N-Me')
        if 'OC(=O)' in next_segment:
            modifications.append('O-linked')

    # Proline: look for the CCCN ring pattern with ring labels 1 or 2.
    proline_patterns = ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']
    if any(pattern in segment for pattern in proline_patterns):
        return ('Pro', modifications)

    # Proline ring split across neighbouring fragments: stitch the three
    # fragments back together and look for CCCN followed by a carbonyl.
    if prev_segment and next_segment:
        ring_ahead = 'CCC' in segment and 'N' in next_segment
        ring_behind = 'N' in segment and 'CCC' in prev_segment
        if ring_ahead or ring_behind:
            combined = prev_segment + segment + next_segment
            if re.search(r'CCCN.*C\(=O\)', combined):
                return ('Pro', modifications)

    # Aromatic amino acids
    if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
        return ('Phe', modifications)
    if 'c2ccc(O)cc2' in segment:
        return ('Tyr', modifications)
    if 'c1c[nH]c2ccccc12' in segment:
        return ('Trp', modifications)
    if 'c1cnc[nH]1' in segment:
        return ('His', modifications)

    # Branched chain amino acids
    if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
        return ('Leu', modifications)
    if '[C@H](CC(C)C)' in segment or '[C@@H](CC(C)C)' in segment:
        return ('Leu', modifications)
    longer_side_chains = ['CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]']
    if 'C(C)C' in segment and not any(pat in segment for pat in longer_side_chains):
        return ('Val', modifications)
    if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
        return ('Ile', modifications)

    # Small/polar amino acids
    if ('[C@H](C)' in segment or '[C@@H](C)' in segment) and 'C(C)C' not in segment:
        return ('Ala', modifications)
    # Both chirality tags are accepted, matching the other residue checks.
    if '[C@H](CO)' in segment or '[C@@H](CO)' in segment:
        return ('Ser', modifications)
    if '[C@H](C(C)O)' in segment or '[C@@H](C(C)O)' in segment:
        return ('Thr', modifications)
    if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
        return ('Gly', modifications)

    # NOTE: no other amino acids are recognised by this function; anything
    # else falls through and is reported as unidentified.
    return (None, modifications)
90
def parse_peptide(smiles):
    """Parse a peptide SMILES string into a residue sequence string.

    The SMILES is split on backbone linkage patterns, each remaining
    fragment is handed to ``identify_residue`` together with its
    neighbours, and the recognised residues are joined with ``-``.
    Fragments that cannot be identified are skipped.

    Args:
        smiles: SMILES string of the peptide.

    Returns:
        ``"cyclo(A-B-...)"`` when ``is_cyclic_peptide`` flags the molecule
        as cyclic, otherwise ``"A-B-..."``.
    """
    # Split on peptide bonds while preserving cycle numbers; the capturing
    # group keeps the bond tokens in the result so neighbours can see them.
    bond_pattern = r'(NC\(=O\)|N\(C\)C\(=O\)|N\dC\(=O\)|OC\(=O\))'
    segments = [s for s in re.split(bond_pattern, smiles) if s]

    sequence = []
    last_index = len(segments) - 1
    for i, segment in enumerate(segments):
        # Skip pure bond patterns
        if re.match(r'.*C\(=O\)$', segment):
            continue

        next_segment = segments[i + 1] if i < last_index else None
        prev_segment = segments[i - 1] if i > 0 else None

        residue, modifications = identify_residue(segment, next_segment, prev_segment)
        if residue:
            # Format residue with modifications
            formatted_residue = residue
            if modifications:
                formatted_residue += f"({','.join(modifications)})"
            sequence.append(formatted_residue)

    # is_cyclic_peptide returns a 3-tuple (flag, peptide cycles, aromatic
    # cycles). Testing the tuple itself is always truthy, so take the flag.
    is_cyclic = is_cyclic_peptide(smiles)[0]

    # Print debug information
    print("\nDetailed Analysis:")
    print("Segments:", segments)
    print("Found sequence:", sequence)

    # Format the final sequence
    if is_cyclic:
        return f"cyclo({'-'.join(sequence)})"
    return '-'.join(sequence)
132
+
133
def is_cyclic_peptide(smiles):
    """Decide whether a SMILES string describes a cyclic peptide.

    Every single digit in *smiles* is treated as a ring label. A label is
    considered only when it occurs exactly twice; labels seen once or more
    than twice (e.g. a digit reused for several rings) are ignored. For
    each considered label:

    1. If the 7-character window around either occurrence holds at least
       two lowercase ``c`` characters, the ring is counted as aromatic.
    2. Otherwise, if the text between the two occurrences contains
       ``NC(=O)`` or ``N2C(=O)``, the ring is counted as a peptide cycle.

    Args:
        smiles: SMILES string to inspect.

    Returns:
        Tuple ``(is_cyclic, peptide_cycles, aromatic_cycles)`` where
        ``is_cyclic`` is ``True`` when at least one peptide cycle was found
        and the two lists hold the ring-label digits as strings.
    """
    cycle_info = {}

    # Find all cycle numbers and their contexts. The digit is matched on its
    # own and the context is sliced afterwards: capturing the context with
    # \w{3} groups would swallow neighbouring ring digits (\w matches
    # digits), so e.g. the "1" in "...c12" would never be recorded.
    for match in re.finditer(r'\d', smiles):
        position = match.start()
        before = smiles[max(0, position - 3):position]
        after = smiles[position + 1:position + 4]

        cycle_info.setdefault(match.group(), []).append({
            'position': position,
            # Context is kept only when it is exactly three word characters.
            'pre_context': before if re.fullmatch(r'\w{3}', before) else '',
            'post_context': after if re.fullmatch(r'\w{3}', after) else '',
            'full_context': smiles[max(0, position - 3):min(len(smiles), position + 4)],
        })

    # Check each cycle
    peptide_cycles = []
    aromatic_cycles = []

    for number, occurrences in cycle_info.items():
        if len(occurrences) != 2:  # Must have exactly 2 occurrences
            continue

        start, end = occurrences[0]['position'], occurrences[1]['position']

        # Get the segment between cycle points (both digits included)
        segment = smiles[start:end + 1]

        # Check if this is an aromatic ring
        is_aromatic = any(
            context['full_context'].count('c') >= 2 for context in occurrences
        )

        # Check if this is a peptide cycle
        has_peptide_bond = 'NC(=O)' in segment or 'N2C(=O)' in segment

        # Aromatic takes precedence over the peptide-bond test.
        if is_aromatic:
            aromatic_cycles.append(number)
        elif has_peptide_bond:
            peptide_cycles.append(number)

    return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles
184
+
185
def analyze_single_smiles(smiles):
    """Analyze a single SMILES string and summarise the result as a dict.

    Runs ``is_cyclic_peptide`` and ``parse_peptide`` on *smiles*. Any
    exception raised by either is caught here so that one bad entry does
    not abort a batch; the error text is reported in the ``'Sequence'``
    field instead.

    Args:
        smiles: SMILES string to analyze.

    Returns:
        Dict with the keys ``'SMILES'``, ``'Sequence'``, ``'Is Cyclic'``,
        ``'Peptide Cycles'`` and ``'Aromatic Cycles'``; all values are
        strings.
    """
    try:
        is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
        sequence = parse_peptide(smiles)

        return {
            'SMILES': smiles,
            'Sequence': sequence,
            'Is Cyclic': 'Yes' if is_cyclic else 'No',
            'Peptide Cycles': ', '.join(peptide_cycles) if peptide_cycles else 'None',
            'Aromatic Cycles': ', '.join(aromatic_cycles) if aromatic_cycles else 'None',
        }

    except Exception as e:
        # Deliberate catch-all at the per-entry boundary: report the failure
        # in the result row rather than propagating it.
        return {
            'SMILES': smiles,
            'Sequence': f'Error: {e}',
            'Is Cyclic': 'Error',
            'Peptide Cycles': 'Error',
            'Aromatic Cycles': 'Error',
        }
208
+
209
def process_input(smiles_input=None, file_obj=None):
    """Process either direct SMILES input or file input.

    Both sources may be supplied at once; the direct input is analyzed
    first, followed by every non-blank line of the uploaded file.

    Args:
        smiles_input: SMILES string typed by the user, or ``None``/empty.
            Input that is blank after stripping whitespace is ignored.
        file_obj: Raw bytes of an uploaded text file holding one SMILES per
            line, or ``None``.

    Returns:
        A formatted multi-line report with one "Entry N" section per
        analyzed SMILES; an empty string when there was nothing to analyze.
    """
    results = []

    # Handle direct SMILES input. Strip before testing so whitespace-only
    # input is not analyzed as an empty SMILES.
    direct_smiles = smiles_input.strip() if smiles_input else ''
    if direct_smiles:
        results.append(analyze_single_smiles(direct_smiles))

    # Handle file input
    if file_obj is not None:
        # utf-8-sig decodes plain UTF-8 too, but drops a leading BOM so it
        # cannot end up glued to the first SMILES in the file.
        content = file_obj.decode('utf-8-sig')
        for line in StringIO(content):
            smiles = line.strip()
            if smiles:  # Skip empty lines
                results.append(analyze_single_smiles(smiles))

    # Create formatted output
    entries = []
    for i, result in enumerate(results, 1):
        entries.append(
            f"Entry {i}:\n"
            f"SMILES: {result['SMILES']}\n"
            f"Sequence: {result['Sequence']}\n"
            f"Is Cyclic: {result['Is Cyclic']}\n"
            f"Peptide Cycles: {result['Peptide Cycles']}\n"
            f"Aromatic Cycles: {result['Aromatic Cycles']}\n"
            + "-" * 50 + "\n"
        )

    return "".join(entries)
239
+
240
# Create Gradio interface: a text box and a file upload feed process_input,
# whose report string is shown in a single output text box.
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(
            label="Enter SMILES string",
            placeholder="Enter SMILES notation of peptide...",
            lines=2
        ),
        gr.File(
            label="Or upload a text file with SMILES",
            file_types=[".txt"],
            # "binary" hands the upload to process_input as bytes, which it
            # decodes itself.
            type="binary"
        )
    ],
    outputs=gr.Textbox(
        label="Analysis Results",
        lines=10
    ),
    title="Peptide Structure Analyzer",
    description="""
    Analyze peptide structures from SMILES notation to:
    1. Determine if the peptide is cyclic
    2. Identify peptide cycles vs aromatic rings
    3. Parse the amino acid sequence

    Input: Either enter a SMILES string directly or upload a text file with multiple SMILES (one per line)
    """,
    examples=[
        # Example cyclic peptide with Proline
        ["CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O", None],
        # Example cyclic peptide with ester bond
        ["CC(C)C[C@@H]1OC(=O)[C@H](C)NC(=O)[C@H](C(C)C)OC(=O)[C@H](C)N(C)C(=O)[C@@H](C)NC(=O)[C@@H](Cc2ccccc2)N(C)C1=O", None]
    ],
    allow_flagging="never"
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio==4.19.2
2
+ pandas==2.2.0