Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Files Community

yinuozhang committed on Nov 14, 2024

Commit

45d6af3

1 Parent(s): c5d7b26

software

Browse files

Files changed (2) hide show

app.py +279 -0
requirements.txt +2 -0

app.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import gradio as gr
+import re
+import pandas as pd
+from io import StringIO
def remove_nested_branches(smiles):
    """Return *smiles* with all parenthesised branch content stripped.

    Only characters seen while the parenthesis depth is exactly zero are
    kept; the parentheses themselves are always dropped.
    """
    kept = []
    level = 0
    for symbol in smiles:
        if symbol == '(':
            level += 1
            continue
        if symbol == ')':
            level -= 1
            continue
        # Anything at a non-zero depth belongs to a branch and is discarded.
        if level == 0:
            kept.append(symbol)
    return ''.join(kept)
def identify_linkage_type(segment):
    """Classify the backbone linkage found in *segment*.

    Returns a tuple ``(type, is_n_methylated)`` where ``type`` is
    ``'ester'``, ``'peptide'`` or ``None`` when no known motif is present.
    """
    # Order matters: the first motif found in the segment wins.
    signatures = (
        ('OC(=O)', ('ester', False)),
        ('N(C)C(=O)', ('peptide', True)),   # N-methylated peptide bond
        ('NC(=O)', ('peptide', False)),     # regular peptide bond
    )
    for motif, verdict in signatures:
        if motif in segment:
            return verdict
    return (None, False)
def identify_residue(segment, next_segment=None, prev_segment=None):
    """Identify the amino-acid residue encoded by a SMILES fragment.

    The checks are plain substring tests applied in a fixed order; the first
    one that matches decides the result.

    Args:
        segment: SMILES fragment to classify.
        next_segment: Fragment that follows ``segment`` (or ``None``). It is
            inspected for ``N(C)C(=O)`` / ``OC(=O)`` to record modifications
            and is used for the contextual Proline check.
        prev_segment: Fragment that precedes ``segment`` (or ``None``); only
            used for the contextual Proline check.

    Returns:
        A tuple ``(residue, modifications)``. ``residue`` is a three-letter
        code or ``None`` when nothing matched; ``modifications`` is a list
        that may contain ``'N-Me'`` and/or ``'O-linked'``.
    """
    modifications = []
    # Modifications are read off the linkage that follows this fragment.
    if next_segment:
        if 'N(C)C(=O)' in next_segment:
            modifications.append('N-Me')
        if 'OC(=O)' in next_segment:
            modifications.append('O-linked')

    # Proline: look for the CCCN ring pattern in any of its written forms.
    proline_patterns = ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']
    if any(pattern in segment for pattern in proline_patterns):
        return ('Pro', modifications)
    # The Proline ring may be split across neighbouring fragments, so retry
    # on the joined context when both neighbours are available.
    if prev_segment and next_segment:
        if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
            combined = prev_segment + segment + next_segment
            if re.search(r'CCCN.*C\(=O\)', combined):
                return ('Pro', modifications)

    # Aromatic amino acids
    if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
        return ('Phe', modifications)
    if 'c2ccc(O)cc2' in segment:
        return ('Tyr', modifications)
    if 'c1c[nH]c2ccccc12' in segment:
        return ('Trp', modifications)
    if 'c1cnc[nH]1' in segment:
        return ('His', modifications)

    # Branched chain amino acids
    if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
        return ('Leu', modifications)
    if '[C@H](CC(C)C)' in segment or '[C@@H](CC(C)C)' in segment:
        return ('Leu', modifications)
    if 'C(C)C' in segment and not any(
        pat in segment for pat in ['CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]']
    ):
        return ('Val', modifications)
    if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
        return ('Ile', modifications)

    # Small/polar amino acids
    if ('[C@H](C)' in segment or '[C@@H](C)' in segment) and 'C(C)C' not in segment:
        return ('Ala', modifications)
    # Both chirality spellings are accepted, like every other residue here;
    # previously only '[C@H](CO)' was recognised as Ser.
    if '[C@H](CO)' in segment or '[C@@H](CO)' in segment:
        return ('Ser', modifications)
    if '[C@H](C(C)O)' in segment or '[C@@H](C(C)O)' in segment:
        return ('Thr', modifications)
    if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
        return ('Gly', modifications)

    # No rule matched: the residue is reported as unknown.
    return (None, modifications)
def parse_peptide(smiles):
    """Parse a peptide SMILES string into a dash-separated residue sequence.

    The string is split on backbone linkage motifs (``NC(=O)``,
    ``N(C)C(=O)``, ``N<digit>C(=O)`` and ``OC(=O)``); each remaining fragment
    is classified with ``identify_residue`` and unrecognised fragments are
    dropped.

    Args:
        smiles: SMILES string of the peptide.

    Returns:
        The sequence, e.g. ``'Ala-Phe'``, wrapped as ``'cyclo(...)'`` when
        ``is_cyclic_peptide`` reports a peptide macrocycle.
    """
    # The capture group keeps the linkage motifs in the split result so that
    # neighbouring fragments can see them as context.
    bond_pattern = r'(NC\(=O\)|N\(C\)C\(=O\)|N\dC\(=O\)|OC\(=O\))'
    segments = [s for s in re.split(bond_pattern, smiles) if s]

    sequence = []
    last_index = len(segments) - 1
    for i, segment in enumerate(segments):
        # Skip fragments that end in a carbonyl (this includes the bare
        # linkage motifs produced by the split).
        if re.match(r'.*C\(=O\)$', segment):
            continue
        next_segment = segments[i + 1] if i < last_index else None
        prev_segment = segments[i - 1] if i > 0 else None
        residue, modifications = identify_residue(segment, next_segment, prev_segment)
        if residue:
            # Format residue with modifications
            formatted_residue = residue
            if modifications:
                formatted_residue += f"({','.join(modifications)})"
            sequence.append(formatted_residue)

    # is_cyclic_peptide returns (is_cyclic, peptide_cycles, aromatic_cycles).
    # Testing the tuple itself is always true, so take the boolean only.
    is_cyclic = is_cyclic_peptide(smiles)[0]

    # Print debug information
    print("\nDetailed Analysis:")
    print("Segments:", segments)
    print("Found sequence:", sequence)

    # Format the final sequence
    joined = '-'.join(sequence)
    if is_cyclic:
        return f"cyclo({joined})"
    return joined
def is_cyclic_peptide(smiles):
    """Decide whether a SMILES string describes a cyclic peptide.

    Ring-closure digits are paired in reading order (first occurrence opens
    a ring, the next occurrence of the same digit closes it, after which the
    digit may be reused). Each ring is then classified:

    * aromatic - at least two lowercase ``c`` atoms within three characters
      of either ring-closure digit;
    * peptide  - otherwise, when the text between the two digits contains an
      amide motif (``NC(=O)``, ``N(C)C(=O)`` or ``N<digit>C(=O)``).

    Args:
        smiles: SMILES string to inspect.

    Returns:
        A tuple ``(is_cyclic, peptide_cycles, aromatic_cycles)`` where
        ``is_cyclic`` is ``True`` when at least one peptide ring was found
        and the two lists hold the ring-closure digits (as strings, without
        duplicates) of each category.
    """
    # Scan single digits directly. The former pattern wrapped the digit in
    # optional \w{3} groups, and because \w also matches digits, neighbouring
    # ring-closure digits could be swallowed as "context" and never counted.
    open_rings = {}
    rings = []  # (digit, opening position, closing position)
    for match in re.finditer(r'\d', smiles):
        number = match.group()
        position = match.start()
        if number in open_rings:
            rings.append((number, open_rings.pop(number), position))
        else:
            open_rings[number] = position
    # Digits left in open_rings never closed and are ignored.
    rings.sort(key=lambda ring: ring[1])

    def context(position):
        # Three characters either side of the ring-closure digit.
        return smiles[max(0, position - 3):position + 4]

    peptide_cycles = []
    aromatic_cycles = []
    for number, start, end in rings:
        segment = smiles[start:end + 1]
        is_aromatic = any(context(pos).count('c') >= 2 for pos in (start, end))
        has_peptide_bond = re.search(r'N(?:\(C\)|\d)?C\(=O\)', segment) is not None
        if is_aromatic:
            bucket = aromatic_cycles
        elif has_peptide_bond:
            bucket = peptide_cycles
        else:
            continue
        # A reused digit may land in the same category more than once.
        if number not in bucket:
            bucket.append(number)
    return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles
def analyze_single_smiles(smiles):
    """Run the cyclicity check and sequence parser on one SMILES string.

    Returns a dict with the keys ``'SMILES'``, ``'Sequence'``,
    ``'Is Cyclic'``, ``'Peptide Cycles'`` and ``'Aromatic Cycles'``. Any
    exception raised during analysis is reported inside that dict instead
    of being propagated.
    """
    def _listing(numbers):
        # Comma-separated ring numbers, or the literal 'None' when empty.
        return ', '.join(numbers) if numbers else 'None'

    try:
        cyclic_flag, peptide_rings, aromatic_rings = is_cyclic_peptide(smiles)
        parsed = parse_peptide(smiles)
        return {
            'SMILES': smiles,
            'Sequence': parsed,
            'Is Cyclic': 'Yes' if cyclic_flag else 'No',
            'Peptide Cycles': _listing(peptide_rings),
            'Aromatic Cycles': _listing(aromatic_rings),
        }
    except Exception as e:
        failure = {'SMILES': smiles, 'Sequence': f'Error: {str(e)}'}
        failure.update(
            dict.fromkeys(('Is Cyclic', 'Peptide Cycles', 'Aromatic Cycles'), 'Error')
        )
        return failure
def process_input(smiles_input=None, file_obj=None):
    """Analyze a typed SMILES string and/or an uploaded file of SMILES.

    Args:
        smiles_input: SMILES text entered directly, or ``None``/empty.
            Whitespace-only input is ignored.
        file_obj: Raw bytes of an uploaded text file with one SMILES per
            line, or ``None``. Blank lines are skipped.

    Returns:
        A text report with one numbered entry per analyzed SMILES (typed
        input first, then file lines); an empty string when there was
        nothing to analyze.

    Raises:
        UnicodeDecodeError: If ``file_obj`` is not valid UTF-8.
    """
    results = []

    # Handle direct SMILES input
    if smiles_input:
        typed = smiles_input.strip()
        # Skip whitespace-only text, mirroring how blank file lines are skipped.
        if typed:
            results.append(analyze_single_smiles(typed))

    # Handle file input
    if file_obj is not None:
        # 'utf-8-sig' drops a leading byte-order mark so it does not end up
        # glued to the first SMILES; plain UTF-8 decodes the same as before.
        content = file_obj.decode('utf-8-sig')
        for line in StringIO(content):
            smiles = line.strip()
            if smiles:  # Skip empty lines
                results.append(analyze_single_smiles(smiles))

    # Create formatted output
    separator = "-" * 50 + "\n"
    entries = []
    for i, result in enumerate(results, 1):
        entries.append(
            f"Entry {i}:\n"
            f"SMILES: {result['SMILES']}\n"
            f"Sequence: {result['Sequence']}\n"
            f"Is Cyclic: {result['Is Cyclic']}\n"
            f"Peptide Cycles: {result['Peptide Cycles']}\n"
            f"Aromatic Cycles: {result['Aromatic Cycles']}\n"
            + separator
        )
    return "".join(entries)
# Gradio front end: one text box and one file upload feed process_input,
# whose text report is shown in a single output box.
_DESCRIPTION = """
    Analyze peptide structures from SMILES notation to:
    1. Determine if the peptide is cyclic
    2. Identify peptide cycles vs aromatic rings
    3. Parse the amino acid sequence
    Input: Either enter a SMILES string directly or upload a text file with multiple SMILES (one per line)
    """

# Each example row is [SMILES text, uploaded file]; no file is supplied.
_EXAMPLES = [
    # Example cyclic peptide with Proline
    ["CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O", None],
    # Example cyclic peptide with ester bond
    ["CC(C)C[C@@H]1OC(=O)[C@H](C)NC(=O)[C@H](C(C)C)OC(=O)[C@H](C)N(C)C(=O)[C@@H](C)NC(=O)[C@@H](Cc2ccccc2)N(C)C1=O", None],
]

iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter SMILES string", placeholder="Enter SMILES notation of peptide...", lines=2),
        # type="binary" hands the upload to process_input as raw bytes.
        gr.File(label="Or upload a text file with SMILES", file_types=[".txt"], type="binary"),
    ],
    outputs=gr.Textbox(label="Analysis Results", lines=10),
    title="Peptide Structure Analyzer",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    allow_flagging="never",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio==4.19.2
2	+ pandas==2.2.0