Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Files Community

SMILES2PEPTIDE / app.py

yinuozhang

software

45d6af3 about 23 hours ago

raw

history blame

9.82 kB

	import gradio as gr
	import re
	import pandas as pd
	from io import StringIO

	def remove_nested_branches(smiles):
	    """Return ``smiles`` with all parenthesised branches stripped out.

	    Every character enclosed in parentheses (at any nesting depth) is
	    dropped together with the parentheses themselves, so only the
	    characters found at depth zero remain.

	    Parentheses are not validated: the depth counter is incremented on
	    ``(`` and decremented on ``)`` without any balance check.
	    """
	    kept = []
	    depth = 0
	    for char in smiles:
	        if char == '(':
	            depth += 1
	        elif char == ')':
	            depth -= 1
	        elif depth == 0:
	            kept.append(char)
	    # Join once at the end instead of growing a string inside the loop.
	    return ''.join(kept)
	def identify_linkage_type(segment):
	    """
	    Identify the type of linkage between residues.

	    The checks are plain substring tests, tried in this order: ester
	    (``OC(=O)``), N-methylated peptide bond (``N(C)C(=O)``), regular
	    peptide bond (``NC(=O)``).

	    Returns: tuple (type, is_n_methylated); ``(None, False)`` when no
	    linkage pattern is present in ``segment``.
	    """
	    if 'OC(=O)' in segment:
	        return ('ester', False)
	    if 'N(C)C(=O)' in segment:
	        return ('peptide', True)  # N-methylated peptide bond
	    if 'NC(=O)' in segment:
	        return ('peptide', False)  # Regular peptide bond
	    return (None, False)
	def identify_residue(segment, next_segment=None, prev_segment=None):
	    """
	    Identify the amino acid residue encoded by a SMILES segment.

	    Recognition is purely substring/regex based and the checks run in a
	    fixed order (Pro, aromatic, branched-chain, small/polar); the first
	    match wins.

	    Args:
	        segment: SMILES fragment to classify.
	        next_segment: fragment that follows ``segment`` (used to detect
	            N-methylation / O-linkage and as Proline context), or None.
	        prev_segment: fragment that precedes ``segment`` (Proline context
	            only), or None.

	    Returns: tuple (residue, modifications) where ``residue`` is a
	    three-letter code or None when nothing matched, and ``modifications``
	    is a list containing 'N-Me' and/or 'O-linked'.
	    """
	    modifications = []

	    # Modifications are read off the segment that follows the residue.
	    if next_segment:
	        if 'N(C)C(=O)' in next_segment:
	            modifications.append('N-Me')
	        if 'OC(=O)' in next_segment:
	            modifications.append('O-linked')

	    # Special case for Proline - its cyclic nature means it can appear as
	    # a CCCN run closed with ring digit 1 or 2, in either direction.
	    proline_patterns = ('CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC')
	    if any(pattern in segment for pattern in proline_patterns):
	        return ('Pro', modifications)

	    # The Proline ring may be split across neighbouring segments, so look
	    # at the surrounding context as well.
	    if prev_segment and next_segment:
	        if ('CCC' in segment and 'N' in next_segment) or \
	                ('N' in segment and 'CCC' in prev_segment):
	            combined = prev_segment + segment + next_segment
	            # Parentheses must be escaped to match the literal "C(=O)".
	            if re.search(r'CCCN.*C\(=O\)', combined):
	                return ('Pro', modifications)

	    # Aromatic amino acids
	    if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
	        return ('Phe', modifications)
	    if 'c2ccc(O)cc2' in segment:
	        return ('Tyr', modifications)
	    if 'c1c[nH]c2ccccc12' in segment:
	        return ('Trp', modifications)
	    if 'c1cnc[nH]1' in segment:
	        return ('His', modifications)

	    # Branched chain amino acids
	    if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
	        return ('Leu', modifications)
	    if '[C@H](CC(C)C)' in segment or '[C@@H](CC(C)C)' in segment:
	        return ('Leu', modifications)
	    if 'C(C)C' in segment and not any(
	            pat in segment
	            for pat in ('CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]')):
	        return ('Val', modifications)
	    if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
	        return ('Ile', modifications)

	    # Small/polar amino acids
	    if ('[C@H](C)' in segment or '[C@@H](C)' in segment) \
	            and 'C(C)C' not in segment:
	        return ('Ala', modifications)
	    # Accept both chirality marks, consistent with the other residues.
	    if '[C@H](CO)' in segment or '[C@@H](CO)' in segment:
	        return ('Ser', modifications)
	    if '[C@H](C(C)O)' in segment or '[C@@H](C(C)O)' in segment:
	        return ('Thr', modifications)
	    if '[C@H]' in segment and not any(
	            pat in segment for pat in ('C(C)', 'CC', 'O', 'N', 'S')):
	        return ('Gly', modifications)

	    # NOTE: no other amino acids are recognised here; anything that did
	    # not match above is reported as an unknown residue (None).
	    return (None, modifications)
	def parse_peptide(smiles):
	    """
	    Parse a peptide SMILES string into a residue sequence string.

	    The SMILES is split on backbone bond patterns (peptide, N-methylated
	    peptide, ring-digit peptide and ester bonds), each remaining segment
	    is classified with ``identify_residue`` and the recognised residues
	    are joined with '-'.  Residues carrying modifications are rendered as
	    ``Res(mod1,mod2)``.  When ``is_cyclic_peptide`` reports a peptide
	    cycle the result is wrapped as ``cyclo(...)``.

	    Debug information (segments and the found sequence) is printed to
	    stdout.
	    """
	    # Split on peptide bonds while preserving cycle numbers.  The pattern
	    # is wrapped in a capturing group so the bond fragments stay in the
	    # list and can serve as prev/next context for identify_residue.
	    bond_pattern = r'(NC\(=O\)|N\(C\)C\(=O\)|N\dC\(=O\)|OC\(=O\))'
	    segments = [s for s in re.split(bond_pattern, smiles) if s]

	    sequence = []
	    last_index = len(segments) - 1
	    for i, segment in enumerate(segments):
	        next_segment = segments[i + 1] if i < last_index else None
	        prev_segment = segments[i - 1] if i > 0 else None

	        # Skip bond patterns (any segment that ends in "C(=O)").
	        if re.match(r'.*C\(=O\)$', segment):
	            continue

	        residue, modifications = identify_residue(
	            segment, next_segment, prev_segment)
	        if residue:
	            # Format residue with modifications
	            formatted_residue = residue
	            if modifications:
	                formatted_residue += f"({','.join(modifications)})"
	            sequence.append(formatted_residue)

	    # is_cyclic_peptide returns (is_cyclic, peptide_cycles, aromatic_cycles);
	    # only the flag is needed here.  Testing the whole tuple would always
	    # be truthy and label every peptide as cyclic.
	    is_cyclic = is_cyclic_peptide(smiles)[0]

	    # Print debug information
	    print("\nDetailed Analysis:")
	    print("Segments:", segments)
	    print("Found sequence:", sequence)

	    # Format the final sequence
	    if is_cyclic:
	        return f"cyclo({'-'.join(sequence)})"
	    return '-'.join(sequence)

	def is_cyclic_peptide(smiles):
	    """
	    Determine if SMILES represents a cyclic peptide.

	    Every digit in the string is treated as a ring-closure number.  For
	    each number that occurs exactly twice, the text between the two
	    occurrences is inspected:

	    1. If the 7-character window around either occurrence holds two or
	       more lowercase 'c' characters, the ring is classed as aromatic.
	    2. Otherwise, if the enclosed text contains 'NC(=O)' or 'N2C(=O)',
	       the ring is classed as a peptide cycle.

	    Numbers that occur any other number of times (including ring digits
	    that are reused for several rings) are ignored.

	    Returns: tuple (is_cyclic, peptide_cycles, aromatic_cycles) where
	    ``is_cyclic`` is True when at least one peptide cycle was found and
	    the two lists hold the ring numbers as strings.
	    """
	    # Find all cycle numbers.  Matching bare digits guarantees that no
	    # digit is consumed as "context" of a neighbouring match (which would
	    # hide closely spaced ring closures such as the "12" in an indole).
	    cycle_positions = {}
	    for match in re.finditer(r'\d', smiles):
	        cycle_positions.setdefault(match.group(), []).append(match.start())

	    def context_of(position):
	        # Three characters either side of the ring digit (clamped by slicing).
	        return smiles[max(0, position - 3):position + 4]

	    # Check each cycle
	    peptide_cycles = []
	    aromatic_cycles = []

	    for number, positions in cycle_positions.items():
	        if len(positions) != 2:  # Must have exactly 2 occurrences
	            continue

	        start, end = positions

	        # Get the segment between cycle points (both digits included)
	        segment = smiles[start:end + 1]

	        # Check if this is an aromatic ring
	        is_aromatic = any(
	            context_of(position).count('c') >= 2 for position in positions)

	        # Check if this is a peptide cycle
	        has_peptide_bond = 'NC(=O)' in segment or 'N2C(=O)' in segment

	        if is_aromatic:
	            aromatic_cycles.append(number)
	        elif has_peptide_bond:
	            peptide_cycles.append(number)

	    return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles

	def analyze_single_smiles(smiles):
	    """Analyze a single SMILES string.

	    Runs ``is_cyclic_peptide`` and ``parse_peptide`` on ``smiles`` and
	    returns a dict with the keys 'SMILES', 'Sequence', 'Is Cyclic',
	    'Peptide Cycles' and 'Aromatic Cycles', all holding display strings.

	    Any exception raised during analysis is caught so that one bad entry
	    does not abort a batch; in that case 'Sequence' carries the error
	    message and the remaining fields are set to 'Error'.
	    """
	    try:
	        is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
	        sequence = parse_peptide(smiles)
	    except Exception as e:
	        # Deliberate catch-all at the UI boundary: report, don't crash.
	        return {
	            'SMILES': smiles,
	            'Sequence': f'Error: {str(e)}',
	            'Is Cyclic': 'Error',
	            'Peptide Cycles': 'Error',
	            'Aromatic Cycles': 'Error'
	        }

	    return {
	        'SMILES': smiles,
	        'Sequence': sequence,
	        'Is Cyclic': 'Yes' if is_cyclic else 'No',
	        'Peptide Cycles': ', '.join(peptide_cycles) if peptide_cycles else 'None',
	        'Aromatic Cycles': ', '.join(aromatic_cycles) if aromatic_cycles else 'None'
	    }

	def process_input(smiles_input=None, file_obj=None):
	    """Process either direct SMILES input or file input.

	    Args:
	        smiles_input: a SMILES string typed by the user, or None/empty.
	        file_obj: raw bytes of an uploaded text file with one SMILES per
	            line, or None.  The bytes are decoded as UTF-8.

	    Returns: a text report with one "Entry N:" section per analysed
	    SMILES (direct input first, then file lines in order); an empty
	    string when there was nothing to analyse.
	    """
	    results = []

	    # Handle direct SMILES input; whitespace-only text counts as empty.
	    if smiles_input:
	        smiles = smiles_input.strip()
	        if smiles:
	            results.append(analyze_single_smiles(smiles))

	    # Handle file input
	    if file_obj is not None:
	        # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM,
	        # which would otherwise become part of the first SMILES.
	        content = file_obj.decode('utf-8-sig')
	        for line in StringIO(content):
	            smiles = line.strip()
	            if smiles:  # Skip empty lines
	                results.append(analyze_single_smiles(smiles))

	    # Create formatted output
	    separator = "-" * 50 + "\n"
	    chunks = []
	    for i, result in enumerate(results, 1):
	        chunks.append(
	            f"Entry {i}:\n"
	            f"SMILES: {result['SMILES']}\n"
	            f"Sequence: {result['Sequence']}\n"
	            f"Is Cyclic: {result['Is Cyclic']}\n"
	            f"Peptide Cycles: {result['Peptide Cycles']}\n"
	            f"Aromatic Cycles: {result['Aromatic Cycles']}\n"
	            + separator
	        )

	    return "".join(chunks)

	# Create Gradio interface
	# The two inputs are passed positionally to process_input as
	# (smiles_input, file_obj); its returned report text fills the output box.
	iface = gr.Interface(
	fn=process_input,
	inputs=[
	# Free-text SMILES entry (first argument of process_input)
	gr.Textbox(
	label="Enter SMILES string",
	placeholder="Enter SMILES notation of peptide...",
	lines=2
	),
	# Optional upload (second argument); process_input calls .decode('utf-8')
	# on this value, so it is requested with type="binary"
	gr.File(
	label="Or upload a text file with SMILES",
	file_types=[".txt"],
	type="binary"
	)
	],
	# Text report produced by process_input
	outputs=gr.Textbox(
	label="Analysis Results",
	lines=10
	),
	title="Peptide Structure Analyzer",
	description="""
	Analyze peptide structures from SMILES notation to:
	1. Determine if the peptide is cyclic
	2. Identify peptide cycles vs aromatic rings
	3. Parse the amino acid sequence

	Input: Either enter a SMILES string directly or upload a text file with multiple SMILES (one per line)
	""",
	# Each example row is [SMILES text, file]; None leaves the file input empty
	examples=[
	# Example cyclic peptide with Proline
	["CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O", None],
	# Example cyclic peptide with ester bond
	["CC(C)C[C@@H]1OC(=O)[C@H](C)NC(=O)[C@H](C(C)C)OC(=O)[C@H](C)N(C)C(=O)[C@@H](C)NC(=O)[C@@H](Cc2ccccc2)N(C)C1=O", None]
	],
	allow_flagging="never"
	)

	# Launch the app only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    iface.launch()