B3PPs_Predict / app.py
Bhadralab's picture
Update app.py
34fd3ae verified
raw
history blame
2.72 kB
import pandas as pd
import torch
import re
import gradio as gr
from io import StringIO
from transformers import EsmForSequenceClassification, EsmTokenizer
# Load the tokenizer from the "facebook/esm2_t6_8M_UR50D" checkpoint;
# do_lower_case=False is passed so input casing is left as given.
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", do_lower_case=False)
# Load the sequence-classification model from the local "model/best_model5" path.
# NOTE(review): presumably a fine-tuned checkpoint built on the same ESM-2
# base as the tokenizer above -- confirm against the training setup.
model = EsmForSequenceClassification.from_pretrained("model/best_model5")
# Put the model in evaluation mode; this module only runs inference.
model.eval()
def read_fasta(fasta_string):
    """Parse FASTA text into parallel lists of headers and sequences.

    Every record is a header line starting with ">" followed by one or more
    sequence lines. The sequence lines of a record are concatenated. Blank
    lines (including trailing ones from a pasted text box) are ignored.

    Args:
        fasta_string: The FASTA-formatted text to parse.

    Returns:
        A ``(headers, sequences)`` tuple of two lists of equal length, where
        ``headers[i]`` (including the leading ">") belongs to ``sequences[i]``.

    Raises:
        ValueError: If a sequence line contains anything other than the 20
            upper-case natural amino-acid letters, if sequence data appears
            before the first header, if a header has no sequence, or if a
            complete sequence is longer than 30 characters.
    """
    residue_pattern = re.compile(r'^[ACDEFGHIKLMNPQRSTVWY]+$')
    max_length = 30

    # Each record is (header, [sequence lines]); keeping the two together
    # guarantees the returned lists can never drift out of alignment.
    records = []
    for raw_line in StringIO(fasta_string):
        line = raw_line.strip()
        if not line:
            # Blank lines carry no information; skip rather than reject.
            continue
        if line.startswith(">"):
            records.append((line, []))
            continue
        if not residue_pattern.match(line):
            raise ValueError("Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY).")
        if not records:
            raise ValueError("Invalid FASTA format: sequence data found before the first '>' header line.")
        records[-1][1].append(line)

    headers = []
    sequences = []
    for header, chunks in records:
        if not chunks:
            raise ValueError(f"Invalid FASTA format: header '{header}' has no sequence.")
        sequence = "".join(chunks)
        # The limit applies to the whole peptide, not to each wrapped line.
        if len(sequence) > max_length:
            raise ValueError(f"Sequence too long: '{sequence}' (Max {max_length} characters allowed).")
        headers.append(header)
        sequences.append(sequence)
    return headers, sequences
def predict_peptide_class(sequences):
    """Classify a batch of peptide sequences with the module-level model.

    Args:
        sequences: A list of peptide sequence strings.

    Returns:
        A ``(probs, classes)`` tuple: ``probs`` is a NumPy array holding one
        score per sequence (sigmoid of the logit in column 1), and
        ``classes`` is a list with "B3PP" where the score exceeds 0.5 and
        "Non-B3PP" otherwise.
    """
    # padding=True is required so peptides of different lengths can be
    # stacked into one rectangular tensor; without it the tokenizer cannot
    # build the batch when lengths differ.
    inputs = tokenizer(sequences, return_tensors='pt', padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # NOTE(review): sigmoid is applied to the logits and column 1 is used as
    # the score. If the model was trained as a 2-class softmax classifier,
    # softmax over both columns may be the intended probability -- confirm
    # against the training configuration before changing.
    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
    return probs, classes
def predict_from_fasta(fasta_input):
    """Run the full pipeline on FASTA text and return a results table.

    The input is parsed with ``read_fasta`` and scored with
    ``predict_peptide_class``. Failures are never raised to the caller;
    they are reported as a one-row DataFrame with a single "Error" column.

    Args:
        fasta_input: FASTA-formatted text.

    Returns:
        A pandas DataFrame with columns "Header", "Sequence", "Probability"
        and "Predicted Class" on success, or a single "Error" column
        otherwise.
    """
    try:
        parsed_headers, parsed_sequences = read_fasta(fasta_input)
        if not parsed_sequences:
            return pd.DataFrame({"Error": ["No valid sequences found."]})

        scores, labels = predict_peptide_class(parsed_sequences)
        columns = {
            "Header": parsed_headers,
            "Sequence": parsed_sequences,
            "Probability": scores,
            "Predicted Class": labels,
        }
        return pd.DataFrame(columns)
    except ValueError as err:
        # Validation problems carry a user-readable message; show it as-is.
        message = str(err)
        return pd.DataFrame({"Error": [message]})
    except Exception:
        # Anything else is reported generically.
        return pd.DataFrame({"Error": ["An unexpected error occurred."]})
# Build the Gradio UI: a multi-line text box for FASTA input whose contents
# are passed to predict_from_fasta, with the returned DataFrame shown as a table.
iface = gr.Interface(
fn=predict_from_fasta,
inputs=gr.Textbox(lines=10, placeholder="Paste your peptide sequences in FASTA format here"),
outputs=gr.Dataframe(),
title="B3PP Predictor",
description="Submit peptide sequences in FASTA format to determine their potential as blood-brain barrier penetration peptides. Sequences must consist exclusively of natural amino acids in uppercase letters, with a maximum length of 30 characters."
)
# Start the app; this runs at module top level with no __main__ guard.
iface.launch()