# Spaces:
# Sleeping
# Sleeping
import logging
import re
from io import StringIO

import gradio as gr
import pandas as pd
import torch
from transformers import EsmForSequenceClassification, EsmTokenizer
# Identifiers passed to `from_pretrained`: the tokenizer is requested by its
# "facebook/..." name, the classifier from the relative path "model/best_model5".
TOKENIZER_CHECKPOINT = "facebook/esm2_t6_8M_UR50D"
MODEL_DIR = "model/best_model5"

tokenizer = EsmTokenizer.from_pretrained(
    TOKENIZER_CHECKPOINT,
    do_lower_case=False,
)
model = EsmForSequenceClassification.from_pretrained(MODEL_DIR)
# Put the module in evaluation mode; it is only used for prediction below.
model.eval()
def read_fasta(fasta_string, max_length=30):
    """Parse FASTA text into parallel lists of headers and sequences.

    A record is a header line starting with ``>`` followed by one or more
    sequence lines, which are concatenated. Blank lines are ignored.

    Args:
        fasta_string: FASTA-formatted text. ``None`` is treated as empty input.
        max_length: Maximum allowed length of one full (joined) sequence.

    Returns:
        A ``(headers, sequences)`` tuple of equal-length lists. Headers keep
        their leading ``>``; ``sequences[i]`` belongs to ``headers[i]``.

    Raises:
        ValueError: If a sequence line contains characters other than the 20
            uppercase natural amino-acid letters, if sequence data appears
            before any header, if a header has no sequence, or if a joined
            sequence is longer than ``max_length``.
    """
    headers = []
    sequences = []
    chunks = []
    current_header = None

    def _finish_record():
        # Validate the whole record, not individual lines: a sequence wrapped
        # over several lines must still respect the length limit.
        sequence = "".join(chunks)
        if not sequence:
            raise ValueError(
                f"Invalid FASTA format: header '{current_header}' has no sequence."
            )
        if len(sequence) > max_length:
            raise ValueError(
                f"Sequence too long: '{sequence}' (Max {max_length} characters allowed)."
            )
        headers.append(current_header)
        sequences.append(sequence)
        chunks.clear()

    for raw_line in (fasta_string or "").splitlines():
        line = raw_line.strip()
        if not line:
            # Blank lines (e.g. trailing newlines from a paste) carry no data.
            continue
        if line.startswith(">"):
            if current_header is not None:
                _finish_record()
            current_header = line
            continue
        if current_header is None:
            # Without a header the two returned lists would fall out of step.
            raise ValueError(
                "Invalid FASTA format: sequence data found before any '>' header line."
            )
        if not re.fullmatch(r"[ACDEFGHIKLMNPQRSTVWY]+", line):
            raise ValueError("Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY).")
        chunks.append(line)

    if current_header is not None:
        _finish_record()
    return headers, sequences
def predict_peptide_class(sequences):
    """Classify a batch of peptide sequences as "B3PP" or "Non-B3PP".

    Args:
        sequences: List of amino-acid strings to score in one forward pass.

    Returns:
        A ``(probs, classes)`` tuple. ``probs`` is a NumPy array with one
        score per sequence (sigmoid of the logit at index 1); ``classes`` is
        a list with "B3PP" where the score exceeds 0.5, else "Non-B3PP".
    """
    # padding=True pads every sequence to the longest one in the batch.
    # Without it, sequences of different lengths cannot be stacked into a
    # single tensor and the tokenizer raises.
    inputs = tokenizer(sequences, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # NOTE(review): column 1 is read as the positive-class logit and squashed
    # with a sigmoid; confirm this matches how the checkpoint was trained
    # (a two-logit head is more commonly scored with softmax).
    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
    return probs, classes
def predict_from_fasta(fasta_input):
    """Run the classifier on FASTA text and return the results as a table.

    Args:
        fasta_input: FASTA-formatted text.

    Returns:
        A ``pandas.DataFrame`` with columns "Header", "Sequence",
        "Probability" and "Predicted Class" on success, or a single-column
        "Error" frame describing what went wrong. This function does not
        raise; every failure is reported through the returned frame.
    """
    try:
        headers, sequences = read_fasta(fasta_input)
        if not sequences:
            return pd.DataFrame({"Error": ["No valid sequences found."]})
        if len(headers) != len(sequences):
            # Catch this before building the table: otherwise pandas reports
            # an opaque "arrays must be of the same length" error.
            raise ValueError(
                "Invalid FASTA format: every sequence needs exactly one '>' header line."
            )
        probs, classes = predict_peptide_class(sequences)
        return pd.DataFrame(
            {
                "Header": headers,
                "Sequence": sequences,
                "Probability": probs,
                "Predicted Class": classes,
            }
        )
    except ValueError as e:
        # Validation problems carry a user-readable message; show it as-is.
        return pd.DataFrame({"Error": [str(e)]})
    except Exception:
        # Keep the UI message generic, but record the traceback so the
        # failure can be diagnosed from the application logs.
        logging.getLogger(__name__).exception(
            "Unexpected error while predicting from FASTA input"
        )
        return pd.DataFrame({"Error": ["An unexpected error occurred."]})
# Gradio front end: a multi-line textbox feeds `predict_from_fasta`, whose
# returned DataFrame is rendered in a dataframe component.
_DESCRIPTION = "Submit peptide sequences in FASTA format to determine their potential as blood-brain barrier penetration peptides. Sequences must consist exclusively of natural amino acids in uppercase letters, with a maximum length of 30 characters."

fasta_box = gr.Textbox(
    lines=10,
    placeholder="Paste your peptide sequences in FASTA format here",
)
result_table = gr.Dataframe()

iface = gr.Interface(
    fn=predict_from_fasta,
    inputs=fasta_box,
    outputs=result_table,
    title="B3PP Predictor",
    description=_DESCRIPTION,
)
# Start the web app when this script runs.
iface.launch()