"""Gradio app that classifies peptides given in FASTA format with a fine-tuned ESM model."""

import logging
import re
from io import StringIO

import gradio as gr
import pandas as pd
import torch
from transformers import EsmForSequenceClassification, EsmTokenizer

logger = logging.getLogger(__name__)

# Maximum number of residues accepted per sequence (after joining wrapped lines).
MAX_SEQUENCE_LENGTH = 30

# A sequence line may contain only the 20 natural amino acids, in uppercase.
_AMINO_ACID_RE = re.compile(r"[ACDEFGHIKLMNPQRSTVWY]+")

# The tokenizer and the fine-tuned classifier are loaded once at import time.
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", do_lower_case=False)
model = EsmForSequenceClassification.from_pretrained("model/best_model5")
model.eval()


def _finish_record(header, parts):
    """Join the sequence lines collected for *header* and validate the result.

    Raises:
        ValueError: If the header has no sequence lines, or the joined
            sequence is longer than ``MAX_SEQUENCE_LENGTH``.
    """
    if not parts:
        raise ValueError(f"Invalid FASTA format: header '{header}' has no sequence.")
    sequence = "".join(parts)
    # Check the whole record, not each line, so a wrapped sequence cannot
    # slip past the length limit.
    if len(sequence) > MAX_SEQUENCE_LENGTH:
        raise ValueError(
            f"Sequence too long: '{sequence}' (Max {MAX_SEQUENCE_LENGTH} characters allowed)."
        )
    return sequence


def read_fasta(fasta_string: str):
    """Parse FASTA text into parallel lists of headers and sequences.

    Blank lines are ignored. Lines starting with ``>`` open a new record;
    every other line is treated as sequence data for the current record and
    multi-line sequences are concatenated.

    Args:
        fasta_string: The raw FASTA text.

    Returns:
        A ``(headers, sequences)`` tuple of two lists of equal length, where
        ``headers[i]`` (including the leading ``>``) belongs to
        ``sequences[i]``.

    Raises:
        ValueError: If a sequence line contains anything other than the
            uppercase natural amino acids, if sequence data appears before
            the first header, if a header has no sequence, or if a sequence
            exceeds ``MAX_SEQUENCE_LENGTH`` characters.
    """
    headers = []
    sequences = []
    current_header = None
    parts = []

    for raw_line in StringIO(fasta_string):
        line = raw_line.strip()
        if not line:
            # Pasted input commonly carries empty or trailing blank lines.
            continue

        if line.startswith(">"):
            if current_header is not None:
                sequences.append(_finish_record(current_header, parts))
                parts = []
            current_header = line
            headers.append(current_header)
            continue

        if current_header is None:
            # Without a header the two result lists would fall out of step.
            raise ValueError(
                "Invalid FASTA format: sequence data found before any '>' header line."
            )
        if not _AMINO_ACID_RE.fullmatch(line):
            raise ValueError(
                "Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY)."
            )
        parts.append(line)

    if current_header is not None:
        sequences.append(_finish_record(current_header, parts))

    return headers, sequences


def predict_peptide_class(sequences):
    """Run the classifier on a batch of peptide sequences.

    Args:
        sequences: List of amino-acid strings.

    Returns:
        A ``(probs, classes)`` tuple: ``probs`` is a NumPy array holding the
        sigmoid of the second logit for each sequence, and ``classes`` is a
        list with ``"B3PP"`` where that value exceeds 0.5 and ``"Non-B3PP"``
        otherwise.
    """
    # padding=True is required so sequences of different lengths can be
    # stacked into one tensor; the attention mask produced alongside it is
    # passed to the model with the rest of the inputs.
    inputs = tokenizer(sequences, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # NOTE(review): sigmoid of column 1 is not the same as softmax/argmax over
    # both logits — confirm this matches how the model was trained/evaluated.
    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
    return probs, classes


def predict_from_fasta(fasta_input: str) -> pd.DataFrame:
    """Parse FASTA input, classify every sequence, and return a results table.

    Args:
        fasta_input: The raw FASTA text.

    Returns:
        A DataFrame with ``Header``, ``Sequence``, ``Probability`` and
        ``Predicted Class`` columns on success, or a single-column ``Error``
        DataFrame describing the problem otherwise.
    """
    try:
        headers, sequences = read_fasta(fasta_input)
        if not sequences:
            return pd.DataFrame({"Error": ["No valid sequences found."]})
        probs, classes = predict_peptide_class(sequences)
        return pd.DataFrame(
            {
                "Header": headers,
                "Sequence": sequences,
                "Probability": probs,
                "Predicted Class": classes,
            }
        )
    except ValueError as e:
        return pd.DataFrame({"Error": [str(e)]})
    except Exception:
        # Keep the user-facing message generic, but record the traceback so
        # the failure can be diagnosed.
        logger.exception("Unexpected error while predicting from FASTA input")
        return pd.DataFrame({"Error": ["An unexpected error occurred."]})


iface = gr.Interface(
    fn=predict_from_fasta,
    inputs=gr.Textbox(lines=10, placeholder="Paste your peptide sequences in FASTA format here"),
    outputs=gr.Dataframe(),
    title="B3PP Predictor",
    description="Submit peptide sequences in FASTA format to determine their potential as blood-brain barrier penetration peptides. Sequences must consist exclusively of natural amino acids in uppercase letters, with a maximum length of 30 characters.",
)

# Start the web server only when run as a script, so importing this module
# (e.g. to reuse read_fasta) does not block on launching the app.
if __name__ == "__main__":
    iface.launch()