"""Gradio app that classifies peptide sequences as B3PP / Non-B3PP with an ESM model."""

import logging
import re

import gradio as gr
import pandas as pd
import torch
from transformers import EsmForSequenceClassification, EsmTokenizer

logger = logging.getLogger(__name__)

# Longest peptide (in residues) accepted per FASTA record.
MAX_SEQUENCE_LENGTH = 30

# A sequence line may contain only the 20 natural amino acids, upper case.
_AMINO_ACID_RE = re.compile(r'^[ACDEFGHIKLMNPQRSTVWY]+$')


def read_fasta(fasta_string):
    """Parse FASTA text into a list of ``(header, sequence)`` tuples.

    Lines starting with ``>`` are headers; every other non-blank line is a
    piece of the current record's sequence. Blank lines are ignored. A header
    that is not followed by any sequence line is dropped, and sequence lines
    that appear before the first header get an empty header.

    Args:
        fasta_string: The raw FASTA text.

    Returns:
        A list of ``(header, sequence)`` tuples in input order. The header
        keeps its leading ``>``.

    Raises:
        ValueError: If a sequence line contains anything other than the 20
            natural amino-acid letters, or if a record's total sequence is
            longer than ``MAX_SEQUENCE_LENGTH``.
    """
    records = []
    header = ""
    parts = []
    total_len = 0

    for raw_line in fasta_string.strip().splitlines():
        line = raw_line.strip()
        if not line:
            # Blank separator lines are common in pasted FASTA; skip them
            # instead of reporting them as invalid sequences.
            continue

        if line.startswith(">"):
            if parts:
                records.append((header, "".join(parts)))
                parts = []
                total_len = 0
            header = line
            continue

        if not _AMINO_ACID_RE.match(line):
            raise ValueError(
                "Invalid FASTA format: Sequences must contain only natural "
                "amino acids (ACDEFGHIKLMNPQRSTVWY)."
            )

        parts.append(line)
        total_len += len(line)
        # The limit applies to the whole record, so a sequence wrapped over
        # several short lines cannot slip past it.
        if total_len > MAX_SEQUENCE_LENGTH:
            sequence = "".join(parts)
            raise ValueError(
                f"Sequence too long: '{sequence}' "
                f"(Max {MAX_SEQUENCE_LENGTH} characters allowed)."
            )

    if parts:
        records.append((header, "".join(parts)))
    return records


# Load model and tokenizer
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", do_lower_case=False)
model = EsmForSequenceClassification.from_pretrained("model/best_model5")


def predict_peptide_class(peptide_seq):
    """Classify a single peptide sequence with the ESM model.

    Args:
        peptide_seq: The amino-acid sequence as a string.

    Returns:
        A ``(probability, label)`` tuple, where ``probability`` is the sigmoid
        of the logit at index 1 and ``label`` is ``"B3PP"`` when that
        probability is above 0.5, otherwise ``"Non-B3PP"``.
    """
    encoded = tokenizer(peptide_seq, return_tensors='pt')
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**encoded).logits
    # NOTE(review): a sigmoid is applied to the index-1 logit rather than a
    # softmax over both logits; presumably this matches how the checkpoint
    # was trained -- confirm before changing.
    prob = torch.sigmoid(logits)[0, 1].item()
    classification = "B3PP" if prob > 0.5 else "Non-B3PP"
    return prob, classification


def predict_from_fasta(fasta_input):
    """Run the classifier on every record of a FASTA input.

    Args:
        fasta_input: FASTA text as entered in the UI. ``None`` is treated as
            empty input.

    Returns:
        A DataFrame with columns ``Header``, ``Sequence``, ``Probability`` and
        ``Predicted Class`` on success, or a single-column ``Error`` DataFrame
        describing the problem.
    """
    try:
        sequences = read_fasta(fasta_input or "")
        if not sequences:
            # Without this guard an empty input surfaces as an internal
            # pandas column-length error.
            return pd.DataFrame(
                {"Error": ["No sequences found. Paste at least one peptide sequence in FASTA format."]}
            )

        predictions = [predict_peptide_class(seq) for _, seq in sequences]
        df = pd.DataFrame(sequences, columns=['Header', 'Sequence'])
        # Assign the columns separately so Probability stays a float column.
        df['Probability'] = [prob for prob, _ in predictions]
        df['Predicted Class'] = [label for _, label in predictions]
        return df
    except ValueError as e:
        return pd.DataFrame({"Error": [str(e)]})
    except Exception:
        # Top-level UI boundary: keep the user-facing message generic but
        # record the traceback so the failure can be diagnosed.
        logger.exception("Unexpected error while predicting from FASTA input")
        return pd.DataFrame({"Error": ["An unexpected error occurred."]})


iface = gr.Interface(
    fn=predict_from_fasta,
    inputs=gr.Textbox(lines=10, placeholder="Paste your peptide sequences in FASTA format here"),
    outputs=gr.Dataframe(),
    title="B3PP Predictor",
    description="Upload peptide sequences in FASTA format to predict their class. Sequences must contain only natural amino acids in capital letters. Max length: 30 characters."
)

# Launch on Hugging Face Spaces
iface.launch()