Spaces:
Sleeping
Sleeping
File size: 2,724 Bytes
5d3ceee afe1d29 5d3ceee afe1d29 ef06ce0 afe1d29 5d3ceee afe1d29 5d3ceee afe1d29 5d3ceee afe1d29 5d3ceee afe1d29 5d3ceee afe1d29 5d3ceee afe1d29 34fd3ae afe1d29 5d3ceee afe1d29 5d3ceee afe1d29 5d3ceee 6a0516f 5d3ceee afe1d29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import pandas as pd
import torch
import re
import gradio as gr
from io import StringIO
from transformers import EsmForSequenceClassification, EsmTokenizer
# Tokenizer is taken from the "facebook/esm2_t6_8M_UR50D" checkpoint;
# do_lower_case=False is passed so the upper-case residue letters are kept.
tokenizer = EsmTokenizer.from_pretrained(
    "facebook/esm2_t6_8M_UR50D",
    do_lower_case=False,
)

# Sequence-classification weights are loaded from "model/best_model5"
# (presumably a directory shipped next to this file -- confirm), and the
# module is put into evaluation mode right away since it is only used for
# inference below.
model = EsmForSequenceClassification.from_pretrained("model/best_model5").eval()
# Compiled once: a sequence line may contain only the 20 natural amino acids.
_NATURAL_AMINO_ACIDS = re.compile(r'^[ACDEFGHIKLMNPQRSTVWY]+$')


def read_fasta(fasta_string, max_length=30):
    """Parse FASTA text into parallel lists of headers and validated sequences.

    A record is one line starting with ``>`` followed by one or more sequence
    lines; wrapped sequence lines are concatenated. Blank lines are ignored.

    Args:
        fasta_string: The raw FASTA text (may be empty).
        max_length: Maximum allowed length of a complete sequence, measured
            after wrapped lines are joined. Defaults to 30.

    Returns:
        A ``(headers, sequences)`` tuple of equal-length lists, where
        ``headers[i]`` (including its leading ``>``) belongs to
        ``sequences[i]``. Both lists are empty for empty input.

    Raises:
        ValueError: If a sequence line contains anything other than the
            upper-case natural amino acid letters, if a complete sequence is
            longer than ``max_length``, if sequence data appears before the
            first header, or if a header has no sequence.
    """
    headers = []
    sequences = []
    seq_buffer = []

    def flush_record():
        """Validate and store the sequence collected for the latest header."""
        if not seq_buffer:
            raise ValueError(
                f"Invalid FASTA format: Header '{headers[-1]}' has no sequence."
            )
        sequence = "".join(seq_buffer)
        # The limit applies to the whole sequence, not to each wrapped line.
        if len(sequence) > max_length:
            raise ValueError(
                f"Sequence too long: '{sequence}' (Max {max_length} characters allowed)."
            )
        sequences.append(sequence)
        seq_buffer.clear()

    for raw_line in StringIO(fasta_string):
        line = raw_line.strip()
        if not line:
            # Blank separator lines are not sequence data.
            continue
        if line.startswith(">"):
            if headers:
                flush_record()
            headers.append(line)
            continue
        if not headers:
            # Accepting this would leave headers and sequences misaligned.
            raise ValueError(
                "Invalid FASTA format: Sequence data found before the first '>' header line."
            )
        if not _NATURAL_AMINO_ACIDS.match(line):
            raise ValueError("Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY).")
        seq_buffer.append(line)

    if headers:
        flush_record()
    return headers, sequences
def predict_peptide_class(sequences):
    """Score a batch of peptide sequences with the module-level classifier.

    Args:
        sequences: List of amino-acid sequence strings.

    Returns:
        A ``(probs, classes)`` tuple. ``probs`` is a NumPy array holding, for
        each sequence, the sigmoid of the logit at index 1; ``classes`` is a
        list with ``"B3PP"`` where that value is above 0.5 and ``"Non-B3PP"``
        otherwise.
    """
    # padding=True pads every sequence to the longest one in the batch;
    # without it, sequences of different lengths cannot be stacked into a
    # single tensor for return_tensors='pt'.
    inputs = tokenizer(sequences, return_tensors='pt', padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    # NOTE(review): an independent sigmoid (not softmax) is applied to the
    # logits -- confirm this matches how the checkpoint was trained.
    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
    return probs, classes
def predict_from_fasta(fasta_input):
    """Run the predictor on FASTA text and return the results as a table.

    Args:
        fasta_input: Raw FASTA text submitted by the user.

    Returns:
        A ``pandas.DataFrame``. On success it has the columns ``Header``,
        ``Sequence``, ``Probability`` and ``Predicted Class`` with one row per
        sequence. On failure it has a single ``Error`` column with one
        message; this function does not raise.
    """
    import logging

    try:
        headers, sequences = read_fasta(fasta_input)
        if not sequences:
            return pd.DataFrame({"Error": ["No valid sequences found."]})
        # Guard against misaligned parser output so the user gets a FASTA
        # message rather than pandas' "arrays must be of the same length".
        if len(headers) != len(sequences):
            raise ValueError(
                "Invalid FASTA format: Each sequence must have exactly one '>' header line."
            )
        probs, classes = predict_peptide_class(sequences)
        return pd.DataFrame({"Header": headers, "Sequence": sequences, "Probability": probs, "Predicted Class": classes})
    except ValueError as e:
        # Validation problems carry a user-readable message; show it as is.
        return pd.DataFrame({"Error": [str(e)]})
    except Exception:
        # Top-level UI boundary: keep the message generic for the user, but
        # log the traceback so the failure can be diagnosed.
        logging.getLogger(__name__).exception("Unexpected error while predicting from FASTA input")
        return pd.DataFrame({"Error": ["An unexpected error occurred."]})
# Multi-line text area where the user pastes FASTA records.
_fasta_textbox = gr.Textbox(
    lines=10,
    placeholder="Paste your peptide sequences in FASTA format here",
)

# Table component that displays the DataFrame returned by predict_from_fasta.
_results_table = gr.Dataframe()

# Wire the prediction function to the input and output components.
iface = gr.Interface(
    fn=predict_from_fasta,
    inputs=_fasta_textbox,
    outputs=_results_table,
    title="B3PP Predictor",
    description="Submit peptide sequences in FASTA format to determine their potential as blood-brain barrier penetration peptides. Sequences must consist exclusively of natural amino acids in uppercase letters, with a maximum length of 30 characters.",
)

# Start the web app as soon as this module is executed.
iface.launch()
|