Spaces:

Bhadralab
/

B3PPs_Predict

Sleeping

File size: 2,724 Bytes

5d3ceee
 
 
 
afe1d29
5d3ceee
 
afe1d29
 
ef06ce0
afe1d29
5d3ceee
 
 
 
afe1d29
 
 
 
 
5d3ceee
afe1d29
 
 
 
 
5d3ceee
 
 
afe1d29
5d3ceee
afe1d29
5d3ceee
afe1d29
 
 
 
5d3ceee
afe1d29
 
34fd3ae
afe1d29
 
 
 
 
5d3ceee
 
 
 
afe1d29
 
 
 
 
 
 
5d3ceee
 
afe1d29
5d3ceee
 
 
 
 
 
 
6a0516f
5d3ceee
 
afe1d29

import logging
import re
from io import StringIO

import gradio as gr
import pandas as pd
import torch
from transformers import EsmForSequenceClassification, EsmTokenizer

# Identifiers handed to `from_pretrained` for the tokenizer and the classifier.
_TOKENIZER_SOURCE = "facebook/esm2_t6_8M_UR50D"
_CLASSIFIER_SOURCE = "model/best_model5"

# Both objects are created once at import time and shared by every request.
tokenizer = EsmTokenizer.from_pretrained(
    _TOKENIZER_SOURCE,
    do_lower_case=False,
)
model = EsmForSequenceClassification.from_pretrained(_CLASSIFIER_SOURCE)

# Put the network in evaluation mode; this app only runs inference.
model.eval()

# One or more of the 20 natural amino-acid letters, uppercase only.
_RESIDUES = re.compile(r'[ACDEFGHIKLMNPQRSTVWY]+')
# Longest peptide (in residues) accepted per FASTA record.
_MAX_PEPTIDE_LENGTH = 30


def _finish_record(header, chunks):
    """Join the residue lines of one record and validate the whole sequence."""
    sequence = "".join(chunks)
    if not sequence:
        raise ValueError(f"Invalid FASTA format: header '{header}' has no sequence.")
    # The limit applies to the complete peptide, not to each wrapped line.
    if len(sequence) > _MAX_PEPTIDE_LENGTH:
        raise ValueError(
            f"Sequence too long: '{sequence}' (Max {_MAX_PEPTIDE_LENGTH} characters allowed)."
        )
    return sequence


def read_fasta(fasta_string):
    """Parse FASTA text into parallel lists of headers and sequences.

    Each record is a line starting with ">" followed by one or more lines of
    residues; wrapped residue lines are concatenated. Blank lines and
    surrounding whitespace (including "\\r") are ignored.

    Args:
        fasta_string: The raw FASTA text.

    Returns:
        A ``(headers, sequences)`` tuple of equally long lists, where
        ``headers[i]`` (including its leading ">") belongs to
        ``sequences[i]``. Both lists are empty when the input holds no
        records.

    Raises:
        ValueError: If residues appear before any header, a header has no
            sequence, a residue line contains characters other than the 20
            uppercase natural amino acids, or a full sequence is longer than
            30 residues.
    """
    headers = []
    sequences = []
    chunks = None  # residue lines of the open record; None until a header is seen

    for raw_line in StringIO(fasta_string):
        line = raw_line.strip()
        if not line:
            # Blank separator or trailing line: not part of any record.
            continue

        if line.startswith(">"):
            # A new header closes the previous record, if there is one.
            if chunks is not None:
                sequences.append(_finish_record(headers[-1], chunks))
            headers.append(line)
            chunks = []
            continue

        if chunks is None:
            raise ValueError(
                "Invalid FASTA format: sequence data found before any '>' header line."
            )
        if not _RESIDUES.fullmatch(line):
            raise ValueError("Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY).")
        chunks.append(line)

    # Close the record that was still open when the input ended.
    if chunks is not None:
        sequences.append(_finish_record(headers[-1], chunks))

    return headers, sequences

def predict_peptide_class(sequences):
    """Score a batch of peptide sequences with the module-level model.

    Args:
        sequences: List of peptide strings to classify in one forward pass.

    Returns:
        A ``(probs, classes)`` tuple: ``probs`` is a NumPy array with one
        score per input sequence (sigmoid of the logit at class index 1) and
        ``classes`` is a list holding "B3PP" where the score exceeds 0.5 and
        "Non-B3PP" otherwise.
    """
    # padding=True is required for batches: peptides of different lengths
    # cannot be stacked into one tensor unless shorter rows are padded. The
    # tokenizer's attention mask is forwarded through **inputs.
    inputs = tokenizer(sequences, return_tensors='pt', padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # NOTE(review): this applies a sigmoid to the class-1 logit alone and
    # ignores the class-0 logit. If the model was trained with cross-entropy
    # over two classes, softmax would be the matching probability - confirm
    # against the training setup before changing.
    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
    return probs, classes

def _error_table(message):
    """Build the one-column table used to show an error message to the user."""
    return pd.DataFrame({"Error": [message]})


def predict_from_fasta(fasta_input):
    """Parse FASTA text, classify every peptide and return a results table.

    This function never raises: every failure is reported as a DataFrame
    with a single "Error" column.

    Args:
        fasta_input: Raw FASTA text.

    Returns:
        A DataFrame with the columns "Header", "Sequence", "Probability" and
        "Predicted Class" on success, or a one-row "Error" table on failure.
    """
    log = logging.getLogger(__name__)

    # Only validation errors from the parser carry user-facing messages, so
    # only those are echoed back verbatim.
    try:
        headers, sequences = read_fasta(fasta_input)
    except ValueError as exc:
        return _error_table(str(exc))
    except Exception:
        log.exception("Unexpected failure while parsing FASTA input")
        return _error_table("An unexpected error occurred.")

    if not sequences:
        return _error_table("No valid sequences found.")
    # The table needs exactly one header per sequence; a mismatch would
    # otherwise surface as an unrelated pandas length error.
    if len(headers) != len(sequences):
        return _error_table(
            "Invalid FASTA format: every sequence needs exactly one '>' header line."
        )

    try:
        probs, classes = predict_peptide_class(sequences)
        return pd.DataFrame({"Header": headers, "Sequence": sequences, "Probability": probs, "Predicted Class": classes})
    except Exception:
        # Keep the generic user message but record the traceback so the
        # failure can be diagnosed from the logs.
        log.exception("Unexpected failure while predicting peptide classes")
        return _error_table("An unexpected error occurred.")

# Input widget: a multi-line text box for the pasted FASTA text.
_fasta_box = gr.Textbox(
    lines=10,
    placeholder="Paste your peptide sequences in FASTA format here",
)
# Output widget: renders the DataFrame returned by predict_from_fasta.
_results_table = gr.Dataframe()

# Wire the prediction function to the two widgets.
iface = gr.Interface(
    fn=predict_from_fasta,
    inputs=_fasta_box,
    outputs=_results_table,
    title="B3PP Predictor",
    description="Submit peptide sequences in FASTA format to determine their potential as blood-brain barrier penetration peptides. Sequences must consist exclusively of natural amino acids in uppercase letters, with a maximum length of 30 characters.",
)

# Start the web app as soon as the module is executed.
iface.launch()