|
--- |
|
library_name: transformers |
|
tags: [] |
|
--- |
|
|
|
# THE MODEL SHOULD NOT BE USED FOR NOW; IT IS STILL IN TESTING |
|
|
|
### How to use |
|
|
|
Until the next release of the `transformers` library, it must be installed from source with the following command in order to use the models. |
|
PyTorch should also be installed. |
|
|
|
```bash |
|
pip install --upgrade git+https://github.com/huggingface/transformers.git |
|
pip install torch |
|
``` |
|
|
|
The snippet below shows how to **generate sequences from a pipeline (high-level)**. |
|
|
|
```python |
|
# Load pipeline |
|
from transformers import pipeline |
|
pipe = pipeline(model="InstaDeepAI/ChatNT-text-generation-pipeline", trust_remote_code=True) |
|
|
|
# Define custom inputs (note that the number of <DNA> tokens in the English sequence must be equal to len(dna_sequences)) |
|
english_sequence = "Is there any evidence of an acceptor splice site in this sequence <DNA> ?" |
|
dna_sequences = ["ATCGGAAAAAGATCCAGAAAGTTATACCAGGCCAATGGGAATCACCTATTACGTGGATAATAGCGATAGTATGTTACCTATAAATTTAACTACGTGGATATCAGGCAGTTACGTTACCAGTCAAGGAGCACCCAAAACTGTCCAGCAACAAGTTAATTTACCCATGAAGATGTACTGCAAGCCTTGCCAACCAGTTAAAGTAGCTACTCATAAGGTAATAAACAGTAATATCGACTTTTTATCCATTTTGATAATTGATTTATAACAGTCTATAACTGATCGCTCTACATAATCTCTATCAGATTACTATTGACACAAACAGAAACCCCGTTAATTTGTATGATATATTTCCCGGTAAGCTTCGATTTTTAATCCTATCGTGACAATTTGGAATGTAACTTATTTCGTATAGGATAAACTAATTTACACGTTTGAATTCCTAGAATATGGAGAATCTAAAGGTCCTGGCAATGCCATCGGCTTTCAATATTATAATGGACCAAAAGTTACTCTATTAGCTTCCAAAACTTCGCGTGAGTACATTAGAACAGAAGAATAACCTTCAATATCGAGAGAGTTACTATCACTAACTATCCTATG"] |
|
|
|
# Generate sequence |
|
generated_english_sequence = pipe( |
|
inputs={ |
|
"english_sequence": english_sequence, |
|
"dna_sequences": dna_sequences |
|
} |
|
) |
|
``` |
|
|
|
The snippet below shows how to **run inference with the model without any abstraction (low-level)**. |
|
|
|
```python |
|
import numpy as np |
|
from transformers import AutoModel, AutoTokenizer |
|
|
|
# Load model and tokenizers |
|
model = AutoModel.from_pretrained("InstaDeepAI/ChatNT", trust_remote_code=True) |
|
english_tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/ChatNT", subfolder="english_tokenizer") |
|
bio_tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/ChatNT", subfolder="bio_tokenizer") |
|
|
|
# Define custom inputs (note that the number of <DNA> tokens in the English sequence must be equal to len(dna_sequences)) |
|
english_sequence = "A chat between a curious user and an artificial intelligence assistant that can handle bio sequences. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Is there any evidence of an acceptor splice site in this sequence <DNA> ?" |
|
dna_sequences = ["ATCGGAAAAAGATCCAGAAAGTTATACCAGGCCAATGGGAATCACCTATTACGTGGATAATAGCGATAGTATGTTACCTATAAATTTAACTACGTGGATATCAGGCAGTTACGTTACCAGTCAAGGAGCACCCAAAACTGTCCAGCAACAAGTTAATTTACCCATGAAGATGTACTGCAAGCCTTGCCAACCAGTTAAAGTAGCTACTCATAAGGTAATAAACAGTAATATCGACTTTTTATCCATTTTGATAATTGATTTATAACAGTCTATAACTGATCGCTCTACATAATCTCTATCAGATTACTATTGACACAAACAGAAACCCCGTTAATTTGTATGATATATTTCCCGGTAAGCTTCGATTTTTAATCCTATCGTGACAATTTGGAATGTAACTTATTTCGTATAGGATAAACTAATTTACACGTTTGAATTCCTAGAATATGGAGAATCTAAAGGTCCTGGCAATGCCATCGGCTTTCAATATTATAATGGACCAAAAGTTACTCTATTAGCTTCCAAAACTTCGCGTGAGTACATTAGAACAGAAGAATAACCTTCAATATCGAGAGAGTTACTATCACTAACTATCCTATG"] |
|
|
|
# Tokenize |
|
english_tokens = english_tokenizer(english_sequence, return_tensors="pt", padding="max_length", truncation=True, max_length=512).input_ids |
|
bio_tokens = bio_tokenizer(dna_sequences, return_tensors="pt", padding="max_length", max_length=512, truncation=True).input_ids.unsqueeze(0) # unsqueeze to simulate batch_size = 1 |
|
|
|
# Predict |
|
outs = model( |
|
multi_omics_tokens_ids=(english_tokens, bio_tokens), |
|
projection_english_tokens_ids=english_tokens, |
|
projected_bio_embeddings=None, |
|
) |
|
``` |