uploading some code files
- README.md +2 -60
- convertjsondata.py +64 -0
- dataset_creation.py +71 -0
- db_config.py +5 -0
- finetune_llm.py +242 -0
- inference_code.py +22 -0
- llm_test.py +24 -0
- openvino_inference.py +12 -0
- papers_data.txt +0 -0
- pyproject.toml +33 -0
- text_gen.py +32 -0
README.md
CHANGED
@@ -1,60 +1,2 @@
----
-datasets:
-- qiaojin/PubMedQA
-- kroshan/BioASQ
-language:
-- en
-library_name: transformers
-pipeline_tag: table-question-answering
-tags:
-- chemistry
-- biology
-- molecular
-- synthetic
-- language model
----
-# Description:
-This model is an example of how a fine-tuned LLM, even without the full depth, size, and complexity of larger and more expensive models, can be useful in context-sensitive situations. In our use case, we apply this LLM as part of a broader electronic lab notebook setup for molecular and computational biologists. This GPT-2 has been fine-tuned on datasets from BioASQ and PubMedQA and is now knowledgeable enough in biochemistry to assist scientists, integrating not just as a copilot-like tool but also as a lab partner in the overall Design-Build-Test-Learn workflow that is ever growing in prominence in synthetic biology.
-
-# Intel Optimization Inference Code Sample:
-We made use of both the BF16 data type and INT8 quantization to improve performance. BF16 halves the memory footprint compared to FP32, allowing larger models and/or larger batches to fit into memory. Moreover, BF16 is supported by modern Intel CPUs, and operations on it are optimized. Quantizing models to INT8 reduces model size further, making better use of cache and speeding up load times.
-Additionally, we optimized further with OpenVINO to make the model run better on Intel hardware, by exporting it to ONNX and then converting it to OpenVINO Intermediate Representation.
-
-```
-from openvino.runtime import Core
-import numpy as np
-
-# Initialize the OpenVINO runtime Core
-ie = Core()
-
-# Load and compile the model for the CPU device
-compiled_model = ie.compile_model(model='../ovc_output/converted_model.xml', device_name="CPU")
-
-# Prepare input: a non-tokenized example, just for example's sake
-input_ids = np.random.randint(0, 50256, (1, 10))
-
-# Create a dictionary for the inputs expected by the model
-inputs = {"input_ids": input_ids}
-
-# Create an infer request and start synchronous inference
-result = compiled_model.create_infer_request().infer(inputs=inputs)
-
-# Access output tensor data directly from the result using the appropriate output key
-output = result['outputs']
-
-print("Inference results:", output)
-
-```
-In the fine-tuning file you will see our other optimizations.
-
-We perform BF16 conversion as follows (we also implement a custom collator):
-```
-model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(torch.bfloat16)
-```
-
-We perform INT8 quantization as follows:
-```
-# Load the full-precision model
-model.eval()  # Ensure the model is in evaluation mode
-quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
-```
+# MadMolecool
+Mad Molecool is an all-in-one space for molecular scientists!
convertjsondata.py
ADDED
@@ -0,0 +1,64 @@
+# import json
+#
+#
+# # Function to load JSON data from a file
+# def load_json(filename):
+#     with open(filename, 'r', encoding='utf-8') as file:
+#         return json.load(file)
+#
+#
+# # Function to save text data to a file
+# def save_to_text(data, filename):
+#     with open(filename, 'w', encoding='utf-8') as file:
+#         file.write(data)
+#
+#
+# # Function to process the JSON data into the desired text format
+# def process_data(data):
+#     formatted_text = ""
+#     questions = data.get("questions", [])
+#     for question in questions:
+#         # Remove newline characters and ensure single line string
+#         body = question["body"].replace('\n', ' ').strip()
+#         ideal_answer = " ".join(question["ideal_answer"]).replace('\n', ' ').strip()
+#
+#         # Append the formatted question and answer to the result string
+#         formatted_text += body + "\n" + ideal_answer + "\n\n"
+#
+#     return formatted_text
+#
+#
+# # Main function to orchestrate the JSON to text conversion
+# def main():
+#     json_data = load_json("C:/Users/ibrah/Downloads/BioASQ-training12b/BioASQ-training12b/training12b_new.json")  # Load the JSON data
+#     formatted_data = process_data(json_data)  # Process the data
+#     save_to_text(formatted_data, "formatted_data.txt")  # Save the formatted text
+#
+#
+# if __name__ == "__main__":
+#     main()
+
+import json
+
+def load_json(filename):
+    """Load JSON data from a file."""
+    with open(filename, 'r', encoding='utf-8') as file:
+        return json.load(file)
+
+def save_to_text(data, filename):
+    """Save processed data to a text file."""
+    with open(filename, 'w', encoding='utf-8') as file:
+        for key, value in data.items():
+            question = value['QUESTION'].replace('\n', ' ').strip()
+            long_answer = value['LONG_ANSWER'].replace('\n', ' ').strip()
+            # Write the question and long answer to the file
+            file.write(question + '\n' + long_answer + '\n\n')
+
+def main():
+    # Load the JSON data from a file
+    json_data = load_json("C:/Users/ibrah/Downloads/ori_pqaa.json")
+    # Save the formatted data to a text file
+    save_to_text(json_data, 'formatted_data_pqaa.txt')
+
+if __name__ == "__main__":
+    main()
dataset_creation.py
ADDED
@@ -0,0 +1,71 @@
+from Bio import Entrez
+from bs4 import BeautifulSoup
+import re
+
+
+def remove_latex(text):
+    """ Remove LaTeX from text using regex. """
+    cleaned_text = re.sub(r'\\.+?{.*?}', '', text)
+    cleaned_text = re.sub(r'[{}\\]', '', cleaned_text)
+    cleaned_text = re.sub(r'\b(ABSTRACT|METHODS)\b', '', cleaned_text, flags=re.IGNORECASE)  # Strip section labels
+    return cleaned_text
+
+
+def clean_text(text):
+    """ Clean and prepare text for output. """
+    text = remove_latex(text)
+    text = text.replace('\n', ' ').strip()  # Replace newlines with spaces
+    return ' '.join(text.split())  # Removes excessive whitespace
+
+
+Entrez.email = "[email protected]"  # NCBI requires a contact email for Entrez requests
+
+
+def search_pmc_articles(query, max_papers):
+    handle = Entrez.esearch(db="pmc", term=query, retmax=max_papers)
+    record = Entrez.read(handle)
+    handle.close()
+    return record["IdList"]
+
+
+def fetch_full_text(paper_id):
+    try:
+        handle = Entrez.efetch(db="pmc", id=paper_id, rettype="xml", retmode="xml")
+        xml_data = handle.read()
+        handle.close()
+        return xml_data
+    except Exception as e:
+        print(f"Failed to fetch data for paper ID {paper_id}: {str(e)}")
+        return None
+
+
+# Fetch paper IDs
+query = "molecular biology[Title] AND open access[Filter]"
+max_papers = 1000
+paper_ids = search_pmc_articles(query, max_papers * 20)
+
+# Open a file to write
+with open('papers_data_newbig.txt', 'w', encoding='utf-8') as file:
+    current_papers = 0
+    for paper_id in paper_ids:
+        if current_papers >= max_papers:
+            break
+        xml_full_text = fetch_full_text(paper_id)
+        if not xml_full_text:
+            continue
+
+        soup = BeautifulSoup(xml_full_text, 'xml')
+
+        abstract_text = soup.find('abstract')
+        method_section = soup.find('sec', {'sec-type': 'methods'})
+
+        if abstract_text and method_section:
+            abstract_content = clean_text(abstract_text.get_text())
+            method_content = clean_text(method_section.get_text())
+
+            file.write(
+                f"Here is an example past experiment's abstract which tells you amongst other things the main results of what was studied, what methods do you think were used for it? {abstract_content}\n")
+            file.write(
+                f"{method_content}\n\n")
+
+            current_papers += 1
db_config.py
ADDED
@@ -0,0 +1,5 @@
+from flask import Flask
+from pymongo import MongoClient
+
+client = MongoClient('mongodb+srv://madScientist:[email protected]/')
+db = client['madMolecool']
finetune_llm.py
ADDED
@@ -0,0 +1,242 @@
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, \
+    TrainingArguments
+import logging
+from transformers import logging as hf_logging
+import os
+from torch.quantization import quantize_dynamic
+import torch
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)  # Adjust as per the desired verbosity
+hf_logging.set_verbosity_info()
+hf_logging.enable_default_handler()
+hf_logging.enable_explicit_format()
+
+# Define the custom data collator
+class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
+    def collate_batch(self, features):
+        batch = super().collate_batch(features)
+        batch = {k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
+        return batch
+
+# Use GPT-2 Medium
+model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(torch.bfloat16)
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
+
+train_dataset = TextDataset(
+    tokenizer=tokenizer,
+    file_path="formatted_data_small.txt",
+    block_size=256)  # Increased block size for larger model
+
+# Instantiate the custom data collator
+data_collator = CustomDataCollatorForLanguageModeling(
+    tokenizer=tokenizer, mlm=False
+)
+
+training_args = TrainingArguments(
+    output_dir="./gpt2-medium-finetuned",
+    overwrite_output_dir=True,
+    num_train_epochs=4,
+    per_device_train_batch_size=4,  # Adjusted for potential memory constraints
+    gradient_accumulation_steps=4,  # Increased accumulation to handle larger model size
+    learning_rate=2e-4,
+    save_steps=1_000,
+    save_total_limit=3,
+    logging_dir='./logs',
+    logging_steps=50,
+    fp16=False  # Ensure this is false since we're using bfloat16 manually
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    data_collator=data_collator,
+    train_dataset=train_dataset,
+)
+
+trainer.train()
+
+# Save model and tokenizer
+model_path = "./saved_gpt2_medium_nice_model_directory"
+if not os.path.exists(model_path):
+    os.makedirs(model_path)
+
+model.save_pretrained(model_path)
+tokenizer.save_pretrained(model_path)
+
+# Quantize the fine-tuned model to INT8
+model.eval()  # Ensure the model is in evaluation mode
+quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+quantized_model_path = "./quantized_model_directory"
+if not os.path.exists(quantized_model_path):
+    os.makedirs(quantized_model_path)
+torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_nice_medium_model.pth'))
+
+
+# from transformers import BertForMaskedLM, BertTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+# import logging
+# from transformers import logging as hf_logging
+# import os
+# from torch.quantization import quantize_dynamic
+# import torch
+#
+# # Setup logging
+# logging.basicConfig(level=logging.INFO)
+# hf_logging.set_verbosity_info()
+# hf_logging.enable_default_handler()
+# hf_logging.enable_explicit_format()
+#
+# # Define the custom data collator for masked language modeling
+# class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
+#     def collate_batch(self, features):
+#         batch = super().collate_batch(features)
+#         batch = {k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
+#         return batch
+#
+# # Load BioBERT
+# model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1').to(torch.bfloat16)
+# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
+#
+# # Prepare the dataset
+# train_dataset = TextDataset(
+#     tokenizer=tokenizer,
+#     file_path="papers_data_mountain.txt",
+#     block_size=512)  # Adjust block_size if necessary
+#
+# data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
+#
+# training_args = TrainingArguments(
+#     output_dir="./biobert-finetuned",
+#     overwrite_output_dir=True,
+#     num_train_epochs=4,
+#     per_device_train_batch_size=8,
+#     gradient_accumulation_steps=2,
+#     learning_rate=2e-4,
+#     save_steps=1_000,
+#     save_total_limit=3,
+#     logging_dir='./logs',
+#     logging_steps=50,
+#     fp16=False  # Ensure this is false since we're using bfloat16 manually
+# )
+#
+# trainer = Trainer(
+#     model=model,
+#     args=training_args,
+#     data_collator=data_collator,
+#     train_dataset=train_dataset,
+# )
+#
+# trainer.train()
+#
+# # Save model and tokenizer
+# model_path = "./saved_mountain_model_directory"
+# if not os.path.exists(model_path):
+#     os.makedirs(model_path)
+# model.save_pretrained(model_path)
+# tokenizer.save_pretrained(model_path)
+#
+# # Quantize the model
+# model.eval()  # Ensure the model is in evaluation mode
+# quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+# quantized_model_path = "./quantized_model_directory"
+# if not os.path.exists(quantized_model_path):
+#     os.makedirs(quantized_model_path)
+# torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_mountain_model.pth'))
+
+# from transformers import Phi3Model, Phi3Config, AutoTokenizer, Trainer, TrainingArguments
+# import logging
+# from transformers import logging as hf_logging
+# import os
+# import torch
+# from torch.utils.data import Dataset
+#
+#
+# class TextDataset(Dataset):
+#     def __init__(self, tokenizer, file_path, block_size=512):
+#         self.tokenizer = tokenizer
+#         self.block_size = block_size
+#         self.input_ids = []
+#
+#         # Read and tokenize the file content in chunks
+#         with open(file_path, 'r', encoding='utf-8') as f:
+#             while True:
+#                 text = f.read(1024 * 1024)  # Read approximately 1MB of text at a time
+#                 if not text:
+#                     break
+#                 tokens = tokenizer(text, add_special_tokens=True, truncation=True, max_length=block_size,
+#                                    return_tensors="pt")
+#                 self.input_ids.extend(tokens.input_ids.tolist())
+#
+#     def __len__(self):
+#         # Ensure we return a non-negative value
+#         return max(0, len(self.input_ids) - self.block_size + 1)
+#
+#     def __getitem__(self, idx):
+#         # Ensure the index does not exceed the bounds and forms a proper sequence
+#         input_ids = self.input_ids[idx:idx + self.block_size]
+#         return {"input_ids": torch.tensor(input_ids, dtype=torch.long)}
+#
+#
+#
+#
+# # Setup logging
+# logging.basicConfig(level=logging.INFO)
+# hf_logging.set_verbosity_info()
+# hf_logging.enable_default_handler()
+# hf_logging.enable_explicit_format()
+#
+# # Load Phi-3 model
+# configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+# model = Phi3Model(configuration).to(torch.bfloat16)
+#
+# # Load tokenizer
+# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+#
+#
+# # Initialize the dataset
+# train_dataset = TextDataset(tokenizer, "papers_data_mountain.txt", block_size=512)
+#
+# # Custom data collator function (simplified for generality)
+# def custom_collate_fn(examples):
+#     batch = tokenizer.pad(
+#         examples,
+#         return_tensors='pt',
+#         padding=True,
+#         max_length=512
+#     )
+#     batch = {k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
+#     return batch
+#
+#
+# training_args = TrainingArguments(
+#     output_dir="./phi3-finetuned",
+#     overwrite_output_dir=True,
+#     num_train_epochs=4,
+#     per_device_train_batch_size=8,
+#     gradient_accumulation_steps=2,
+#     learning_rate=2e-4,
+#     save_steps=1_000,
+#     save_total_limit=3,
+#     logging_dir='./logs',
+#     logging_steps=50,
+#     fp16=False  # bfloat16 usage is manual
+# )
+#
+# trainer = Trainer(
+#     model=model,
+#     args=training_args,
+#     data_collator=custom_collate_fn,
+#     train_dataset=train_dataset,
+# )
+#
+# trainer.train()
+#
+# # Save model and tokenizer
+# model_path = "./saved_phi3_model_directory"
+# if not os.path.exists(model_path):
+#     os.makedirs(model_path)
+# model.save_pretrained(model_path)
+# tokenizer.save_pretrained(model_path)
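Note: the INT8 checkpoint written at the end of finetune_llm.py is a state dict of a dynamically quantized model, so reloading it means rebuilding the same quantized module structure first. A minimal sketch of one way to do that (not part of this commit; the gpt2-medium base, the quantization settings, and the path are carried over from the script above):

```
import torch
from torch.quantization import quantize_dynamic
from transformers import GPT2LMHeadModel

# Rebuild the same architecture and apply the same dynamic quantization
# scheme that was used before the state dict was saved.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model.eval()
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Path matches the torch.save call in finetune_llm.py.
state_dict = torch.load('./quantized_model_directory/quantized_nice_medium_model.pth')
quantized_model.load_state_dict(state_dict)
```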
inference_code.py
ADDED
@@ -0,0 +1,22 @@
+from openvino.runtime import Core
+import numpy as np
+
+# Initialize the OpenVINO runtime Core
+ie = Core()
+
+# Load and compile the model for the CPU device
+compiled_model = ie.compile_model(model='../ovc_output/converted_model.xml', device_name="CPU")
+
+# Prepare input: a non-tokenized example, just for example's sake
+input_ids = np.random.randint(0, 50256, (1, 10))
+
+# Create a dictionary for the inputs expected by the model
+inputs = {"input_ids": input_ids}
+
+# Create an infer request and start synchronous inference
+result = compiled_model.create_infer_request().infer(inputs=inputs)
+
+# Access output tensor data directly from the result using the appropriate output key
+output = result['outputs']
+
+print("Inference results:", output)
llm_test.py
ADDED
@@ -0,0 +1,24 @@
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+
+# Load model and tokenizer
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    data = request.json
+    text = data['text']
+
+    # Tokenize and encode the input text
+    inputs = tokenizer.encode(text, return_tensors='pt')
+    outputs = model.generate(inputs, max_length=50)  # Adjust max_length as needed
+
+    # Decode the output tokens to string
+    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return jsonify({'result': text_output})
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=3000)
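For reference, the `/predict` route above expects a JSON body with a `text` field and returns `{'result': ...}`. A minimal client sketch using only the Python standard library (the host and port follow the `app.run` call above; the prompt string is made up for illustration):

```
import json
from urllib import request

payload = json.dumps({"text": "What does CRISPR-Cas9 do?"}).encode("utf-8")
req = request.Request(
    "http://localhost:3000/predict",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with request.urlopen(req) as resp:
    print(json.loads(resp.read())["result"])
```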
openvino_inference.py
ADDED
@@ -0,0 +1,12 @@
+import torch
+from transformers import GPT2LMHeadModel
+
+# Load your pre-trained model
+model = GPT2LMHeadModel.from_pretrained('./saved_gpt2_medium_nice_model_directory')
+model.eval()
+
+# Dummy input for the export
+dummy_input = torch.randint(0, 50256, (1, 10))
+
+# Export the model
+torch.onnx.export(model, dummy_input, "model.onnx", opset_version=11, input_names=['input_ids'], output_names=['outputs'])
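The exported `model.onnx` still has to be converted to OpenVINO Intermediate Representation before `inference_code.py` can load `converted_model.xml`; that conversion step is not included in this commit. A minimal sketch of how it could be done, assuming a recent OpenVINO release that provides `ov.convert_model` / `ov.save_model` (the output path is chosen to match the `../ovc_output/` directory referenced in `inference_code.py`):

```
import openvino as ov

# Convert the ONNX export to an OpenVINO model and save it as IR (.xml + .bin)
ov_model = ov.convert_model("model.onnx")
ov.save_model(ov_model, "../ovc_output/converted_model.xml")
```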
papers_data.txt
ADDED
The diff for this file is too large to render.
See raw diff
pyproject.toml
ADDED
@@ -0,0 +1,33 @@
+[tool.poetry]
+name = "MadMolecool"
+version = "0.1.0"
+description = ""
+authors = ["Ibraheem Khan <[email protected]>, Fiorina Chau <[email protected]>"]
+readme = "README.md"
+
+[[tool.poetry.source]]
+name = "pytorch"
+url = "https://download.pytorch.org/whl/cpu"
+priority = "supplemental"
+
+[tool.poetry.dependencies]
+python = "^3.12"
+flask = "^3.0.3"
+pymongo = "^4.7.0"
+flask-cors = "^4.0.0"
+transformers = "^4.40.1"
+torch = {version = "^2.3.0+cpu", source = "pytorch"}
+torchvision = {version = "^0.18.0+cpu", source = "pytorch"}
+torchaudio = {version = "^2.3.0+cpu", source = "pytorch"}
+bio = "^1.7.0"
+lxml = "^5.2.1"
+bs4 = "^0.0.2"
+accelerate = "^0.29.3"
+
+[tool.poetry.group.dev.dependencies]
+setuptools = "^69.5.1"
+wheel = "^0.43.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
text_gen.py
ADDED
@@ -0,0 +1,32 @@
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+def load_model_and_tokenizer(model_path):
+    model = GPT2LMHeadModel.from_pretrained(model_path)
+    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
+    return model, tokenizer
+
+
+def generate_text(input_text, model, tokenizer):
+    # Encode the input text
+    input_ids = tokenizer.encode(input_text, return_tensors='pt')
+    # Generate output from the model
+    output = model.generate(input_ids, max_length=75, num_return_sequences=1)
+    # Decode and return the output
+    return tokenizer.decode(output[0], skip_special_tokens=True)
+
+
+if __name__ == "__main__":
+    model_path = "./saved_gpt2_medium_nice_model_directory"  # Adjust the path as needed
+    model, tokenizer = load_model_and_tokenizer(model_path)
+
+    # Ensure model is in eval mode
+    model.eval()
+
+    print("Type 'exit' to quit.")
+    while True:
+        input_text = input("Enter your text: ")
+        if input_text.lower() == 'exit':
+            break
+        response = generate_text(input_text, model, tokenizer)
+        print("Generated text:", response)