pikhan committed on
Commit
b5f1696
1 Parent(s): f92274c

uploading some code files

README.md CHANGED
@@ -1,60 +1,2 @@
- ---
- datasets:
- - qiaojin/PubMedQA
- - kroshan/BioASQ
- language:
- - en
- library_name: transformers
- pipeline_tag: table-question-answering
- tags:
- - chemistry
- - biology
- - molecular
- - synthetic
- - language model
- ---
- # Description:
- This model is an example of how a fine-tuned LLM, even without the full depth, size, and complexity of larger and more expensive models, can be useful in context-sensitive situations. In our use case, we apply this LLM as part of a broader electronic lab notebook software setup for molecular and computational biologists. This GPT-2 model has been fine-tuned on datasets from BioASQ and PubMedQA and is now knowledgeable enough in biochemistry to assist scientists, integrating not just as a copilot-like tool but also as a lab partner in the Design-Build-Test-Learn workflow that is ever growing in prominence in synthetic biology.
-
- # Intel Optimization Inference Code Sample:
- We made use of both the BF16 data type and INT8 quantization to improve performance. BF16 halves the memory footprint compared to FP32, allowing larger models and/or larger batches to fit into memory; moreover, BF16 is supported by modern Intel CPUs and operations on it are optimized. Quantizing models to INT8 reduces the model size, making better use of cache and speeding up load times.
- Additionally, we optimized further with OpenVINO to run better on Intel hardware by exporting the model to ONNX and then converting it to OpenVINO Intermediate Representation.
-
- ```
- from openvino.runtime import Core
- import numpy as np
-
- # Initialize the OpenVINO runtime Core
- ie = Core()
-
- # Load and compile the model for the CPU device
- compiled_model = ie.compile_model(model='../ovc_output/converted_model.xml', device_name="CPU")
-
- # Prepare input: random token IDs stand in for tokenized text, purely for example's sake
- input_ids = np.random.randint(0, 50256, (1, 10))
-
- # Create a dictionary for the inputs expected by the model
- inputs = {"input_ids": input_ids}
-
- # Create an infer request and run synchronous inference
- result = compiled_model.create_infer_request().infer(inputs=inputs)
-
- # Access output tensor data directly from the result using the appropriate output key
- output = result['outputs']
-
- print("Inference results:", output)
-
- ```
- In the fine-tuning file you will see our other optimizations.
-
- We perform BF16 conversion as follows (we also implement a custom collator):
- ```
- model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(torch.bfloat16)
- ```
-
- We perform INT8 quantization as follows:
- ```
- # Quantize the fine-tuned model with dynamic INT8 quantization
- model.eval()  # Ensure the model is in evaluation mode
- quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
- ```

+ # MadMolecool
+ Mad Molecool is an all-in-one space for molecular scientists!
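The removed README describes exporting the model to ONNX and converting it to OpenVINO Intermediate Representation, but the conversion step itself is not included in this commit. A minimal sketch of that step, assuming a recent OpenVINO release that provides the `openvino.convert_model` and `openvino.save_model` Python API and the `model.onnx` file produced by `openvino_inference.py` (the output path is an assumption chosen to match `inference_code.py`):

```
import openvino as ov

# Convert the exported ONNX graph to an OpenVINO model in memory
ov_model = ov.convert_model("model.onnx")

# Serialize to the Intermediate Representation (.xml + .bin) that
# inference_code.py loads from ../ovc_output/
ov.save_model(ov_model, "../ovc_output/converted_model.xml")
```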
convertjsondata.py ADDED
@@ -0,0 +1,64 @@
+ # import json
+ #
+ #
+ # # Function to load JSON data from a file
+ # def load_json(filename):
+ #     with open(filename, 'r', encoding='utf-8') as file:
+ #         return json.load(file)
+ #
+ #
+ # # Function to save text data to a file
+ # def save_to_text(data, filename):
+ #     with open(filename, 'w', encoding='utf-8') as file:
+ #         file.write(data)
+ #
+ #
+ # # Function to process the JSON data into the desired text format
+ # def process_data(data):
+ #     formatted_text = ""
+ #     questions = data.get("questions", [])
+ #     for question in questions:
+ #         # Remove newline characters and ensure single line string
+ #         body = question["body"].replace('\n', ' ').strip()
+ #         ideal_answer = " ".join(question["ideal_answer"]).replace('\n', ' ').strip()
+ #
+ #         # Append the formatted question and answer to the result string
+ #         formatted_text += body + "\n" + ideal_answer + "\n\n"
+ #
+ #     return formatted_text
+ #
+ #
+ # # Main function to orchestrate the JSON to text conversion
+ # def main():
+ #     json_data = load_json("C:/Users/ibrah/Downloads/BioASQ-training12b/BioASQ-training12b/training12b_new.json")  # Load the JSON data
+ #     formatted_data = process_data(json_data)  # Process the data
+ #     save_to_text(formatted_data, "formatted_data.txt")  # Save the formatted text
+ #
+ #
+ # if __name__ == "__main__":
+ #     main()
+
+ import json
+
+ def load_json(filename):
+     """Load JSON data from a file."""
+     with open(filename, 'r', encoding='utf-8') as file:
+         return json.load(file)
+
+ def save_to_text(data, filename):
+     """Save processed data to a text file."""
+     with open(filename, 'w', encoding='utf-8') as file:
+         for key, value in data.items():
+             question = value['QUESTION'].replace('\n', ' ').strip()
+             long_answer = value['LONG_ANSWER'].replace('\n', ' ').strip()
+             # Write the question and long answer to the file
+             file.write(question + '\n' + long_answer + '\n\n')
+
+ def main():
+     # Load the JSON data from a file
+     json_data = load_json("C:/Users/ibrah/Downloads/ori_pqaa.json")
+     # Save the formatted data to a text file
+     save_to_text(json_data, 'formatted_data_pqaa.txt')
+
+ if __name__ == "__main__":
+     main()
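For reference, the converter above assumes the PubMedQA `ori_pqaa.json` layout of one object per article ID with `QUESTION` and `LONG_ANSWER` fields. A hypothetical entry and the question/answer line pairs it is flattened into (illustrative values only, not taken from the dataset):

```
# Hypothetical shape of one ori_pqaa.json entry (illustrative values only)
example = {
    "12345678": {
        "QUESTION": "Does compound X inhibit enzyme Y?",
        "LONG_ANSWER": "Compound X showed strong inhibition of enzyme Y in vitro.",
    }
}

# save_to_text() writes each entry as two lines followed by a blank line:
#   Does compound X inhibit enzyme Y?
#   Compound X showed strong inhibition of enzyme Y in vitro.
```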
dataset_creation.py ADDED
@@ -0,0 +1,71 @@
+ from Bio import Entrez
+ from bs4 import BeautifulSoup
+ import re
+
+
+ def remove_latex(text):
+     """Remove LaTeX markup from text using regexes."""
+     cleaned_text = re.sub(r'\\.+?{.*?}', '', text)
+     cleaned_text = re.sub(r'[{}\\]', '', cleaned_text)
+     cleaned_text = re.sub(r'\b(ABSTRACT|METHODS)\b', '', cleaned_text, flags=re.IGNORECASE)  # Strip leftover section labels
+     return cleaned_text
+
+
+ def clean_text(text):
+     """Clean and prepare text for output."""
+     text = remove_latex(text)
+     text = text.replace('\n', ' ').strip()  # Replace newlines with spaces
+     return ' '.join(text.split())  # Remove excessive whitespace
+
+
+ Entrez.email = "[email protected]"  # NCBI requires a contact email for Entrez requests
+
+
+ def search_pmc_articles(query, max_papers):
+     """Return up to max_papers PMC IDs matching the query."""
+     handle = Entrez.esearch(db="pmc", term=query, retmax=max_papers)
+     record = Entrez.read(handle)
+     handle.close()
+     return record["IdList"]
+
+
+ def fetch_full_text(paper_id):
+     """Fetch the full-text XML for one PMC article, or None on failure."""
+     try:
+         handle = Entrez.efetch(db="pmc", id=paper_id, rettype="xml", retmode="xml")
+         xml_data = handle.read()
+         handle.close()
+         return xml_data
+     except Exception as e:
+         print(f"Failed to fetch data for paper ID {paper_id}: {str(e)}")
+         return None
+
+
+ # Fetch paper IDs (request extra IDs since not every record has both an abstract and a methods section)
+ query = "molecular biology[Title] AND open access[Filter]"
+ max_papers = 1000
+ paper_ids = search_pmc_articles(query, max_papers * 20)
+
+ # Open a file to write
+ with open('papers_data_newbig.txt', 'w', encoding='utf-8') as file:
+     current_papers = 0
+     for paper_id in paper_ids:
+         if current_papers >= max_papers:
+             break
+         xml_full_text = fetch_full_text(paper_id)
+         if not xml_full_text:
+             continue
+
+         soup = BeautifulSoup(xml_full_text, 'xml')
+
+         abstract_text = soup.find('abstract')
+         method_section = soup.find('sec', {'sec-type': 'methods'})
+
+         if abstract_text and method_section:
+             abstract_content = clean_text(abstract_text.get_text())
+             method_content = clean_text(method_section.get_text())
+
+             file.write(
+                 f"Here is an example past experiment's abstract which tells you amongst other things the main results of what was studied, what methods do you think were used for it? {abstract_content}\n")
+             file.write(
+                 f"{method_content}\n\n")
+
+             current_papers += 1
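One practical note on the harvester above: NCBI throttles anonymous E-utilities clients to roughly three requests per second, and to about ten per second with an API key. A minimal sketch of a politer fetch helper, assuming you have an NCBI API key (the email, the key value, and the 0.2 s delay are all placeholders):

```
import time
from Bio import Entrez

Entrez.email = "[email protected]"    # contact email required by NCBI (placeholder)
Entrez.api_key = "YOUR_NCBI_API_KEY"   # placeholder; an API key raises the rate limit

def fetch_full_text_politely(paper_id):
    """Fetch one PMC record, pausing briefly so a long crawl stays under the rate cap."""
    handle = Entrez.efetch(db="pmc", id=paper_id, rettype="xml", retmode="xml")
    xml_data = handle.read()
    handle.close()
    time.sleep(0.2)  # placeholder delay between requests
    return xml_data
```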
db_config.py ADDED
@@ -0,0 +1,5 @@
+ from flask import Flask
+ from pymongo import MongoClient
+
+ client = MongoClient('mongodb+srv://madScientist:[email protected]/')
+ db = client['madMolecool']
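db_config.py hard-codes the MongoDB connection string. As a hedged alternative sketch, the URI can be read from an environment variable so credentials stay out of version control (`MONGODB_URI` is a hypothetical variable name, not something this project defines):

```
import os
from pymongo import MongoClient

# MONGODB_URI is a hypothetical environment variable; export it before starting the app
client = MongoClient(os.environ["MONGODB_URI"])
db = client['madMolecool']
```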
finetune_llm.py ADDED
@@ -0,0 +1,242 @@
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, \
+     TrainingArguments
+ import logging
+ from transformers import logging as hf_logging
+ import os
+ from torch.quantization import quantize_dynamic
+ import torch
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)  # Adjust as per the desired verbosity
+ hf_logging.set_verbosity_info()
+ hf_logging.enable_default_handler()
+ hf_logging.enable_explicit_format()
+
+ # Define the custom data collator: cast any floating-point tensors in the batch to BF16.
+ # Recent transformers releases invoke the collator via __call__ (not the old collate_batch hook),
+ # and integer tensors such as input_ids and labels must stay int64 for the embedding lookup.
+ class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
+     def __call__(self, features):
+         batch = super().__call__(features)
+         batch = {k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) and v.is_floating_point() else v
+                  for k, v in batch.items()}
+         return batch
+
+ # Use GPT-2 Medium
+ model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(torch.bfloat16)
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
+
+ train_dataset = TextDataset(
+     tokenizer=tokenizer,
+     file_path="formatted_data_small.txt",
+     block_size=256)  # Increased block size for the larger model
+
+ # Instantiate the custom data collator
+ data_collator = CustomDataCollatorForLanguageModeling(
+     tokenizer=tokenizer, mlm=False
+ )
+
+ training_args = TrainingArguments(
+     output_dir="./gpt2-medium-finetuned",
+     overwrite_output_dir=True,
+     num_train_epochs=4,
+     per_device_train_batch_size=4,  # Adjusted for potential memory constraints
+     gradient_accumulation_steps=4,  # Increased accumulation to handle the larger model size
+     learning_rate=2e-4,
+     save_steps=1_000,
+     save_total_limit=3,
+     logging_dir='./logs',
+     logging_steps=50,
+     fp16=False  # Keep this False since we cast to bfloat16 manually
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     data_collator=data_collator,
+     train_dataset=train_dataset,
+ )
+
+ trainer.train()
+
+ # Save model and tokenizer
+ model_path = "./saved_gpt2_medium_nice_model_directory"
+ if not os.path.exists(model_path):
+     os.makedirs(model_path)
+
+ model.save_pretrained(model_path)
+ tokenizer.save_pretrained(model_path)
+
+ # Quantize the fine-tuned model to INT8 with dynamic quantization
+ model.eval()  # Ensure the model is in evaluation mode
+ quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+ quantized_model_path = "./quantized_model_directory"
+ if not os.path.exists(quantized_model_path):
+     os.makedirs(quantized_model_path)
+ torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_nice_medium_model.pth'))
+
+
+ # from transformers import BertForMaskedLM, BertTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+ # import logging
+ # from transformers import logging as hf_logging
+ # import os
+ # from torch.quantization import quantize_dynamic
+ # import torch
+ #
+ # # Setup logging
+ # logging.basicConfig(level=logging.INFO)
+ # hf_logging.set_verbosity_info()
+ # hf_logging.enable_default_handler()
+ # hf_logging.enable_explicit_format()
+ #
+ # # Define the custom data collator for masked language modeling
+ # class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
+ #     def collate_batch(self, features):
+ #         batch = super().collate_batch(features)
+ #         batch = {k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
+ #         return batch
+ #
+ # # Load BioBERT
+ # model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1').to(torch.bfloat16)
+ # tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
+ #
+ # # Prepare the dataset
+ # train_dataset = TextDataset(
+ #     tokenizer=tokenizer,
+ #     file_path="papers_data_mountain.txt",
+ #     block_size=512)  # Adjust block_size if necessary
+ #
+ # data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
+ #
+ # training_args = TrainingArguments(
+ #     output_dir="./biobert-finetuned",
+ #     overwrite_output_dir=True,
+ #     num_train_epochs=4,
+ #     per_device_train_batch_size=8,
+ #     gradient_accumulation_steps=2,
+ #     learning_rate=2e-4,
+ #     save_steps=1_000,
+ #     save_total_limit=3,
+ #     logging_dir='./logs',
+ #     logging_steps=50,
+ #     fp16=False  # Ensure this is false since we're using bfloat16 manually
+ # )
+ #
+ # trainer = Trainer(
+ #     model=model,
+ #     args=training_args,
+ #     data_collator=data_collator,
+ #     train_dataset=train_dataset,
+ # )
+ #
+ # trainer.train()
+ #
+ # # Save model and tokenizer
+ # model_path = "./saved_mountain_model_directory"
+ # if not os.path.exists(model_path):
+ #     os.makedirs(model_path)
+ # model.save_pretrained(model_path)
+ # tokenizer.save_pretrained(model_path)
+ #
+ # # Quantize the model
+ # model.eval()  # Ensure the model is in evaluation mode
+ # quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+ # quantized_model_path = "./quantized_model_directory"
+ # if not os.path.exists(quantized_model_path):
+ #     os.makedirs(quantized_model_path)
+ # torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_mountain_model.pth'))
+
+ # from transformers import Phi3Model, Phi3Config, AutoTokenizer, Trainer, TrainingArguments
+ # import logging
+ # from transformers import logging as hf_logging
+ # import os
+ # import torch
+ # from torch.utils.data import Dataset
+ #
+ #
+ # class TextDataset(Dataset):
+ #     def __init__(self, tokenizer, file_path, block_size=512):
+ #         self.tokenizer = tokenizer
+ #         self.block_size = block_size
+ #         self.input_ids = []
+ #
+ #         # Read and tokenize the file content in chunks
+ #         with open(file_path, 'r', encoding='utf-8') as f:
+ #             while True:
+ #                 text = f.read(1024 * 1024)  # Read approximately 1MB of text at a time
+ #                 if not text:
+ #                     break
+ #                 tokens = tokenizer(text, add_special_tokens=True, truncation=True, max_length=block_size,
+ #                                    return_tensors="pt")
+ #                 self.input_ids.extend(tokens.input_ids.tolist())
+ #
+ #     def __len__(self):
+ #         # Ensure we return a non-negative value
+ #         return max(0, len(self.input_ids) - self.block_size + 1)
+ #
+ #     def __getitem__(self, idx):
+ #         # Ensure the index does not exceed the bounds and forms a proper sequence
+ #         input_ids = self.input_ids[idx:idx + self.block_size]
+ #         return {"input_ids": torch.tensor(input_ids, dtype=torch.long)}
+ #
+ #
+ #
+ #
+ # # Setup logging
+ # logging.basicConfig(level=logging.INFO)
+ # hf_logging.set_verbosity_info()
+ # hf_logging.enable_default_handler()
+ # hf_logging.enable_explicit_format()
+ #
+ # # Load Phi-3 model
+ # configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+ # model = Phi3Model(configuration).to(torch.bfloat16)
+ #
+ # # Load tokenizer
+ # tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+ #
+ #
+ # # Initialize the dataset
+ # train_dataset = TextDataset(tokenizer, "papers_data_mountain.txt", block_size=512)
+ #
+ # # Custom data collator function (simplified for generality)
+ # def custom_collate_fn(examples):
+ #     batch = tokenizer.pad(
+ #         examples,
+ #         return_tensors='pt',
+ #         padding=True,
+ #         max_length=512
+ #     )
+ #     batch = {k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
+ #     return batch
+ #
+ #
+ # training_args = TrainingArguments(
+ #     output_dir="./phi3-finetuned",
+ #     overwrite_output_dir=True,
+ #     num_train_epochs=4,
+ #     per_device_train_batch_size=8,
+ #     gradient_accumulation_steps=2,
+ #     learning_rate=2e-4,
+ #     save_steps=1_000,
+ #     save_total_limit=3,
+ #     logging_dir='./logs',
+ #     logging_steps=50,
+ #     fp16=False  # bfloat16 usage is manual
+ # )
+ #
+ # trainer = Trainer(
+ #     model=model,
+ #     args=training_args,
+ #     data_collator=custom_collate_fn,
+ #     train_dataset=train_dataset,
+ # )
+ #
+ # trainer.train()
+ #
+ # # Save model and tokenizer
+ # model_path = "./saved_phi3_model_directory"
+ # if not os.path.exists(model_path):
+ #     os.makedirs(model_path)
+ # model.save_pretrained(model_path)
+ # tokenizer.save_pretrained(model_path)
+
+
+
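finetune_llm.py saves only the INT8 state dict of the dynamically quantized model. A minimal sketch of loading that checkpoint back for CPU inference, under the assumption that the same architecture is re-created and the same dynamic-quantization recipe is re-applied before the weights are loaded (paths match the ones used above):

```
import torch
from torch.quantization import quantize_dynamic
from transformers import GPT2LMHeadModel

# Re-create the architecture, re-apply the same quantization recipe,
# then load the saved INT8 weights into the quantized module structure
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model.eval()
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
state_dict = torch.load('./quantized_model_directory/quantized_nice_medium_model.pth')
quantized_model.load_state_dict(state_dict)
```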
inference_code.py ADDED
@@ -0,0 +1,22 @@
+ from openvino.runtime import Core
+ import numpy as np
+
+ # Initialize the OpenVINO runtime Core
+ ie = Core()
+
+ # Load and compile the model for the CPU device
+ compiled_model = ie.compile_model(model='../ovc_output/converted_model.xml', device_name="CPU")
+
+ # Prepare input: random token IDs stand in for tokenized text, purely for example's sake
+ input_ids = np.random.randint(0, 50256, (1, 10))
+
+ # Create a dictionary for the inputs expected by the model
+ inputs = {"input_ids": input_ids}
+
+ # Create an infer request and run synchronous inference
+ result = compiled_model.create_infer_request().infer(inputs=inputs)
+
+ # Access output tensor data directly from the result using the appropriate output key
+ output = result['outputs']
+
+ print("Inference results:", output)
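For an end-to-end check with real text instead of random IDs, a minimal sketch along the same lines, assuming the tokenizer directory saved by finetune_llm.py is available and the ONNX export used dynamic sequence lengths (the prompt text is just an example):

```
from openvino.runtime import Core
from transformers import GPT2Tokenizer
import numpy as np

# Tokenize a real prompt with the tokenizer saved by finetune_llm.py
tokenizer = GPT2Tokenizer.from_pretrained('./saved_gpt2_medium_nice_model_directory')
input_ids = tokenizer.encode("What does PCR amplify?", return_tensors='np').astype(np.int64)

ie = Core()
compiled_model = ie.compile_model(model='../ovc_output/converted_model.xml', device_name="CPU")
result = compiled_model.create_infer_request().infer({"input_ids": input_ids})

# 'outputs' holds the logits; greedily pick the most likely next token
logits = result['outputs']
next_token_id = int(np.argmax(logits[0, -1]))
print("Most likely next token:", tokenizer.decode([next_token_id]))
```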
llm_test.py ADDED
@@ -0,0 +1,24 @@
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
+ from flask import Flask, request, jsonify
+
+ app = Flask(__name__)
+
+ # Load model and tokenizer
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     data = request.json
+     text = data['text']
+
+     # Tokenize and encode the input text
+     inputs = tokenizer.encode(text, return_tensors='pt')
+     outputs = model.generate(inputs, max_length=50)  # Adjust max_length as needed
+
+     # Decode the output tokens to string
+     text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return jsonify({'result': text_output})
+
+ if __name__ == '__main__':
+     app.run(host='0.0.0.0', port=3000)
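Once llm_test.py is running locally on port 3000, the endpoint can be exercised with a small client sketch (assuming the `requests` package, which is not listed in pyproject.toml; the prompt is just an example):

```
import requests

# POST a prompt to the local /predict endpoint started by llm_test.py
response = requests.post(
    "http://localhost:3000/predict",
    json={"text": "CRISPR-Cas9 is used to"},
)
print(response.json()["result"])
```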
openvino_inference.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from transformers import GPT2LMHeadModel
+
+ # Load your pre-trained model
+ model = GPT2LMHeadModel.from_pretrained('./saved_gpt2_medium_nice_model_directory')
+ model.eval()
+
+ # Dummy input for the export
+ dummy_input = torch.randint(0, 50256, (1, 10))
+
+ # Export the model; dynamic_axes lets the exported graph accept any batch size and sequence length
+ torch.onnx.export(model, dummy_input, "model.onnx", opset_version=11,
+                   input_names=['input_ids'], output_names=['outputs'],
+                   dynamic_axes={'input_ids': {0: 'batch', 1: 'sequence'},
+                                 'outputs': {0: 'batch', 1: 'sequence'}})
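Before converting the export to OpenVINO IR, it can be worth a quick sanity check that the ONNX graph loads and produces logits. A minimal sketch using onnxruntime (an extra dependency not listed in pyproject.toml):

```
import numpy as np
import onnxruntime as ort

# Run the exported graph once with a dummy batch to confirm it loads and returns logits
session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
dummy = np.random.randint(0, 50256, (1, 10), dtype=np.int64)
outputs = session.run(None, {"input_ids": dummy})
print("Logits shape:", outputs[0].shape)  # expected: (1, 10, vocab_size)
```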
papers_data.txt ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,33 @@
+ [tool.poetry]
+ name = "MadMolecool"
+ version = "0.1.0"
+ description = ""
+ authors = ["Ibraheem Khan <[email protected]>", "Fiorina Chau <[email protected]>"]
+ readme = "README.md"
+
+ [[tool.poetry.source]]
+ name = "pytorch"
+ url = "https://download.pytorch.org/whl/cpu"
+ priority = "supplemental"
+
+ [tool.poetry.dependencies]
+ python = "^3.12"
+ flask = "^3.0.3"
+ pymongo = "^4.7.0"
+ flask-cors = "^4.0.0"
+ transformers = "^4.40.1"
+ torch = {version = "^2.3.0+cpu", source = "pytorch"}
+ torchvision = {version = "^0.18.0+cpu", source = "pytorch"}
+ torchaudio = {version = "^2.3.0+cpu", source = "pytorch"}
+ bio = "^1.7.0"
+ lxml = "^5.2.1"
+ bs4 = "^0.0.2"
+ accelerate = "^0.29.3"
+
+ [tool.poetry.group.dev.dependencies]
+ setuptools = "^69.5.1"
+ wheel = "^0.43.0"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
text_gen.py ADDED
@@ -0,0 +1,32 @@
+ import torch
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+ def load_model_and_tokenizer(model_path):
+     model = GPT2LMHeadModel.from_pretrained(model_path)
+     tokenizer = GPT2Tokenizer.from_pretrained(model_path)
+     return model, tokenizer
+
+
+ def generate_text(input_text, model, tokenizer):
+     # Encode the input text
+     input_ids = tokenizer.encode(input_text, return_tensors='pt')
+     # Generate output from the model
+     output = model.generate(input_ids, max_length=75, num_return_sequences=1)
+     # Decode and return the output
+     return tokenizer.decode(output[0], skip_special_tokens=True)
+
+
+ if __name__ == "__main__":
+     model_path = "./saved_gpt2_medium_nice_model_directory"  # Adjust the path as needed
+     model, tokenizer = load_model_and_tokenizer(model_path)
+
+     # Ensure model is in eval mode
+     model.eval()
+
+     print("Type 'exit' to quit.")
+     while True:
+         input_text = input("Enter your text: ")
+         if input_text.lower() == 'exit':
+             break
+         response = generate_text(input_text, model, tokenizer)
+         print("Generated text:", response)