File size: 6,451 Bytes
60f8cd4 555360f 60f8cd4 555360f 60f8cd4 555360f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
import json
from os import close
from pathlib import Path
from azure.cosmos import CosmosClient, PartitionKey, exceptions
from transformers import DistilBertTokenizerFast
import torch
from transformers import DistilBertForQuestionAnswering, AdamW
from torch.utils.data import DataLoader
import subprocess
class Model:
    """Fine-tune DistilBERT for extractive question answering on Cosmos DB data.

    The pipeline, driven by ``ModelExecution``, is: read items from a Cosmos DB
    container, split them into contexts/questions/answers, compute character
    end offsets for every answer, tokenize, convert character offsets into
    token positions, train, save the model and tokenizer to the current
    directory and finally push the working tree with git.
    """

    def __init__(self) -> None:
        """Create the Cosmos DB client; the tokenizer is set later by ``Tokenizer``."""
        # NOTE(review): the account key is hard-coded in source control. It
        # should be rotated and loaded from configuration/environment instead.
        self.endPoint = "https://productdevelopmentstorage.documents.azure.com:443/"
        self.primaryKey = "nVds9dPOkPuKu8RyWqigA1DIah4SVZtl1DIM0zDuRKd95an04QC0qv9TQIgrdtgluZo7Z0HXACFQgKgOQEAx1g=="
        self.client = CosmosClient(self.endPoint, self.primaryKey)
        self.tokenizer = None

    def GetData(self, type):
        """Return every item of container ``type`` in the ``squadstorage`` database.

        Args:
            type: Name of the Cosmos DB container to read. (The name shadows
                the ``type`` builtin; it is kept because callers may pass it
                by keyword.)

        Returns:
            A list with the items yielded by ``read_all_items``.
        """
        database = self.client.get_database_client("squadstorage")
        container = database.get_container_client(type)
        # NOTE(review): max_item_count is presumably a per-page size hint, not
        # a cap on the total number of items - confirm against azure-cosmos.
        return list(container.read_all_items(max_item_count=10))

    def ArrangeData(self, type):
        """Split the items of container ``type`` into three parallel lists.

        Args:
            type: Container name, forwarded to ``GetData``.

        Returns:
            ``(contexts, questions, answers)`` built from the ``"context"``,
            ``"question"`` and ``"answers"`` keys of every item.
        """
        contexts = []
        questions = []
        answers = []
        for item in self.GetData(type):
            contexts.append(item["context"])
            questions.append(item["question"])
            answers.append(item["answers"])
        return contexts, questions, answers

    @staticmethod
    def _answer_start(answer):
        """Return the first start offset of ``answer``, or None if it has none.

        Accepts both the list form (``{'answer_start': [12]}``) and a bare
        integer, so answers rewritten by older code remain readable.
        """
        start = answer.get('answer_start')
        if isinstance(start, (list, tuple)):
            return start[0] if start else None
        return start

    def add_end_idx(self, answers, contexts):
        """Add an ``'answer_end'`` character offset to every answer, in place.

        The labelled start offset is tried as given and then shifted left by
        one and by two characters, because gold labels are sometimes off by
        that much. On a shifted match ``'answer_start'`` is corrected too and
        is written back as a one-element list, matching the shape that
        ``add_token_positions`` reads.

        Answers with no text/start offset, and answers whose text cannot be
        located at any of the three offsets, are left without ``'answer_end'``
        so that ``add_token_positions`` can treat them as having no span.

        Args:
            answers: List of dicts with list-valued ``'text'`` and
                ``'answer_start'`` entries; only the first element is used.
            contexts: List of context strings, parallel to ``answers``.

        Returns:
            The same ``(answers, contexts)`` objects that were passed in.
        """
        for answer, context in zip(answers, contexts):
            texts = answer.get('text')
            start_idx = self._answer_start(answer)
            if not texts or start_idx is None:
                # Nothing to align (e.g. an unanswerable question).
                continue
            gold_text = texts[0]
            span = len(gold_text)
            for shift in (0, 1, 2):
                start = start_idx - shift
                if start < 0:
                    # A negative index would wrap around to the end of the
                    # context instead of meaning "before the beginning".
                    break
                if context[start:start + span] == gold_text:
                    if shift:
                        answer['answer_start'] = [start]
                    answer['answer_end'] = start + span
                    break
        return answers, contexts

    def Tokenizer(self, train_contexts, train_questions, val_contexts, val_questions):
        """Load the DistilBERT tokenizer and encode both data splits.

        The tokenizer is stored on ``self.tokenizer`` for later use.

        Returns:
            ``(train_encodings, val_encodings)``, each produced by calling the
            tokenizer on (contexts, questions) with truncation and padding.
        """
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        train_encodings = self.tokenizer(train_contexts, train_questions, truncation=True, padding=True)
        val_encodings = self.tokenizer(val_contexts, val_questions, truncation=True, padding=True)
        return train_encodings, val_encodings

    def add_token_positions(self, encodings, answers):
        """Attach token-level ``start_positions`` / ``end_positions`` to ``encodings``.

        Character offsets are converted with ``encodings.char_to_token``. When
        the conversion returns ``None`` (the answer passage has been
        truncated) the position is set to ``self.tokenizer.model_max_length``.
        The same sentinel is used for both positions of an answer that has no
        usable span (no start offset, or no ``'answer_end'``).

        Args:
            encodings: Tokenizer output for the contexts; must provide
                ``char_to_token(batch_index, char_index)`` and ``update``.
            answers: Answers already processed by ``add_end_idx``.

        Returns:
            The same ``encodings`` object, updated in place.
        """
        out_of_range = self.tokenizer.model_max_length
        start_positions = []
        end_positions = []
        for i, answer in enumerate(answers):
            start_char = self._answer_start(answer)
            end_char = answer.get('answer_end')
            if start_char is None or end_char is None:
                start_tok = end_tok = None
            else:
                start_tok = encodings.char_to_token(i, start_char)
                # answer_end is exclusive, so the last answer character is at end - 1.
                end_tok = encodings.char_to_token(i, end_char - 1)
            start_positions.append(out_of_range if start_tok is None else start_tok)
            end_positions.append(out_of_range if end_tok is None else end_tok)
        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
        return encodings

    # train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
    # val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')
    def ModelExecution(self):
        """Run the whole pipeline: load data, train, save and push.

        Side effects: prints the training answers, the epoch numbers and
        "Done"; writes the model and tokenizer to ``./``; runs ``git add``,
        ``git status``, ``git commit`` and ``git push`` in the current working
        directory. The git return codes are not checked.
        """
        # NOTE(review): training and validation are read from the same container.
        train_contexts, train_questions, train_answers = self.ArrangeData("livecheckcontainer")
        val_contexts, val_questions, val_answers = self.ArrangeData("livecheckcontainer")
        print(train_answers)

        train_answers, train_contexts = self.add_end_idx(train_answers, train_contexts)
        val_answers, val_contexts = self.add_end_idx(val_answers, val_contexts)

        train_encodings, val_encodings = self.Tokenizer(train_contexts, train_questions, val_contexts, val_questions)
        train_encodings = self.add_token_positions(train_encodings, train_answers)
        val_encodings = self.add_token_positions(val_encodings, val_answers)

        train_dataset = SquadDataset(train_encodings)
        # Built for parity with the training split; no evaluation loop uses it yet.
        val_dataset = SquadDataset(val_encodings)

        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(device)
        model.train()

        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        optim = AdamW(model.parameters(), lr=5e-5)

        for epoch in range(2):
            print(epoch)
            for batch in train_loader:
                optim.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
                # The loss is the first element of the output when positions are given.
                loss = outputs[0]
                loss.backward()
                optim.step()
        print("Done")

        model.eval()
        model.save_pretrained("./")
        self.tokenizer.save_pretrained("./")

        # Publish the saved artefacts through the surrounding git repository.
        subprocess.call(["git", "add","--all"])
        subprocess.call(["git", "status"])
        subprocess.call(["git", "commit", "-m", "First version of the your-model-name model and tokenizer."])
        subprocess.call(["git", "push"])
class SquadDataset(torch.utils.data.Dataset):
    """Torch dataset view over tokenizer encodings.

    ``encodings`` is any mapping from field name to a per-example sequence
    that contains an ``"input_ids"`` entry: the tokenizer's output or a plain
    dict both work.
    """

    def __init__(self, encodings):
        """Keep a reference to ``encodings``; nothing is copied."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per encoding field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, i.e. the length of ``input_ids``."""
        # Key lookup (not attribute access) so that plain dicts are supported,
        # consistent with the mapping interface used by __getitem__.
        return len(self.encodings["input_ids"])
# import requests
# API_URL = "https://api-inference.huggingface.co/models/Ateeb/QA"
# headers = {"Authorization": "Bearer api_DHnvjPKdjmjkmEYQubgvmIKJqWaNNYljaF"}
# def query(payload):
# data = json.dumps(payload)
# response = requests.request("POST", API_URL, headers=headers, data=data)
# return json.loads(response.content.decode("utf-8"))
# data = query(
# {
# "inputs": {
# "question": "What is my name?",
# "context": "My name is Clara and I live in Berkeley.",
# }
# }
# )
# print(data)