from transformers import AutoConfig
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from utils import SPECIAL_TOKENS, build_input_from_segments, add_special_tokens_
from utils import get_dataset, download_pretrained_model

import timeit

import logging
logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__file__)

import random
from itertools import chain
from pprint import pformat

import torch
import torch.nn.functional as F

import boto3
import os
import tarfile
import io
import base64
import json
import re

from types import SimpleNamespace

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print("Loading Model.py module...")

s3 = boto3.client('s3')


def is_list_of_strings(lst):
    """Return True only for a non-empty list whose elements are all strings."""
    if lst and isinstance(lst, list):
        return all(isinstance(elem, str) for elem in lst)
    return False

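# Summary of the class below (taken from the code that follows, not from any
# external spec): ServerlessModel wraps a persona-conditioned GPT-2 chat model
# for serverless inference and supports three loading paths:
#   * local checkpoint directory -> from_pretrained_local_path(model_path, file_prefix)
#   * EFS-mounted .bin weights   -> from_pretrained(..., efs_path=...) via load_model_from_efs
#   * S3 tar.gz archive          -> from_pretrained(model_path, s3_bucket, file_prefix) via load_model_from_s3
# Every path also loads a GPT2Tokenizer and a persona dataset from
# model_path/personafile.json (cached at model_path/persona_good).
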

class ServerlessModel:
    def __init__(self, model_path=None, s3_bucket=None, file_prefix=None, efs_path=None):

        print("Trying to init model")

        self.model = None
        self.tokenizer = None
        self.dataset = None

        if s3_bucket is None:
            if model_path is not None and efs_path is None:
                # Plain local checkpoint directory.
                print("Loading model from local..")
                self.model, self.tokenizer, self.dataset = self.from_pretrained_local_path(model_path, file_prefix)
                logging.debug("Done loading")
            elif model_path is not None and efs_path is not None:
                # Config lives under model_path, weights come from a .bin file on EFS.
                print("loading model from EFS")
                self.model, self.tokenizer, self.dataset = self.from_pretrained(model_path, s3_bucket, file_prefix, efs_path=efs_path)
                logging.debug("Done loading")
            else:
                print("ERROR: Model path not found")
                raise Exception("No model path found")
        else:
            # Weights are packaged as a tar.gz object in S3.
            print("Loading model from s3 path..")
            print(s3_bucket)
            self.model, self.tokenizer, self.dataset = self.from_pretrained(
                model_path, s3_bucket, file_prefix)
            logging.debug("Done loading")

        # Default decoding parameters; sample_sequence() can override them per call.
        self.parameters = {
            'max_length': 25,
            'min_length': 1,
            'device': 'cpu',
            'temperature': 1.0,
            'dynamic_temperature': True,
            'dynamic_temperature_range': 0.15,
            'top_k': 50,
            'top_p': 0.9,
            'no_sample': False,
            'max_history': 2,
        }

        print("Done initializing model")

    def from_pretrained(self, model_path: str, s3_bucket: str, file_prefix: str, efs_path=None):

        if efs_path is None:
            model = self.load_model_from_s3(model_path, s3_bucket, file_prefix)
        else:
            model = self.load_model_from_efs(model_path, efs_path)

        print("Model loaded.")
        print("loading tokenizer from path: ", model_path)

        tokenizer = self.load_tokenizer(model_path)
        tokenizer.model_max_length = 1024
        print("tokenizer loaded")

        self.model = model
        self.tokenizer = tokenizer

        # Make sure the dialogue special tokens are registered on both model and tokenizer.
        add_special_tokens_(self.model, self.tokenizer)

        DATASET_PATH = model_path + '/personafile.json'
        DATASET_CACHE = model_path + '/persona_good'

        dataset = self.load_dataset(DATASET_PATH, DATASET_CACHE)
        self.dataset = dataset

        print("dataset loaded")
        model.eval()
        print("Model in eval mode, dataset and tokenizer also loaded")
        return model, tokenizer, dataset
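
    # Expected contents of model_path, inferred from the loaders above (not from a
    # packaging spec): config.json, the GPT2Tokenizer files (vocab.json / merges.txt),
    # and personafile.json with persona_good as its dataset cache. The .bin weights
    # themselves come from the local directory, EFS, or the S3 archive depending on
    # which loader is used.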

    def load_model_from_path(self, model_path: str):
        print("Loading model from path:", model_path)
        model = GPT2LMHeadModel.from_pretrained(model_path)
        model.eval()
        self.model = model
        return model

    def from_pretrained_local_path(self, model_path: str, file_prefix: str):
        print("Local model loading...")
        model = GPT2LMHeadModel.from_pretrained(model_path)
        tokenizer = self.load_tokenizer(model_path)

        self.model = model
        self.tokenizer = tokenizer

        tokenizer.model_max_length = 1024
        add_special_tokens_(model, tokenizer)

        DATASET_PATH = model_path + '/personafile.json'
        DATASET_CACHE = model_path + '/persona_good'

        dataset = self.load_dataset(DATASET_PATH, DATASET_CACHE)
        self.dataset = dataset

        model.eval()
        print("Model in eval mode, dataset and tokenizer also loaded")
        return model, tokenizer, dataset

    def load_model_from_efs(self, model_path: str, efs_path: str):
        if model_path and efs_path:
            # Config comes from the local model_path; the weights are read from the EFS mount.
            config = AutoConfig.from_pretrained(f'{model_path}/config.json')
            with open(efs_path, 'rb') as f:
                state = torch.load(io.BytesIO(f.read()),
                                   map_location=lambda storage, loc: storage)

            '''alt
            with open(efs_path, 'rb') as f:
                state = pickle.load(f, encoding='latin1')
            '''
            model = GPT2LMHeadModel.from_pretrained(
                pretrained_model_name_or_path=None, state_dict=state, config=config)
            return model
        else:
            raise KeyError('No model config path or EFS bin path')

    def load_model_from_s3(self, model_path: str, s3_bucket: str, file_prefix: str):
        if model_path and s3_bucket and file_prefix:
            obj = s3.get_object(Bucket=s3_bucket, Key=file_prefix)
            bytestream = io.BytesIO(obj['Body'].read())

            tar = tarfile.open(fileobj=bytestream, mode="r:gz")
            config = AutoConfig.from_pretrained(f'{model_path}/config.json')
            model = None
            for member in tar.getmembers():
                # Skip macOS AppleDouble metadata entries ("./._...").
                if member.name.startswith("./._"):
                    continue
                if member.name.endswith(".bin"):
                    f = tar.extractfile(member)
                    print("Model file extracted: " + member.name)

                    state = torch.load(io.BytesIO(f.read()),
                                       map_location=lambda storage, loc: storage)
                    model = GPT2LMHeadModel.from_pretrained(
                        pretrained_model_name_or_path=None, state_dict=state, config=config)

            if model is None:
                raise FileNotFoundError('No .bin state dict found in the S3 archive')
            return model
        else:
            raise KeyError('No S3 Bucket and Key Prefix provided')
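
    # Assumption about the S3 artifact (taken from the loop above, not verified
    # against any packaging script): the object at file_prefix is a .tar.gz that
    # contains the pytorch_model.bin state dict, possibly alongside macOS "._"
    # metadata entries, which are skipped. If the archive holds several .bin
    # files, the last one found wins.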

    def load_tokenizer(self, model_path: str):
        print("loading tokenizer")
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        return tokenizer

    def load_dataset(self, DATASET_PATH: str, DATASET_CACHE: str, use_efs=False):
        print("loading dataset")
        dataset = get_dataset(self.tokenizer, DATASET_PATH, DATASET_CACHE)
        return dataset

    def encode(self, question, context):
        encoded = self.tokenizer.encode_plus(question, context)
        return encoded["input_ids"], encoded["attention_mask"]

    def decode(self, token):
        answer_tokens = self.tokenizer.convert_ids_to_tokens(
            token, skip_special_tokens=True)
        return self.tokenizer.convert_tokens_to_string(answer_tokens)

    def generate_word(self, text, model=None, tokenizer=None, noprint=False):
        if model is None or tokenizer is None:
            print("ERROR: No model or tokenizer")
            return None

        inputs = tokenizer(text, return_tensors="pt")

        # Greedy single-step prediction: take the argmax over the logits at the last position.
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss, logits = outputs[:2]
        predicted_index = torch.argmax(logits[0, -1, :]).item()
        predicted_text = tokenizer.decode([predicted_index])

        if not noprint:
            print('input text:', text)
            print('predicted text:', predicted_text)

        return predicted_text

    def top_filtering(self, logits, top_k=0., top_p=0.9, threshold=-float('Inf'), filter_value=-float('Inf')):
        """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
            Args:
                logits: logits distribution shape (vocabulary size)
                top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
                top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                    whose total probability mass is greater than or equal to the threshold top_p.
                    In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                    the threshold top_p.
                threshold: a minimal threshold to keep logits
        """
        assert logits.dim() == 1  # operates on a single (unbatched) logits vector
        top_k = min(top_k, logits.size(-1))
        if top_k > 0:
            # Remove all tokens whose logit is below the k-th largest logit.
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = filter_value

        if top_p > 0.0:
            # Compute cumulative probabilities over the sorted tokens.
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Mask tokens whose cumulative probability exceeds top_p.
            sorted_indices_to_remove = cumulative_probabilities > top_p
            # Shift the mask right so the first token that crosses the threshold is kept.
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            # Map back to the original (unsorted) indices and filter them out.
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[indices_to_remove] = filter_value

        indices_to_remove = logits < threshold
        logits[indices_to_remove] = filter_value

        return logits
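
    # Tiny worked example of the filtering above (illustrative numbers, not model
    # output): suppose the softmax over four tokens is [0.50, 0.30, 0.15, 0.05].
    # With top_k=2 only the first two tokens survive. With top_p=0.9 the cumulative
    # mass is [0.50, 0.80, 0.95, 1.00]; the mask flags tokens 3 and 4, but after the
    # right-shift only token 4 is removed, so tokens 1-3 remain candidates.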

    def sample_sequence(self, personality, history, tokenizer, model, params=None, current_output=None):

        start = timeit.default_timer()

        # Per-call overrides of the default decoding parameters.
        if params is not None:
            for k, v in params.items():
                self.parameters[k] = v

        special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
        if current_output is None:
            current_output = []

        for i in range(self.parameters['max_length']):

            # Rebuild the full input (persona + history + tokens generated so far) at every step.
            instance = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False)

            input_ids = torch.tensor(instance["input_ids"], device=self.parameters['device']).unsqueeze(0)
            token_type_ids = torch.tensor(instance["token_type_ids"], device=self.parameters['device']).unsqueeze(0)

            logits = model(input_ids, token_type_ids=token_type_ids)
            if isinstance(logits, tuple):  # some model versions return a (logits, ...) tuple
                logits = logits[0]

            # Optionally jitter the temperature on every step to vary the outputs.
            if self.parameters['dynamic_temperature']:
                rand_range = random.uniform(-1 * self.parameters['dynamic_temperature_range'],
                                            self.parameters['dynamic_temperature_range'])
                temperature = self.parameters['temperature'] + rand_range
            else:
                temperature = self.parameters['temperature']

            logits = logits[0, -1, :] / temperature
            logits = self.top_filtering(logits, top_k=self.parameters['top_k'], top_p=self.parameters['top_p'])
            probs = F.softmax(logits, dim=-1)

            prev = torch.topk(probs, 1)[1] if self.parameters['no_sample'] else torch.multinomial(probs, 1)

            # Before min_length is reached, re-sample until we get a non-special token.
            if i < self.parameters['min_length'] and prev.item() in special_tokens_ids:
                while prev.item() in special_tokens_ids:
                    if probs.max().item() == 1:
                        warnings.warn("Warning: model generating special token with probability 1.")
                        break
                    prev = torch.multinomial(probs, num_samples=1)

            if prev.item() in special_tokens_ids:
                break
            current_output.append(prev.item())

        stop = timeit.default_timer()

        return current_output

    def dump_personalities_with_movies(self):
        """Return [name, movie names] pairs for every dialog in the training set."""
        personalities = [[dialog["name"], dialog["moviename"]] for dialog in self.dataset["train"]]
        name_list = []
        for person in personalities:
            try:
                name_tokenized = person[0]
                name = self.tokenizer.decode(name_tokenized)
                movies_tokenized = person[1]
                movienames = ""

                # A single movie is stored as a flat list of token ids; several movies
                # arrive as a list of token-id lists.
                if isinstance(movies_tokenized[0], int):
                    movienames = self.tokenizer.decode(movies_tokenized)
                    movienames = movienames.replace(".txt", "")
                else:
                    for movie in movies_tokenized:
                        moviename = self.tokenizer.decode(movie)
                        moviename = moviename.replace(".txt", "")
                        movienames = movienames + " / " + moviename
                name_list.append([name, movienames])
            except Exception:
                print("Could not do name:", self.tokenizer.decode(person[0]))

        return name_list

    def dump_personalities(self, as_list=False):
        personalities = [dialog["personality"] for dialog in self.dataset["train"]]
        name_list = []
        for person in personalities:
            # The last persona sentence reads "My name is <name>."; strip the prefix and the final period.
            name_tokenized = person[-1]
            name = self.tokenizer.decode(name_tokenized)
            name = name.replace("My name is ", "")[:-1]
            name_list.append(name)

        if as_list:
            return name_list
        return " | ".join(name_list)

    def get_personalities(self):
        personalities = [dialog["personality"] for dialog in self.dataset["train"]]

        # Take the last few tokens of the final persona sentence of each dialog, which hold the name.
        people = [item[-1][-10:-1] for item in personalities]
        people_list = self.tokenizer.decode(chain(*people))

        text_to_remove = "My name is "
        people_list = people_list.replace(text_to_remove, " | ")

        return people_list

    def select_personality(self, characters, select_random=False):

        personalities = [dialog["personality"] for dialog in self.dataset["train"]]

        if select_random:
            return random.choice(personalities)

        personality = None

        # Look for the dialog whose last persona sentence ends with "My name is <characters>".
        name = "My name is " + str(characters)
        name_token = self.tokenizer.encode(name)

        index_start = len(name_token) + 1

        try:
            index_of_name = [item[-1][-1 * index_start: -1] == name_token for item in personalities].index(True)
            personality = personalities[index_of_name]
        except Exception:
            print("Not found ... Select again")
            return None

        print("Selected personality:", self.tokenizer.decode(chain(*personality)))

        return personality

    def get_answer(self, input_text, personality, history, params=None):

        # History arrives as plain strings; encode each turn to token ids.
        if len(history) > 0:
            new_hist = []
            for ele in history:
                new_hist.append(self.tokenizer.encode(ele))
            history = new_hist.copy()

        history.append(self.tokenizer.encode(input_text))

        with torch.no_grad():
            out_ids = self.sample_sequence(personality, history, self.tokenizer, self.model, params=params)
        history.append(out_ids)
        # Keep only the last max_history exchanges (two turns each) plus the new reply.
        history = history[-(2 * self.parameters['max_history'] + 1):]
        out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True)

        history_decoded = []
        for ele in history:
            history_decoded.append(self.tokenizer.decode(ele))

        return out_text, history_decoded, self.parameters

    def predict(self, question, parameter_dict):
        # generate_text is expected to be provided elsewhere; it is not defined in this class.
        try:
            answer = self.generate_text(question, model=self.model,
                                        tokenizer=self.tokenizer,
                                        parameter_dict=parameter_dict,
                                        )
            return answer
        except Exception as e:
            raise Exception(
                "Runtime error see cloudwatch logs : {}".format(repr(e)))
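

# A minimal local smoke test, assuming a checkpoint directory at ./model_dir that
# contains config.json, the tokenizer files, the .bin weights and personafile.json.
# The ./model_dir path is a placeholder, not something this module ships.
if __name__ == "__main__":
    chat_model = ServerlessModel(model_path="./model_dir")
    persona = chat_model.select_personality(None, select_random=True)
    history = []
    reply, history, used_params = chat_model.get_answer("Hello, who are you?", persona, history)
    print(reply)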