import numpy as np
import torch
import tiktoken
from torch import nn

# BPE tokenizer; cl100k_base treats '<|endoftext|>' as a special token
tokenizer = tiktoken.get_encoding("cl100k_base")
def text_to_token_ids(text, tokenizer):
    """Encode text into a (1, num_tokens) tensor of token IDs."""
    # allowed_special lets '<|endoftext|>' be encoded as its special token ID
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Add a batch dimension so the tensor can be fed directly to a model
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor
def token_ids_to_text(token_ids, tokenizer):
    """Decode a (1, num_tokens) tensor of token IDs back into text."""
    flat = token_ids.squeeze(0)  # drop the batch dimension
    return tokenizer.decode(flat.tolist())
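
# Minimal round-trip sketch (illustrative, not part of the original listing):
# encode a hypothetical sample string, then decode it back to text.
if __name__ == "__main__":
    sample_text = "Every effort moves you <|endoftext|>"
    token_ids = text_to_token_ids(sample_text, tokenizer)
    print(token_ids.shape)                          # torch.Size([1, num_tokens])
    print(token_ids_to_text(token_ids, tokenizer))  # recovers the original text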