# DhruvaVeena_Mark-1 / tokenizer.py
# AdityaBDhruva's picture
# Rename handler.py to tokenizer.py
# b0cfec1 verified
import numpy as np
import torch
import tiktoken
from torch import nn
# Module-level encoder built from tiktoken's "cl100k_base" encoding.
# NOTE: the helper functions in this file take their own `tokenizer`
# parameter, which shadows this global inside their bodies.
tokenizer = tiktoken.get_encoding("cl100k_base")
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return its token ids as a batch of one.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``
            (e.g. a tiktoken encoding) that returns a flat sequence of
            integer token ids.

    Returns:
        A ``torch.long`` tensor with a leading batch axis of size one,
        i.e. shape ``(1, num_tokens)`` for a flat id sequence.
    """
    # '<|endoftext|>' is the only special token allowed through the encoder.
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype explicitly: torch.tensor([]) infers float32, so empty
    # text would otherwise produce a float tensor instead of integer ids.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into text.

    Args:
        token_ids: Tensor of token ids. Accepts a batch of one
            ``(1, num_tokens)``, a 1-D tensor ``(num_tokens,)``, or a 0-d
            tensor holding a single id.
        tokenizer: Object exposing ``decode(ids)`` where ``ids`` is a flat
            list of integer token ids (e.g. a tiktoken encoding).

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat id list.
    """
    if token_ids.dim() > 1:
        # Drop the leading batch axis (only removed when its size is 1).
        flat = token_ids.squeeze(0)
    else:
        # Never squeeze a 1-D tensor: a single-token input of shape (1,)
        # would collapse to 0-d and .tolist() would return a bare int
        # instead of a list. reshape(-1) also lifts a 0-d tensor to 1-D.
        flat = token_ids.reshape(-1)
    return tokenizer.decode(flat.tolist())