In [7]:
!pip install protobuf

Collecting protobuf
  Downloading protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Downloading protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m294.6/294.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: protobuf
Successfully installed protobuf-4.25.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
from datasets import load_dataset

dataset = load_dataset("cekal/llama2-orion")

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 8277
    })
})

In [3]:
print(dataset["train"][0]["text"])

# Name: <|im_start|>user
Orion, Can you please summarize all the theatre awards that shellie shulkin had won. She is a friend of mine, I want to have correct information. Thanks Please write in English language.<|im_end|> <|im_start|>assistant
# Overview of Awards Won by Shellie Shulkin in Theatre

Unfortunately, I was unable to find specific information about the theatre awards won by Shellie Shulkin. This might be due to the fact that such specific information could not be well-documented or publicly accessible online, especially if Shellie Shulkin is a local or non-commercial theatre artist.

However, I encourage you to reach out directly to Shellie Shulkin or those who work closely with her as they would likely have the most accurate and up-to-date information regarding the accolades she has achieved in theatre. While I strive to provide accurate and comprehensive responses, it's possible that some specialized or more obscure information is not readily available in the public domai

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

modelpath="152334H/miqu-1-70b-sf"

# Load 4-bit quantized model
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    use_auth_token=True,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
)

# Load (slow) Tokenizer, fast tokenizer sometimes ignores added tokens
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-hf", use_fast=False, use_auth_token=True)   

# Add tokens <|im_start|> and <|im_end|>, latter is special eos token 
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id



Loading checkpoint shards:   0%|          | 0/29 [00:00<?, ?it/s]



In [5]:
# Add LoRA adapters to model
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1, 
    bias="none", 
    modules_to_save = ["lm_head", "embed_tokens"],		# needed because we added new tokens to tokenizer/model
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.config.use_cache = False

In [6]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(32002, 8192)
          (modules_to_save): ModuleDict(
            (default): Embedding(32002, 8192)
          )
        )
        (layers): ModuleList(
          (0-79): 80 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=8192, out_features=8192, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=8192, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=8192, bias=False)
                )
                (lora_embedding_A): Paramet

In [7]:
import os 

def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=2048,
        add_special_tokens=False,
    )

dataset_tokenized = dataset.map(
    tokenize, 
    batched=True, 
    num_proc=os.cpu_count(),    # multithreaded
    remove_columns=["text"]     # don't need this anymore, we have tokens from here on
)

In [8]:
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 8277
    })
})

In [9]:
# define collate function - transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to single batch dictionary { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):
    tokenlist=[e["input_ids"] for e in elements]
    tokens_maxlen=max([len(t) for t in tokenlist])

    input_ids,labels,attention_masks = [],[],[]
    for tokens in tokenlist:
        pad_len=tokens_maxlen-len(tokens)

        # pad input_ids with pad_token, labels with ignore_index (-100) and set attention_mask 1 where content otherwise 0
        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )   
        labels.append( tokens + [-100]*pad_len )    
        attention_masks.append( [1]*len(tokens) + [0]*pad_len ) 

    batch={
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch

In [10]:
bs=1        # batch size
ga_steps=1  # gradient acc. steps
epochs=1
steps_per_epoch=len(dataset_tokenized["train"])//(bs*ga_steps)

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch,		# eval and save once per epoch  	
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=0.0002,
    group_by_length=True,
    fp16=True,
    ddp_find_unused_parameters=False,
)

In [11]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=collate,
    train_dataset=dataset_tokenized["train"],
    args=args,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss


ValueError: Trainer: evaluation requires an eval_dataset.

In [9]:
import json

def add_tokens(data):
    for conversation_object in data:
        # Obtain the name if present, else use blank string
        name = conversation_object.get('name', '') 

        for i, conversation in enumerate(conversation_object['conversations']):
            if conversation['from'] == 'human':
                if i == 0: # If it's the first message from the human
                    conversation['value'] = f"# Name: {name}<|im_start|>user\n" + conversation['value'] + "<|im_end|>\n"
                else:
                    conversation['value'] = "<|im_start|>user\n" + conversation['value'] + "<|im_end|>\n"
            elif conversation['from'] == 'gpt':
                conversation['value'] = "<|im_start|>assistant\n" + conversation['value'] + "<|im_end|>\n"
    return data

def main():
    # Opening the input file and loading the json data 
    with open('orion.json', 'r') as in_file:
        data = json.load(in_file)

    # Adding the special tokens
    data = add_tokens(data)

    # Saving the modified data into the output.json file
    with open('oorion.json', 'w') as out_file:
        json.dump(data, out_file, indent=4)

if __name__ == '__main__':
    main()

In [10]:
import json

def process_json_object(json_obj):
    """ Process a single JSON object to the required format. """
    formatted_text = ""

    # Check if 'conversations' is in json_obj and is a list
    if "conversations" in json_obj and isinstance(json_obj["conversations"], list):
        for conversation in json_obj["conversations"]:
            # Use .get() for safer access
            speaker = "" if conversation.get("from") == "human" else ""
            formatted_value = conversation.get("value", "").replace("\n\n","\n\n").strip()
            # Add space after each piece of conversation
            formatted_text += speaker + formatted_value + " "
    else:
        return None  # If structure is not as expected, return None

    # Remove trailing spaces
    formatted_text = formatted_text.rstrip(" ")
    # Wrap the formatted text in a dictionary
    return {"text": formatted_text}

# Paths to your files
input_file_path = 'oorion.json'
output_file_path = 'train-orion.json'

# Process and write output based on data structure
with open(input_file_path, 'r', encoding='utf-8') as infile, \
     open(output_file_path, 'w', encoding='utf-8') as outfile:

    data = json.load(infile)

    # Adjust the loop to correctly handle whether data is a list or a single dict
    if isinstance(data, list):
        # If data is a list, process each item in the list
        for json_obj in data:
            formatted_line = process_json_object(json_obj)
            if formatted_line is not None:
                outfile.write(json.dumps(formatted_line, ensure_ascii=False))  # Changed to False for better encoding
                outfile.write('\n')
    elif isinstance(data, dict):
        # If data is a single dict, process it directly
        formatted_line = process_json_object(data)
        if formatted_line is not None:
            outfile.write(json.dumps(formatted_line, ensure_ascii=False))  # Changed to False for better encoding
            outfile.write('\n')

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set model ID and load configuration
base_model_id = "152334H/miqu-1-70b-sf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer and add special tokens
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf", 
    add_bos_token=True, 
    trust_remote_code=True, 
    use_auth_token=True
)
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

# Load base model and adjust token embeddings size
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  
    quantization_config=bnb_config,  
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)
base_model.resize_token_embeddings(len(tokenizer))

# Configure EOS token ID
base_model.config.eos_token_id = tokenizer.eos_token_id

# Import Peft function and create fine-tuned model
from peft import PeftModel
ft_model = PeftModel.from_pretrained(base_model, "out/Orion-3")



Loading checkpoint shards:   0%|          | 0/29 [00:00<?, ?it/s]

In [17]:
import sys
import bitsandbytes #necessary to fit in colab
import accelerate #necessary to fit in colab
import torch
from transformers import AutoTokenizer, TextStreamer, GenerationConfig, AutoModelForCausalLM
streamer = TextStreamer(tokenizer)
eval_prompt = """# Name: <|im_start|>user\nExplain quantum computing in simple terms<|im_end|> <|im_start|>assistant\n"""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    # Using Dgenerate for streaming the output
    generated_tokens = ft_model.generate(
        **model_input, 
        max_new_tokens=4096,
        temperature=0.7,  # Adjusted for balance
        do_sample=True,
        eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>"),
        streamer=streamer  # Ensure 'streamer' is defined and appropriate for your setup
    )

    # Decode and print the generated tokens
    print(tokenizer.decode(generated_tokens, skip_special_tokens=True))

<s> # Name:<|im_start|>user
Explain quantum computing in simple terms<|im_end|> <|im_start|>assistant
# Quantum Computing Simplified

Quantum Computing is a branch of technology that leverages the principles of quantum mechanics to process information. It's an exciting, yet complex, field that has the potential to revolutionize many aspects of digital technology.

---

## Key Concepts of Quantum Computing

### 1. Quantum Bits (Qubits)

Just as classical computers use bits (0s and 1s) to process information, quantum computers use a fundamental unit called a quantum bit, or **qubit**. However, a qubit can be in multiple states at once, thanks to a property known as **superposition**.

> "Qubits can be both 0 and 1 at the same time."

### 2. Quantum Entanglement

Another key principle of quantum computing is **entanglement**. When two qubits become entangled, the state of one instantly influences the state of the other, no matter the distance between them.

> "When two qubits become entan

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer