In [1]:
import torch

In [2]:
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('cerebras/Cerebras-GPT-2.7B')
tokenizer.pad_token_id = 0

In [13]:
import datasets
dataset = datasets.load_dataset('json', data_files='alpaca_data_cleaned.json')

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-8d265dbd6f34ccd3/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 51942
    })
})


In [15]:
cutoff_len = 512

def generate_prompt(entry):
    if entry['input']:
        return f"User: {entry['instruction']}: {entry['input']}\n\nAssistant: {entry['output']}"
    else:
        return f"User: {entry['instruction']}\n\nAssistant: {entry['output']}"

def tokenize(item, add_eos_token=True):
    result = tokenizer(
        generate_prompt(item),
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )

    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < cutoff_len
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

In [16]:
train_val = dataset["train"].train_test_split(test_size=0.2, shuffle=True, seed=42)
train_data = train_val["train"].shuffle().map(tokenize)
val_data = train_val["test"].shuffle().map(tokenize)

Map:   0%|          | 0/41553 [00:00<?, ? examples/s]

Map:   0%|          | 0/10389 [00:00<?, ? examples/s]

In [18]:
if 'model' in globals(): 
    del model
    torch.cuda.empty_cache()

model = transformers.AutoModelForCausalLM.from_pretrained(
    'cerebras/Cerebras-GPT-2.7B',    
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map={'': 0}
)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /root/miniconda3/envs/llama/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /root/miniconda3/envs/llama/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [19]:
import peft

model = peft.prepare_model_for_int8_training(model)

model = peft.get_peft_model(model, peft.LoraConfig(
    r=8,
    lora_alpha=16,
    # target_modules=["q_proj", "v_proj"],
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
))

In [20]:
import peft



model = peft.PeftModel.from_pretrained(
    model,
    # 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',
    output_dir,
    torch_dtype=torch.float16
)

ValueError: Can't find config.json at 'lora-cerebras-gpt2.7b-alpaca'

In [28]:


import os
import wandb 

output_dir = 'lora-cerebras-gpt2.7b-alpaca'

use_wandb = True,
wandb_run_name = f"{output_dir}-{wandb.util.generate_id()}"

# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]=output_dir

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=16, 
    gradient_accumulation_steps=8,  
    num_train_epochs=3,  
    learning_rate=1e-4, 
    fp16=True,
    optim="adamw_torch",
    logging_steps=10, 
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=200,
    save_steps=200,
    output_dir=output_dir, 
    save_total_limit=3,

    report_to="wandb" if use_wandb else None,
    run_name=wandb_run_name if use_wandb else None,
)

In [32]:
trainer = transformers.Trainer(
    model=model, 
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args, 
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

model.config.use_cache = False
result = trainer.train('lora-cerebras-gpt2.7b-alpaca/checkpoint-800')
model.save_pretrained(output_dir)

wandb.finish()

Step,Training Loss,Validation Loss


0,1
eval/loss,█▄▂▁
eval/runtime,▅█▄▁
eval/samples_per_second,▄▁▅█
eval/steps_per_second,▄▁▅█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▂▂▂▂▁▁
train/loss,█▃▃▂▂▂▂▂▂▂▂▁▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁█
train/train_loss,█▁

0,1
eval/loss,1.69353
eval/runtime,213.477
eval/samples_per_second,48.666
eval/steps_per_second,6.085
train/epoch,3.0
train/global_step,972.0
train/learning_rate,0.0
train/loss,1.7007
train/total_flos,4.1553623137959936e+17
train/train_loss,0.29741


In [33]:
model.config
print(model.dtype)

model.half()

torch.float16


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 2560)
        (wpe): Embedding(2048, 2560)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0): GPT2Block(
            (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): MergedLinear(
                in_features=2560, out_features=7680, bias=True
                (lora_dropout): Dropout(p=0.05, inplace=False)
                (lora_A): Linear(in_features=2560, out_features=16, bias=False)
                (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)
              )
              (c_proj): Conv1D()
              (attn_dropout): Dropout(p=0.0, inplace=False)
              (resid_dropout): Dropout(p=0.0, inplace=False)
            )
            (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        

In [35]:
text = "Human: Can I run inference on my local machine?\nAssistant:"

inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"].to(model.device)

generation_config = transformers.GenerationConfig(
    max_new_tokens=100,
    temperature=0.2,
    top_p=0.75,
    top_k=50,
    repetition_penalty=1.2,
    do_sample=True,
    early_stopping=True,
#     num_beams=5,
    
    pad_token_id=model.config.pad_token_id,
    eos_token_id=model.config.eos_token_id,
)

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        generation_config=generation_config
    )[0].cuda()

result = tokenizer.decode(output, skip_special_tokens=True).strip()
print(result)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Human: Can I run inference on my local machine?
Assistant: Yes, you can. You should be able to use the same model and data as your local machine for inference. The only difference is that you will need to download the necessary packages from the cloud or install them locally.
