Configuration Parsing
Warning:
In adapter_config.json: "peft.task_type" must be a string
idefics3-llama-gui-dense-descriptions
This model is a fine-tuned version of HuggingFaceM4/Idefics3-8B-Llama3 on the GUI-Dense-Descriptions dataset (https://huggingface.co/datasets/Agent-Eval-Refine/GUI-Dense-Descriptions).
Finetuning script
# !pip install git+https://github.com/andimarafioti/transformers.git@e1b7c0a05ab65e4ddb62a407fe12f8ec13a916f0
# !pip install accelerate datasets peft bitsandbytes
# !pip install flash-attn --no-build-isolation
import pandas as pd
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import (
AutoProcessor,
BitsAndBytesConfig,
Idefics3ForConditionalGeneration,
)
import os
from PIL import Image
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; the script pushes the result to the Hub at the end.
notebook_login()

# Optional GPU pinning, left disabled. Uncomment to restrict the visible devices.
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Fine-tuning mode switches: if both are False the model is loaded without adapters.
USE_LORA, USE_QLORA = False, True

# Base checkpoint and its matching processor.
model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
processor = AutoProcessor.from_pretrained(model_id)

# Training data: only the "train" split of the dataset is used.
gui_dense_desc_dataset = load_dataset("Agent-Eval-Refine/GUI-Dense-Descriptions")
train_ds = gui_dense_desc_dataset["train"]
# Build the model to train. With either adapter flag set, the base checkpoint is
# loaded (4-bit quantized when USE_QLORA) and wrapped with LoRA adapters;
# otherwise the checkpoint is loaded as-is and its vision tower is frozen.
if USE_LORA or USE_QLORA:
    # Projection layers that receive LoRA adapters.
    lora_target_modules = [
        "down_proj",
        "o_proj",
        "k_proj",
        "q_proj",
        "gate_proj",
        "up_proj",
        "v_proj",
    ]
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=lora_target_modules,
        # DoRA is switched on only for the non-quantized LoRA path.
        use_dora=not USE_QLORA,
        init_lora_weights="gaussian",
    )
    lora_config.inference_mode = False

    # 4-bit NF4 quantization with double quantization; only built for QLoRA.
    bnb_config = (
        BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        if USE_QLORA
        else None
    )

    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        _attn_implementation="flash_attention_2",
        quantization_config=bnb_config,
    )

    # NOTE(review): the same LoRA config is attached via add_adapter() and then
    # passed again to get_peft_model(). The original call order is kept as-is;
    # confirm whether both steps are needed.
    model.add_adapter(lora_config)
    model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    print(model.get_nb_trainable_parameters())
else:
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2",
    )
    # Freeze the vision tower's parameters so they receive no gradient updates.
    model.model.vision_model.requires_grad_(False)
# Look up the id of the "<image>" placeholder among the tokenizer's additional
# special tokens; collate_fn uses it to mask image positions out of the labels.
_image_token_pos = processor.tokenizer.additional_special_tokens.index("<image>")
image_token_id = processor.tokenizer.additional_special_tokens_ids[_image_token_pos]
def collate_fn(examples):
    """Turn a list of dataset rows into a processor batch with training labels.

    Each row must provide an ``"image"`` and a ``"text"`` entry. The text is
    used as the assistant's answer to a fixed "describe the image" user turn.
    Labels are a copy of ``input_ids`` in which padding tokens and image
    placeholder tokens are replaced by ``-100``.

    Relies on the module-level ``processor`` and ``image_token_id``.
    """

    def _conversation(description):
        # One user turn (image + fixed instruction) followed by the target answer.
        user_turn = {
            "role": "user",
            "content": [
                {"type": "image"},
                {
                    "type": "text",
                    "text": "Provide a detailed description of the image.",
                },
            ],
        }
        assistant_turn = {
            "role": "assistant",
            "content": [{"type": "text", "text": description}],
        }
        return [user_turn, assistant_turn]

    # The processor is called with one list of images per sample.
    images = [[row["image"]] for row in examples]
    texts = [
        processor.apply_chat_template(
            _conversation(row["text"]), add_generation_prompt=False
        ).strip()
        for row in examples
    ]

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    # Replace padding and image placeholder positions with -100 in the labels.
    labels = batch["input_ids"].clone()
    for ignored_id in (processor.tokenizer.pad_token_id, image_token_id):
        labels[labels == ignored_id] = -100
    batch["labels"] = labels
    return batch
# Training configuration: one epoch, 2 samples per device per step with
# 8 accumulation steps (16 samples per optimizer update per device).
training_args = TrainingArguments(
    # Where checkpoints go locally and the Hub repository name.
    output_dir="./idefics3-llama-gui-dense-descriptions",
    hub_model_id="idefics3-llama-gui-dense-descriptions",
    # Schedule and batch sizing.
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # Optimizer settings.
    optim="adamw_torch",
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=50,
    bf16=True,
    # Logging and checkpointing.
    logging_steps=5,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    # collate_fn reads the raw "image" and "text" columns, so keep them.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=collate_fn,
)

# Run fine-tuning, then upload the result to the Hub.
trainer.train()
trainer.push_to_hub()
Training took approx. 40 min. on 2xH100 (80 GB each) devices.
Intended usage
from peft import PeftModel
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
from transformers.image_utils import load_image
import torch
adapter_path = "Maverick17/idefics3-llama-gui-dense-descriptions"
base_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"

# Load the base model
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_id,
    _attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Load the LoRA adapter on top of the base model and merge its weights in
peft_model = PeftModel.from_pretrained(model, adapter_path)
merged_model = peft_model.merge_and_unload()

processor = AutoProcessor.from_pretrained(base_model_id)
image = load_image("path/to/ui/image.png")

# Create inputs: a single user turn with the image and the same instruction
# that was used during fine-tuning
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": "Provide a detailed description of the image.",
            },
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

# Greedy decoding (no sampling), at most 1024 new tokens
generation_args = {
    "max_new_tokens": 1024,
    "repetition_penalty": 1,
    "do_sample": False,
}
generation_args.update(inputs)

# Generate with the merged model, i.e. the object returned by merge_and_unload(),
# so the example does not depend on the merge having modified `model` in place
generated_ids = merged_model.generate(**generation_args)

# Drop the prompt tokens and decode only the newly generated continuation
generated_texts = processor.batch_decode(
    generated_ids[:, generation_args["input_ids"].size(1) :], skip_special_tokens=True
)
print(generated_texts[0].strip())
Training procedure
Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 0.0001
- train_batch_size: 2
- eval_batch_size: 8
- seed: 42
- gradient_accumulation_steps: 8
- total_train_batch_size: 16
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 50
- num_epochs: 1
Framework versions
- PEFT 0.13.0
- Transformers 4.44.0.dev0
- Pytorch 2.4.1+cu121
- Datasets 3.0.1
- Tokenizers 0.19.1
- Downloads last month
- 5
Model tree for Maverick17/idefics3-llama-gui-dense-descriptions
Base model
HuggingFaceM4/Idefics3-8B-Llama3