---
base_model: HuggingFaceM4/Idefics3-8B-Llama3
library_name: peft
license: apache-2.0
tags:
- generated_from_trainer
model-index:
- name: idefics3-llama-gui-dense-descriptions
  results: []
datasets:
- Agent-Eval-Refine/GUI-Dense-Descriptions
language:
- en
---

# idefics3-llama-gui-dense-descriptions

This model is a fine-tuned version of [HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) on the [Agent-Eval-Refine/GUI-Dense-Descriptions](https://huggingface.co/datasets/Agent-Eval-Refine/GUI-Dense-Descriptions) dataset.

## Finetuning script

```python
# !pip install git+https://github.com/andimarafioti/transformers.git@e1b7c0a05ab65e4ddb62a407fe12f8ec13a916f0
# !pip install accelerate datasets peft bitsandbytes
# !pip install flash-attn --no-build-isolation

import pandas as pd
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Idefics3ForConditionalGeneration,
)
import os
from PIL import Image
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; trainer.push_to_hub() is called at the end.
notebook_login()

gui_dense_desc_dataset = load_dataset("Agent-Eval-Refine/GUI-Dense-Descriptions")
train_ds = gui_dense_desc_dataset["train"]

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Fine-tuning mode switches: QLoRA loads the base model 4-bit quantized and
# trains LoRA adapters; with both flags False the full model is loaded in
# bf16 and only the vision tower is frozen.
USE_LORA = False
USE_QLORA = True

model_id = "HuggingFaceM4/Idefics3-8B-Llama3"

processor = AutoProcessor.from_pretrained(model_id)

if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=[
            "down_proj",
            "o_proj",
            "k_proj",
            "q_proj",
            "gate_proj",
            "up_proj",
            "v_proj",
        ],
        # DoRA is enabled only for plain LoRA, not for QLoRA.
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian",
    )
    lora_config.inference_mode = False
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config if USE_QLORA else None,
        _attn_implementation="flash_attention_2",
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    # NOTE(review): lora_config is attached twice below -- once via
    # model.add_adapter() and again via get_peft_model(). Kept as-is because
    # this is the script that was actually run; confirm whether both are
    # intended before reusing it.
    model.add_adapter(lora_config)
    model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    print(model.get_nb_trainable_parameters())
else:
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2",
        device_map="auto",
    )

    # if you'd like to only fine-tune LLM
    for param in model.model.vision_model.parameters():
        param.requires_grad = False

# Id of the "<image>" placeholder token; used in collate_fn to exclude image
# positions from the labels.
image_token_id = processor.tokenizer.additional_special_tokens_ids[
    processor.tokenizer.additional_special_tokens.index("<image>")
]


def collate_fn(examples):
    """Turn a list of dataset examples into one training batch.

    Each example is read through its "image" and "text" keys. A two-turn
    chat (user: image + fixed instruction, assistant: the description) is
    rendered with the processor's chat template, then texts and images are
    processed together with padding.

    Returns the processor output with an added "labels" entry: a copy of
    "input_ids" in which padding and image-token positions are set to -100.
    """
    texts = []
    images = []
    for example in examples:
        image = example["image"]
        image_description = example["text"]

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {
                        "type": "text",
                        "text": "Provide a detailed description of the image.",
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": image_description}],
            },
        ]
        # The assistant turn is already part of the messages, so no
        # generation prompt is appended.
        text = processor.apply_chat_template(messages, add_generation_prompt=False)
        texts.append(text.strip())
        # One image per sample, wrapped in its own list.
        images.append([image])

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    # -100 is the label value ignored by the loss, so padding and image
    # placeholder tokens do not contribute to training.
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch


training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=5,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    optim="adamw_torch",
    bf16=True,
    output_dir="./idefics3-llama-gui-dense-descriptions",
    hub_model_id="idefics3-llama-gui-dense-descriptions",
    # Keep the raw "image"/"text" columns: collate_fn reads them directly.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_ds,
)

trainer.train()
trainer.push_to_hub()
```

Training took approx. 40 min. on 2xH100 (80 GB each) devices.

## Intended usage

```python
from peft import PeftModel
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
from transformers.image_utils import load_image
import torch

adapter_path = "Maverick17/idefics3-llama-gui-dense-descriptions"
base_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"

# Load the base model
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_id,
    _attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Merge LoRA and base model
peft_model = PeftModel.from_pretrained(model, adapter_path)
merged_model = peft_model.merge_and_unload()

processor = AutoProcessor.from_pretrained(base_model_id)

image = load_image("path/to/ui/image.png")

# Create inputs
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": "Provide a detailed description of the image.",
            },
        ],
    },
]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

generation_args = {
    "max_new_tokens": 1024,
    "repetition_penalty": 1,
}

generation_args["do_sample"] = False
generation_args.update(inputs)

# Generate with the merged model
generated_ids = merged_model.generate(**generation_args)

# Decode only the newly generated tokens (everything after the prompt).
generated_texts = processor.batch_decode(
    generated_ids[:, generation_args["input_ids"].size(1) :], skip_special_tokens=True
)

print(generated_texts[0].strip())
```

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 0.0001
- train_batch_size: 2
- eval_batch_size: 8
- seed: 42
- gradient_accumulation_steps: 8
- total_train_batch_size: 16
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 50
- num_epochs: 1

### Framework versions

- PEFT 0.13.0
- Transformers 4.44.0.dev0
- Pytorch 2.4.1+cu121
- Datasets 3.0.1
- Tokenizers 0.19.1