Maverick17
committed on
Update README.md
Added finetuning script description
README.md
CHANGED
This model is a fine-tuned version of [HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) on the [Agent-Eval-Refine/GUI-Dense-Descriptions](https://huggingface.co/datasets/Agent-Eval-Refine/GUI-Dense-Descriptions) dataset.
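Each record in that dataset pairs a GUI screenshot with a dense natural-language description; the fine-tuning script below consumes exactly those `image` and `text` fields. As a quick orientation, here is a minimal sketch (it assumes nothing beyond those two fields) for peeking at one training example:

```python
from datasets import load_dataset

# Load the training split and inspect a single record.
ds = load_dataset("Agent-Eval-Refine/GUI-Dense-Descriptions", split="train")
example = ds[0]

print(example["image"].size)   # PIL screenshot, (width, height)
print(example["text"][:300])   # start of the dense GUI description
```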
## Finetuning script

```python
# !pip install git+https://github.com/andimarafioti/transformers.git@e1b7c0a05ab65e4ddb62a407fe12f8ec13a916f0
# !pip install accelerate datasets peft bitsandbytes
# !pip install flash-attn --no-build-isolation

import pandas as pd
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Idefics3ForConditionalGeneration,
)
import os
from PIL import Image
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from huggingface_hub import notebook_login

notebook_login()

gui_dense_desc_dataset = load_dataset("Agent-Eval-Refine/GUI-Dense-Descriptions")
train_ds = gui_dense_desc_dataset["train"]

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

USE_LORA = False
USE_QLORA = True
model_id = "HuggingFaceM4/Idefics3-8B-Llama3"

processor = AutoProcessor.from_pretrained(model_id)

if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=[
            "down_proj",
            "o_proj",
            "k_proj",
            "q_proj",
            "gate_proj",
            "up_proj",
            "v_proj",
        ],
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian",
    )
    lora_config.inference_mode = False
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config if USE_QLORA else None,
        _attn_implementation="flash_attention_2",
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    print(model.get_nb_trainable_parameters())
else:
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2",
        device_map="auto",
    )

# if you'd like to fine-tune only the LLM, freeze the vision encoder
for param in model.model.vision_model.parameters():
    param.requires_grad = False

image_token_id = processor.tokenizer.additional_special_tokens_ids[
    processor.tokenizer.additional_special_tokens.index("<image>")
]


def collate_fn(examples):
    texts = []
    images = []
    for example in examples:
        image = example["image"]
        image_description = example["text"]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {
                        "type": "text",
                        "text": "Provide a detailed description of the image.",
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": image_description}],
            },
        ]
        text = processor.apply_chat_template(messages, add_generation_prompt=False)
        texts.append(text.strip())
        images.append([image])

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch

training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=5,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    optim="adamw_torch",
    bf16=True,
    output_dir="./idefics3-llama-gui-dense-descriptions",
    hub_model_id="idefics3-llama-gui-dense-descriptions",
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_ds,
)

trainer.train()

trainer.push_to_hub()
```
Training took approximately 40 minutes on 2x H100 GPUs (80 GB each).
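`trainer.push_to_hub()` uploads the LoRA adapter under the `hub_model_id` configured above. If you prefer a standalone checkpoint, one option is to merge the adapter into the base weights with PEFT; the snippet below is only a sketch and assumes an adapter repo id of `idefics3-llama-gui-dense-descriptions` and an illustrative output directory (adjust both to your setup):

```python
import torch
from peft import PeftModel
from transformers import AutoProcessor, Idefics3ForConditionalGeneration

base_id = "HuggingFaceM4/Idefics3-8B-Llama3"
adapter_id = "idefics3-llama-gui-dense-descriptions"  # illustrative; use your <user>/<repo> on the Hub

# Load the base model in bf16 and attach the trained LoRA adapter.
base = Idefics3ForConditionalGeneration.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base, adapter_id)

# Fold the adapter into the base weights and save a standalone checkpoint.
merged = model.merge_and_unload()
merged.save_pretrained("./idefics3-llama-gui-dense-descriptions-merged")
AutoProcessor.from_pretrained(base_id).save_pretrained("./idefics3-llama-gui-dense-descriptions-merged")
```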
## Intended usage
```python