ndkhanh95 committed on
Commit
f1f0f7a
·
verified ·
1 Parent(s): 62a8d6e

Upload PaliGemma_NoJax.ipynb

Browse files
Files changed (1) hide show
  1. PaliGemma_NoJax.ipynb +339 -0
PaliGemma_NoJax.ipynb ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "f841af43-faf7-4a7b-ad55-0da226f3220f",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "C:\\Users\\user\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
+ " from .autonotebook import tqdm as notebook_tqdm\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "from datasets import load_dataset\n",
20
+ "ds = load_dataset('merve/vqav2-small')"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "id": "b47b7e33-b5eb-46ec-9e43-ed118c09b290",
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "ds = ds['validation']"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 3,
36
+ "id": "877df06d-4384-4442-a8d7-7002706b7afe",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "split_ds = ds.train_test_split(test_size=0.05) # we'll use a very small split for demo\n",
41
+ "train_ds = split_ds[\"test\"]"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 4,
47
+ "id": "870b515b-d3f5-4638-adbf-70fa39ee2ac5",
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "data": {
52
+ "text/plain": [
53
+ "Dataset({\n",
54
+ " features: ['multiple_choice_answer', 'question', 'image'],\n",
55
+ " num_rows: 1072\n",
56
+ "})"
57
+ ]
58
+ },
59
+ "execution_count": 4,
60
+ "metadata": {},
61
+ "output_type": "execute_result"
62
+ }
63
+ ],
64
+ "source": [
65
+ "train_ds"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 5,
71
+ "id": "50e42737-ff75-4c90-bdf4-012b45678292",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "from transformers import PaliGemmaProcessor\n",
76
+ "model_id = r\"D:\\PaliGemma\\paligemma-3b-pt-224\"\n",
77
+ "processor = PaliGemmaProcessor.from_pretrained(model_id)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 14,
83
+ "id": "83f6c8f7-1960-4ae9-93cc-0a2d25d0d5f4",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "import torch\n",
88
+ "device = \"cuda\"\n",
89
+ "\n",
90
+ "image_token = processor.tokenizer.convert_tokens_to_ids(\"<image>\")\n",
91
+ "def collate_fn(examples):\n",
92
+ " texts = [\"answer \" + example[\"question\"] for example in examples]\n",
93
+ " labels= [example['multiple_choice_answer'] for example in examples]\n",
94
+ " images = [example[\"image\"].convert(\"RGB\") for example in examples]\n",
95
+ " tokens = processor(text=texts, images=images, suffix=labels,\n",
96
+ " return_tensors=\"pt\", padding=\"longest\",\n",
97
+ " #tokenize_newline_separately=False\n",
98
+ " )\n",
99
+ "\n",
100
+ " tokens = tokens.to(torch.bfloat16).to(device)\n",
101
+ " return tokens\n"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 15,
107
+ "id": "3fb8260e-c333-4948-8051-c85964409660",
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "name": "stderr",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00, 4.05s/it]\n"
115
+ ]
116
+ }
117
+ ],
118
+ "source": [
119
+ "from transformers import PaliGemmaForConditionalGeneration\n",
120
+ "import torch\n",
121
+ "\n",
122
+ "model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)\n",
123
+ "\n",
124
+ "for param in model.vision_tower.parameters():\n",
125
+ " param.requires_grad = False\n",
126
+ "\n",
127
+ "for param in model.multi_modal_projector.parameters():\n",
128
+ " param.requires_grad = False"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 16,
134
+ "id": "7ae939ff-98e7-47f8-af29-8fd1ee8f237c",
135
+ "metadata": {},
136
+ "outputs": [
137
+ {
138
+ "name": "stderr",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "Unused kwargs: ['bnb_4bit_compute_type']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.\n",
142
+ "Loading checkpoint shards: 100%|██████████| 3/3 [00:22<00:00, 7.45s/it]\n"
143
+ ]
144
+ },
145
+ {
146
+ "name": "stdout",
147
+ "output_type": "stream",
148
+ "text": [
149
+ "trainable params: 11,298,816 || all params: 2,934,765,296 || trainable%: 0.3850\n"
150
+ ]
151
+ }
152
+ ],
153
+ "source": [
154
+ "from transformers import BitsAndBytesConfig\n",
155
+ "from peft import get_peft_model, LoraConfig\n",
156
+ "\n",
157
+ "bnb_config = BitsAndBytesConfig(\n",
158
+ " load_in_4bit=True,\n",
159
+ " bnb_4bit_quant_type=\"nf4\",\n",
160
+ " bnb_4bit_compute_dtype=torch.bfloat16\n",
161
+ ")\n",
162
+ "\n",
163
+ "lora_config = LoraConfig(\n",
164
+ " r=8,\n",
165
+ " target_modules=[\"q_proj\", \"o_proj\", \"k_proj\", \"v_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
166
+ " task_type=\"CAUSAL_LM\",\n",
167
+ ")\n",
168
+ "model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={\"\":0})\n",
169
+ "model = get_peft_model(model, lora_config)\n",
170
+ "model.print_trainable_parameters()\n",
171
+ "#trainable params: 11,298,816 || all params: 2,934,634,224 || trainable%: 0.38501616002417344"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 17,
177
+ "id": "7fe77639-44ab-4747-8ced-343eb06e0efd",
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": [
181
+ "import accelerate"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 18,
187
+ "id": "98b996db-e9c5-42bf-b979-fd79a28f7e5e",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "name": "stdout",
192
+ "output_type": "stream",
193
+ "text": [
194
+ "0.26.0\n"
195
+ ]
196
+ }
197
+ ],
198
+ "source": [
199
+ "print(accelerate.__version__)"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": 19,
205
+ "id": "9a6546c4-90b3-4f4d-8de0-1e020883a702",
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": [
209
+ "from transformers import TrainingArguments\n",
210
+ "args=TrainingArguments(\n",
211
+ " num_train_epochs=2,\n",
212
+ " remove_unused_columns=False,\n",
213
+ " per_device_train_batch_size=4,\n",
214
+ " gradient_accumulation_steps=4,\n",
215
+ " warmup_steps=2,\n",
216
+ " learning_rate=2e-5,\n",
217
+ " weight_decay=1e-6,\n",
218
+ " adam_beta2=0.999,\n",
219
+ " logging_steps=100,\n",
220
+ " optim=\"adamw_torch\",\n",
221
+ " save_strategy=\"steps\",\n",
222
+ " save_steps=1000,\n",
223
+ " # push_to_hub=True,\n",
224
+ " save_total_limit=1,\n",
225
+ " output_dir=\"paligemma_vqav2\",\n",
226
+ " bf16=True,\n",
227
+ " report_to=[\"tensorboard\"],\n",
228
+ " dataloader_pin_memory=False\n",
229
+ " )"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": null,
235
+ "id": "df25a8dc-8ab9-467a-b6ce-dee13addb776",
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": []
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": 20,
243
+ "id": "9a8de871-e869-4daf-a250-0aec6437f076",
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "from transformers import Trainer\n",
248
+ "\n",
249
+ "trainer = Trainer(\n",
250
+ " model=model,\n",
251
+ " train_dataset=train_ds ,\n",
252
+ " data_collator=collate_fn,\n",
253
+ " args=args\n",
254
+ " )"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": 21,
260
+ "id": "77d743e5-6b5b-40a0-a2f4-7591f2c8df50",
261
+ "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stderr",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
268
+ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
269
+ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
270
+ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
271
+ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
272
+ "C:\\Users\\user\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\models\\siglip\\modeling_siglip.py:574: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
273
+ " attn_output = torch.nn.functional.scaled_dot_product_attention(\n",
274
+ "C:\\Users\\user\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\bitsandbytes\\nn\\modules.py:452: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.\n",
275
+ " warnings.warn(\n"
276
+ ]
277
+ },
278
+ {
279
+ "ename": "KeyboardInterrupt",
280
+ "evalue": "",
281
+ "output_type": "error",
282
+ "traceback": [
283
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
284
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
285
+ "Cell \u001b[1;32mIn[21], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
286
+ "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\trainer.py:2123\u001b[0m, in \u001b[0;36mTrainer.train\u001b[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[0;32m 2121\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[0;32m 2122\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 2123\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2124\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2125\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2126\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2127\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2128\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
287
+ "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\trainer.py:2481\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[0;32m 2475\u001b[0m context \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 2476\u001b[0m functools\u001b[38;5;241m.\u001b[39mpartial(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mno_sync, model\u001b[38;5;241m=\u001b[39mmodel)\n\u001b[0;32m 2477\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(batch_samples) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 2478\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m contextlib\u001b[38;5;241m.\u001b[39mnullcontext\n\u001b[0;32m 2479\u001b[0m )\n\u001b[0;32m 2480\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m-> 2481\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 2484\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[0;32m 2485\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[0;32m 2486\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[0;32m 2487\u001b[0m ):\n\u001b[0;32m 2488\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[0;32m 
2489\u001b[0m tr_loss \u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m+\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
288
+ "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\trainer.py:3612\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[1;34m(***failed resolving arguments***)\u001b[0m\n\u001b[0;32m 3610\u001b[0m scaled_loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[0;32m 3611\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 3612\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mbackward(loss, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 3613\u001b[0m \u001b[38;5;66;03m# Finally we need to normalize the loss for reporting\u001b[39;00m\n\u001b[0;32m 3614\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m num_items_in_batch \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
289
+ "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\accelerate\\accelerator.py:1964\u001b[0m, in \u001b[0;36mAccelerator.backward\u001b[1;34m(self, loss, **kwargs)\u001b[0m\n\u001b[0;32m 1962\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscaler\u001b[38;5;241m.\u001b[39mscale(loss)\u001b[38;5;241m.\u001b[39mbackward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 1963\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1964\u001b[0m loss\u001b[38;5;241m.\u001b[39mbackward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
290
+ "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\torch\\_tensor.py:521\u001b[0m, in \u001b[0;36mTensor.backward\u001b[1;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[0;32m 511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 512\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[0;32m 513\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[0;32m 514\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 519\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[0;32m 520\u001b[0m )\n\u001b[1;32m--> 521\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 522\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[0;32m 523\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
291
+ "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\torch\\autograd\\__init__.py:289\u001b[0m, in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[0;32m 284\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[0;32m 286\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[0;32m 287\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[0;32m 288\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[1;32m--> 289\u001b[0m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 290\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 291\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 292\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 293\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 297\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
292
+ "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\torch\\autograd\\graph.py:768\u001b[0m, in \u001b[0;36m_engine_run_backward\u001b[1;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[0;32m 766\u001b[0m unregister_hooks \u001b[38;5;241m=\u001b[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[0;32m 767\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 768\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Variable\u001b[38;5;241m.\u001b[39m_execution_engine\u001b[38;5;241m.\u001b[39mrun_backward( \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[0;32m 769\u001b[0m t_outputs, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[0;32m 770\u001b[0m ) \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[0;32m 771\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 772\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
293
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
294
+ ]
295
+ }
296
+ ],
297
+ "source": [
298
+ "trainer.train()"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": null,
304
+ "id": "422d8f32-ecd9-4266-b5a4-bd26d45c4fc7",
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": []
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "id": "365cf997-a80e-407e-9848-74e4d4b6a8a8",
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": []
316
+ }
317
+ ],
318
+ "metadata": {
319
+ "kernelspec": {
320
+ "display_name": "Python 3 (ipykernel)",
321
+ "language": "python",
322
+ "name": "python3"
323
+ },
324
+ "language_info": {
325
+ "codemirror_mode": {
326
+ "name": "ipython",
327
+ "version": 3
328
+ },
329
+ "file_extension": ".py",
330
+ "mimetype": "text/x-python",
331
+ "name": "python",
332
+ "nbconvert_exporter": "python",
333
+ "pygments_lexer": "ipython3",
334
+ "version": "3.9.19"
335
+ }
336
+ },
337
+ "nbformat": 4,
338
+ "nbformat_minor": 5
339
+ }