vishwa2488 committed
Commit 73814b7 · verified
1 Parent(s): 51ef034

(Trained with Unsloth)
added_tokens.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
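These added tokens are the Qwen2-VL special tokens (chat markers, vision delimiters, image/video pads) mapped to IDs 151643–151656. A minimal sanity check, assuming the files in this commit are loaded from a placeholder repo id (substitute the real one):

```python
from transformers import AutoTokenizer

# Placeholder repo id -- not the actual repository name.
tokenizer = AutoTokenizer.from_pretrained("your-username/your-qwen2-vl-finetune")

# Every entry in added_tokens.json should round-trip to its listed ID.
for token, expected_id in [("<|im_start|>", 151644), ("<|im_end|>", 151645),
                           ("<|image_pad|>", 151655), ("<|video_pad|>", 151656)]:
    assert tokenizer.convert_tokens_to_ids(token) == expected_id, token
```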
chat_template.json CHANGED
@@ -1,3 +1,3 @@
  {
- "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- Find out if there are any images #}\n{% set image_ns = namespace(has_images=false) %} \n{%- for message in messages %}\n {%- for content in message['content'] %}\n {%- if content['type'] == 'image' %}\n {%- set image_ns.has_images = true %}\n {%- endif %}\n {%- endfor %}\n{%- endfor %}\n\n{#- Error out if there are images and system message #}\n{%- if image_ns.has_images and not system_message == \"\" %}\n {{- raise_exception(\"Prompting with images is incompatible with system messages.\") }}\n{%- endif %}\n\n{#- System message if there are no images #}\n{%- if not image_ns.has_images %}\n {{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n {%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n {%- endif %}\n {{- \"Cutting Knowledge Date: December 2023\\n\" }}\n {{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n {%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {%- endif %}\n {{- system_message }}\n {{- \"<|eot_id|>\" }}\n{%- endif %}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}\n {%- if message['content'] is string %}\n {{- message['content'] }}\n {%- else %}\n {%- for content in message['content'] %}\n {%- if content['type'] == 'image' %}\n {{- '<|image|>' }}\n {%- elif content['type'] == 'text' %}\n {{- content['text'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
  }
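The replacement is the standard Qwen2-VL ChatML-style template: it injects a default system prompt when none is given, wraps each turn in `<|im_start|>…<|im_end|>`, and renders image/video content items as `<|vision_start|><|image_pad|><|vision_end|>` or `<|vision_start|><|video_pad|><|vision_end|>`. A rendering sketch under the same placeholder repo id assumption:

```python
from transformers import AutoProcessor

# Placeholder repo id -- not the actual repository name.
processor = AutoProcessor.from_pretrained("your-username/your-qwen2-vl-finetune")

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]

# Expected shape of the rendered prompt:
#   <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
#   <|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n
#   <|im_start|>assistant\n
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
print(prompt)
```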
config.json CHANGED
@@ -1,227 +1,49 @@
  {
- "_name_or_path": "meta-llama/Llama-3.2-11B-Vision-Instruct",
+ "_name_or_path": "Qwen/Qwen2-VL-7B-Instruct",
  "architectures": [
- "MllamaForConditionalGeneration"
+ "Qwen2VLForConditionalGeneration"
  ],
- "image_token_index": 128256,
- "model_type": "mllama",
- "text_config": {
- "_attn_implementation_autoset": false,
- "_name_or_path": "",
- "add_cross_attention": false,
- "architectures": null,
- "bad_words_ids": null,
- "begin_suppress_tokens": null,
- "bos_token_id": 128000,
- "chunk_size_feed_forward": 0,
- "cross_attention_hidden_size": null,
- "cross_attention_layers": [
- 3,
- 8,
- 13,
- 18,
- 23,
- 28,
- 33,
- 38
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "image_token_id": 151655,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_type": "qwen2_vl",
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "pad_token_id": 151654,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_section": [
+ 16,
+ 24,
+ 24
  ],
- "decoder_start_token_id": null,
- "diversity_penalty": 0.0,
- "do_sample": false,
- "dropout": 0,
- "early_stopping": false,
- "encoder_no_repeat_ngram_size": 0,
- "eos_token_id": [
- 128001,
- 128008,
- 128009
- ],
- "exponential_decay_length_penalty": null,
- "finetuning_task": null,
- "forced_bos_token_id": null,
- "forced_eos_token_id": null,
- "hidden_act": "silu",
- "hidden_size": 4096,
- "id2label": {
- "0": "LABEL_0",
- "1": "LABEL_1"
- },
- "initializer_range": 0.02,
- "intermediate_size": 14336,
- "is_decoder": false,
- "is_encoder_decoder": false,
- "label2id": {
- "LABEL_0": 0,
- "LABEL_1": 1
- },
- "length_penalty": 1.0,
- "max_length": 20,
- "max_position_embeddings": 131072,
- "min_length": 0,
- "model_type": "mllama_text_model",
- "no_repeat_ngram_size": 0,
- "num_attention_heads": 32,
- "num_beam_groups": 1,
- "num_beams": 1,
- "num_hidden_layers": 40,
- "num_key_value_heads": 8,
- "num_return_sequences": 1,
- "output_attentions": false,
- "output_hidden_states": false,
- "output_scores": false,
- "pad_token_id": 128004,
- "prefix": null,
- "problem_type": null,
- "pruned_heads": {},
- "remove_invalid_values": false,
- "repetition_penalty": 1.0,
- "return_dict": true,
- "return_dict_in_generate": false,
- "rms_norm_eps": 1e-05,
- "rope_scaling": {
- "factor": 8.0,
- "high_freq_factor": 4.0,
- "low_freq_factor": 1.0,
- "original_max_position_embeddings": 8192,
- "rope_type": "llama3"
- },
- "rope_theta": 500000.0,
- "sep_token_id": null,
- "suppress_tokens": null,
- "task_specific_params": null,
- "temperature": 1.0,
- "tf_legacy_loss": false,
- "tie_encoder_decoder": false,
- "tie_word_embeddings": false,
- "tokenizer_class": null,
- "top_k": 50,
- "top_p": 1.0,
- "torch_dtype": "bfloat16",
- "torchscript": false,
- "typical_p": 1.0,
- "use_bfloat16": false,
- "use_cache": true,
- "vocab_size": 128256
+ "rope_type": "default",
+ "type": "default"
  },
+ "rope_theta": 1000000.0,
+ "sliding_window": 32768,
+ "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.3",
  "unsloth_fixed": true,
+ "use_cache": true,
+ "use_sliding_window": false,
+ "video_token_id": 151656,
  "vision_config": {
- "_attn_implementation_autoset": false,
- "_name_or_path": "",
- "add_cross_attention": false,
- "architectures": null,
- "attention_heads": 16,
- "bad_words_ids": null,
- "begin_suppress_tokens": null,
- "bos_token_id": null,
- "chunk_size_feed_forward": 0,
- "cross_attention_hidden_size": null,
- "decoder_start_token_id": null,
- "diversity_penalty": 0.0,
- "do_sample": false,
- "early_stopping": false,
- "encoder_no_repeat_ngram_size": 0,
- "eos_token_id": null,
- "exponential_decay_length_penalty": null,
- "finetuning_task": null,
- "forced_bos_token_id": null,
- "forced_eos_token_id": null,
- "hidden_act": "gelu",
- "hidden_size": 1280,
- "id2label": {
- "0": "LABEL_0",
- "1": "LABEL_1"
- },
- "image_size": 560,
- "initializer_range": 0.02,
- "intermediate_layers_indices": [
- 3,
- 7,
- 15,
- 23,
- 30
- ],
- "intermediate_size": 5120,
- "is_decoder": false,
- "is_encoder_decoder": false,
- "label2id": {
- "LABEL_0": 0,
- "LABEL_1": 1
- },
- "length_penalty": 1.0,
- "max_length": 20,
- "max_num_tiles": 4,
- "min_length": 0,
- "model_type": "mllama_vision_model",
- "no_repeat_ngram_size": 0,
- "norm_eps": 1e-05,
- "num_beam_groups": 1,
- "num_beams": 1,
- "num_channels": 3,
- "num_global_layers": 8,
- "num_hidden_layers": 32,
- "num_return_sequences": 1,
- "output_attentions": false,
- "output_hidden_states": false,
- "output_scores": false,
- "pad_token_id": null,
- "patch_size": 14,
- "prefix": null,
- "problem_type": null,
- "pruned_heads": {},
- "remove_invalid_values": false,
- "repetition_penalty": 1.0,
- "return_dict": true,
- "return_dict_in_generate": false,
- "sep_token_id": null,
- "supported_aspect_ratios": [
- [
- 1,
- 1
- ],
- [
- 1,
- 2
- ],
- [
- 1,
- 3
- ],
- [
- 1,
- 4
- ],
- [
- 2,
- 1
- ],
- [
- 2,
- 2
- ],
- [
- 3,
- 1
- ],
- [
- 4,
- 1
- ]
- ],
- "suppress_tokens": null,
- "task_specific_params": null,
- "temperature": 1.0,
- "tf_legacy_loss": false,
- "tie_encoder_decoder": false,
- "tie_word_embeddings": true,
- "tokenizer_class": null,
- "top_k": 50,
- "top_p": 1.0,
- "torch_dtype": "bfloat16",
- "torchscript": false,
- "typical_p": 1.0,
- "use_bfloat16": false,
- "vision_output_dim": 7680
- }
+ "in_chans": 3,
+ "model_type": "qwen2_vl",
+ "spatial_patch_size": 14
+ },
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652,
+ "vision_token_id": 151654,
+ "vocab_size": 152064
  }
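The new config.json swaps the Mllama (Llama 3.2 Vision) configuration for a 7B Qwen2-VL decoder: 28 layers, hidden size 3584, grouped-query attention with 4 KV heads, multimodal rotary embeddings (`mrope_section: [16, 24, 24]`), and a Qwen2-VL vision tower with 14-pixel spatial patches. A loading sketch under the same placeholder repo id assumption:

```python
import torch
from transformers import Qwen2VLForConditionalGeneration

# Placeholder repo id -- not the actual repository name.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "your-username/your-qwen2-vl-finetune",
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",
)

print(model.config.model_type)         # qwen2_vl
print(model.config.num_hidden_layers)  # 28
print(model.config.rope_scaling)       # {'mrope_section': [16, 24, 24], ...}
```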
generation_config.json CHANGED
@@ -1,13 +1,14 @@
  {
- "bos_token_id": 128000,
+ "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
- 128001,
- 128008,
- 128009
+ 151645,
+ 151643
  ],
- "pad_token_id": 128004,
- "temperature": 0.6,
- "top_p": 0.9,
+ "max_length": 32768,
+ "pad_token_id": 151654,
+ "temperature": 0.01,
+ "top_k": 1,
+ "top_p": 0.001,
  "transformers_version": "4.46.3"
  }
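The new generation defaults are effectively greedy: `do_sample` stays true, but `temperature=0.01`, `top_k=1`, and `top_p=0.001` collapse the sampling distribution to the most likely token. A quick inspection and override sketch (placeholder repo id):

```python
from transformers import GenerationConfig

# Placeholder repo id -- not the actual repository name.
gen_cfg = GenerationConfig.from_pretrained("your-username/your-qwen2-vl-finetune")
print(gen_cfg.temperature, gen_cfg.top_k, gen_cfg.top_p)  # 0.01 1 0.001 -> near-greedy
print(gen_cfg.eos_token_id)                               # [151645, 151643]

# For more diverse sampling, pass explicit kwargs at call time, e.g.:
# model.generate(**inputs, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=256)
```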
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -1,7 +1,6 @@
  {
  "do_convert_rgb": true,
  "do_normalize": true,
- "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
@@ -9,18 +8,22 @@
  0.4578275,
  0.40821073
  ],
- "image_processor_type": "MllamaImageProcessor",
+ "image_processor_type": "Qwen2VLImageProcessor",
  "image_std": [
  0.26862954,
  0.26130258,
  0.27577711
  ],
- "max_image_tiles": 4,
- "processor_class": "MllamaProcessor",
- "resample": 2,
+ "max_pixels": 12845056,
+ "merge_size": 2,
+ "min_pixels": 3136,
+ "patch_size": 14,
+ "processor_class": "Qwen2VLProcessor",
+ "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
- "height": 560,
- "width": 560
- }
+ "max_pixels": 12845056,
+ "min_pixels": 3136
+ },
+ "temporal_patch_size": 2
  }
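Instead of Mllama's fixed 560x560 tiling, the Qwen2-VL image processor resizes each image so its pixel count falls between `min_pixels` and `max_pixels`, cuts it into 14x14 patches (duplicated along time via `temporal_patch_size=2`), and merges 2x2 patch groups into single visual tokens. A usage sketch (placeholder repo id; the shapes in the comments follow from those settings):

```python
from PIL import Image
from transformers import AutoProcessor

# Placeholder repo id -- not the actual repository name.
processor = AutoProcessor.from_pretrained("your-username/your-qwen2-vl-finetune")

image = Image.new("RGB", (1024, 768))  # any RGB image
inputs = processor(
    text=["<|vision_start|><|image_pad|><|vision_end|>Describe the image."],
    images=[image],
    return_tensors="pt",
)
print(inputs["pixel_values"].shape)  # (num_patches, 3 * temporal_patch_size * 14 * 14)
print(inputs["image_grid_thw"])      # per-image (temporal, height, width) patch grid
```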
vocab.json ADDED
The diff for this file is too large to render. See raw diff
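merges.txt and vocab.json together define the byte-level BPE vocabulary and merge rules behind the ~152k-token vocabulary referenced by `vocab_size` in config.json. A small sketch, assuming local copies of the two files, just to inspect them with the `tokenizers` library:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Hypothetical local paths to the two files added in this commit.
bpe = BPE.from_file("vocab.json", "merges.txt")
tokenizer = Tokenizer(bpe)
print(tokenizer.get_vocab_size())  # base BPE vocab, before the added special tokens
```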