xinlai committed on
Commit
33e54e3
·
1 Parent(s): 674d663
app.py CHANGED
@@ -30,12 +30,8 @@ from diffusers import AutoencoderKL, UNet2DConditionModel, EulerDiscreteSchedule
30
 
31
  pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
32
 
33
- from src.data.any_res import process_anyres_image
34
-
35
  BOI_TOKEN = '<img>'
36
- BOP_TOKEN = '<patch>'
37
  EOI_TOKEN = '</img>'
38
- EOP_TOKEN = '</patch>'
39
  IMG_TOKEN = '<img_{:05d}>'
40
 
41
  IMG_FLAG = '<image>'
@@ -70,7 +66,7 @@ class Arguments:
70
  tokenizer: Optional[str] = field(default='configs/tokenizer/clm_llama_tokenizer.yaml',
71
  metadata={"help": "config path of tokenizer used to initialize tokenizer"})
72
  llm: Optional[str] = field(default='configs/clm_models/llama2chat7b_lora.yaml', metadata={"help": "config path of llm"})
73
- visual_encoder: Optional[str] = field(default='configs/visual_tokenzier/qwen_vitg_448.yaml',
74
  metadata={"help": "config path of visual encoder"})
75
  sd_adapter: Optional[str] = field(
76
  default='configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml',
@@ -158,10 +154,9 @@ class LLMService:
158
 
159
  self.visual_encoder.to(self.vit_sd_device, dtype=self.dtype)
160
 
161
- model_id_or_path = "stablediffusionapi/realistic-vision-v51"
162
- self.vae_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, safety_checker=None,
163
- torch_dtype=torch.float16)
164
- # self.vae_pipe = self.vae_pipe.to(self.vit_sd_device)
165
 
166
  self.boi_token_id = self.tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
167
  self.eoi_token_id = self.tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
@@ -171,7 +166,7 @@ service = LLMService(args)
171
 
172
 
173
  @spaces.GPU
174
- def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox, force_polish):
175
  with torch.no_grad():
176
  text_list = text_list.split(IMG_FLAG)
177
  top_p = 0.5
@@ -300,53 +295,17 @@ def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox, force
300
  img_feat = img_gen_feat[img_idx:img_idx + 1]
301
  generated_image = service.sd_adapter.generate(image_embeds=img_feat, num_inference_steps=50)[0]
302
 
303
- if force_polish:
304
- # service.sd_adapter = service.sd_adapter.cpu()
305
- # service.vae_pipe = service.vae_pipe.to(service.vit_sd_device, dtype=service.dtype)
306
-
307
- torch.cuda.empty_cache()
308
-
309
- service.vae_pipe = service.vae_pipe.to(service.vit_sd_device)
310
-
311
- init_image = generated_image.resize((1024, 1024))
312
- prompt = ""
313
- images = service.vae_pipe(prompt=prompt, image=init_image,
314
- num_inference_steps=50, guidance_scale=8.0, strength=0.38).images
315
- generated_image = images[0]
316
-
317
- image_base64 = encode_image(generated_image)
318
- gen_imgs_base64_list.append(image_base64)
319
-
320
- # service.vae_pipe = service.vae_pipe.to("cpu")
321
- # service.sd_adapter = service.sd_adapter.to(service.vit_sd_device, dtype=service.dtype)
322
-
323
- torch.cuda.empty_cache()
324
-
325
- # print('loading visual encoder and llm to GPU, and sd to CPU')
326
  # a = time.time()
327
  # service.sd_adapter = service.sd_adapter.cpu()
328
  # service.visual_encoder = service.visual_encoder.to(service.vit_sd_device, dtype=service.dtype)
329
  # service.agent = service.agent.to(service.vit_sd_device, dtype=service.dtype)
330
  # print("Loading finished: ", time.time() - a)
331
 
332
- if args.has_bbox:
333
- bboxes = extract_box(generated_text)
334
- if bboxes is not None and len(input_images) > 0:
335
- image_viz = visualize_bbox(input_images[-1], bboxes)
336
- image_base64 = encode_image(image_viz)
337
- gen_imgs_base64_list.append(image_base64)
338
- if '<box_start>' in generated_text:
339
- generated_text = re.sub(r'\[\[ <box_start>.*?<box_end>.*?\]\]', 'the green bounding box',
340
- generated_text)
341
- else:
342
- generated_text = re.sub(r'<loc-\d+> <loc-\d+> <loc-\d+> <loc-\d+> <box_end> \]\]',
343
- 'the green bounding box', generated_text)
344
- generated_text += IMG_FLAG
345
  print(input_text + generated_text)
346
  return {'text': generated_text, 'images': gen_imgs_base64_list, 'error_msg': error_msg}
347
 
348
 
349
- def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_gen, force_bbox, force_polish,
350
  request: gr.Request):
351
  print('input_state:', input_state)
352
 
@@ -365,10 +324,8 @@ def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_g
365
  text = prompt['text']
366
  max_new_tokens = int(max_new_tokens)
367
  images = prompt['images']
368
- force_boi = force_image_gen
369
- force_bbox = force_bbox
370
 
371
- results = generate(text, images, max_new_tokens, force_boi, force_bbox, force_polish)
372
  print('response: ', {'text': results['text'], 'error_msg': results['error_msg']})
373
 
374
  output_state = init_input_state()
@@ -588,25 +545,18 @@ def load_demo(request: gr.Request):
588
 
589
 
590
  title = ("""
591
- # SEED-X-I
592
- [[Paper]](https://arxiv.org/abs/2404.14396) [[Code]](https://github.com/AILab-CVC/SEED-X) [[Faster Demo]](https://arc.tencent.com/en/ai-demos/multimodal)
593
-
594
- Demo of a general instruction-tuned model SEED-X-I (17B) from the foundation model SEED-X.
595
- SEED-X-I can follow multimodal instruction (including images with **dynamic resolutions**) and make responses with **images, texts and bounding boxes** in multi-turn conversation.
596
-
597
- SEED-X-I **does not support image manipulation**. If you want to experience **SEED-X-Edit** for high-precision image editing, please refer to [[Inference Code]](https://github.com/AILab-CVC/SEED-X).
598
 
599
- If you want to experience the normal model inference speed, you can use [[Faster Demo]](https://arc.tencent.com/en/ai-demos/multimodal) or run [[Inference Code]](https://github.com/AILab-CVC/SEED-X) locally.
 
600
 
601
  ## Tips:
602
  * Check out the conversation examples (at the bottom) for inspiration.
603
  * You can adjust "Max History Rounds" to try a conversation with up to **three rounds due to insufficient GPU memory**. For more turns, you can download our checkpoints from GitHub and deploy them locally for inference.
604
  * Our demo supports a mix of images and texts as input. You can freely upload an image or enter text, and then click on "Add Image/Text". You can repeat the former step multiple times, and click on "Submit" for model inference at last.
605
- * You can click "Force Image Generation" to compel the model to produce images when necessary. For example, our model might struggle to generate images when there is an excessive amount of text-only context.
606
- * You can click "Force Bounding Box" to compel the model to produce bounding box for object detection.
607
- * You can click "Force Polishing Generated Image" to compel the model to polish the generated image with image post-processing.
608
 
609
- * SEED-X was trained with English-only data. It may process with other languages due to the inherent capabilities from LLaMA, but might not stable.
610
  """)
611
 
612
  css = """
 
30
 
31
  pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
32
 
 
 
33
  BOI_TOKEN = '<img>'
 
34
  EOI_TOKEN = '</img>'
 
35
  IMG_TOKEN = '<img_{:05d}>'
36
 
37
  IMG_FLAG = '<image>'
 
66
  tokenizer: Optional[str] = field(default='configs/tokenizer/clm_llama_tokenizer.yaml',
67
  metadata={"help": "config path of tokenizer used to initialize tokenizer"})
68
  llm: Optional[str] = field(default='configs/clm_models/llama2chat7b_lora.yaml', metadata={"help": "config path of llm"})
69
+ visual_encoder: Optional[str] = field(default='configs/visual_tokenizer/qwen_vitg_448.yaml',
70
  metadata={"help": "config path of visual encoder"})
71
  sd_adapter: Optional[str] = field(
72
  default='configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml',
 
154
 
155
  self.visual_encoder.to(self.vit_sd_device, dtype=self.dtype)
156
 
157
+ # model_id_or_path = "stablediffusionapi/realistic-vision-v51"
158
+ # self.vae_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, safety_checker=None,
159
+ # torch_dtype=torch.float16)
 
160
 
161
  self.boi_token_id = self.tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
162
  self.eoi_token_id = self.tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
 
166
 
167
 
168
  @spaces.GPU
169
+ def generate(text_list, image_list, max_new_tokens):
170
  with torch.no_grad():
171
  text_list = text_list.split(IMG_FLAG)
172
  top_p = 0.5
 
295
  img_feat = img_gen_feat[img_idx:img_idx + 1]
296
  generated_image = service.sd_adapter.generate(image_embeds=img_feat, num_inference_steps=50)[0]
297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  # a = time.time()
299
  # service.sd_adapter = service.sd_adapter.cpu()
300
  # service.visual_encoder = service.visual_encoder.to(service.vit_sd_device, dtype=service.dtype)
301
  # service.agent = service.agent.to(service.vit_sd_device, dtype=service.dtype)
302
  # print("Loading finished: ", time.time() - a)
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  print(input_text + generated_text)
305
  return {'text': generated_text, 'images': gen_imgs_base64_list, 'error_msg': error_msg}
306
 
307
 
308
+ def http_bot(dialog_state, input_state, max_new_tokens, max_turns,
309
  request: gr.Request):
310
  print('input_state:', input_state)
311
 
 
324
  text = prompt['text']
325
  max_new_tokens = int(max_new_tokens)
326
  images = prompt['images']
 
 
327
 
328
+ results = generate(text, images, max_new_tokens)
329
  print('response: ', {'text': results['text'], 'error_msg': results['error_msg']})
330
 
331
  output_state = init_input_state()
 
545
 
546
 
547
  title = ("""
548
+ # SEED-Story
549
+ [[Paper]](https://arxiv.org/abs/2407.08683) [[Code]](https://github.com/TencentARC/SEED-Story)
 
 
 
 
 
550
 
551
+ Demo of a multimodal story generation model SEED-Story-George. It is trained on the StoryStream Curious George subset.
552
+ SEED-Story is an MLLM capable of generating multimodal long stories consisting of rich and coherent narrative texts, along with images that are consistent in characters and style.
553
 
554
  ## Tips:
555
  * Check out the conversation examples (at the bottom) for inspiration.
556
  * You can adjust "Max History Rounds" to try a conversation with up to **three rounds due to insufficient GPU memory**. For more turns, you can download our checkpoints from GitHub and deploy them locally for inference.
557
  * Our demo supports a mix of images and texts as input. You can freely upload an image or enter text, and then click on "Add Image/Text". You can repeat the former step multiple times, and click on "Submit" for model inference at last.
 
 
 
558
 
559
+ * SEED-Story was trained with English-only data. It may handle other languages thanks to the inherent capabilities of LLaMA, but the results might not be stable.
560
  """)
561
 
562
  css = """
configs/clm_models/agent_7b_seedx_pretrained.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.models_clm.models.ContinuousLVLM.from_pretrained
2
+ input_resampler:
3
+ _target_: src.models.qwen_visual.Resampler
4
+ grid_size: 8
5
+ embed_dim: 4096
6
+ num_heads: 32
7
+ kv_dim: 4096
8
+
9
+ output_resampler:
10
+ _target_: src.models.qwen_visual.Resampler
11
+ grid_size: 16
12
+ embed_dim: 4096
13
+ num_heads: 32
14
+ kv_dim: 4096
15
+
16
+ lm_loss_scale: 1.0
17
+ rec_loss_scale: 1.0
18
+ pretrained_model_path: pretrained/seedx/checkpoint-30000/pytorch_model.bin
configs/clm_models/agent_7b_sft.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.models_clm.models.ContinuousLVLM.from_pretrained
2
+ input_resampler:
3
+ _target_: src.models.qwen_visual.Resampler
4
+ grid_size: 8
5
+ embed_dim: 4096
6
+ num_heads: 32
7
+ kv_dim: 4096
8
+
9
+ output_resampler:
10
+ _target_: src.models.qwen_visual.Resampler
11
+ grid_size: 16
12
+ embed_dim: 4096
13
+ num_heads: 32
14
+ kv_dim: 4096
15
+
16
+ lm_loss_scale: 1.0
17
+ rec_loss_scale: 1.0
18
+ pretrained_model_path: pretrained/seed_story/george_sft/pytorch_model.bin
configs/clm_models/llama2chat7b_lora.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.models_clm.peft_models.get_peft_model_with_resize_embedding
2
+ model:
3
+ _target_: src.models_clm.modeling_llama_xformer.LlamaForCausalLM.from_pretrained
4
+ # _target_: transformers.LlamaForCausalLM.from_pretrained
5
+ pretrained_model_name_or_path: luodian/llama-7b-hf
6
+ low_cpu_mem_usage: True
7
+ peft_config:
8
+ _target_: peft.LoraConfig
9
+ _convert_: object
10
+ r: 16
11
+ lora_alpha: 32
12
+ modules_to_save:
13
+ # - embed_tokens
14
+ # - lm_head
15
+ - input_layernorm
16
+ - post_attention_layernorm
17
+ - norm
18
+ target_modules:
19
+ - q_proj
20
+ - v_proj
21
+ - k_proj
22
+ - o_proj
23
+ - gate_proj
24
+ - down_proj
25
+ - up_proj
26
+ task_type: CAUSAL_LM
27
+ lora_dropout: 0.05
28
+
29
+ vocab_size: 32066
30
+ # _target_: src.models_clm.peft_models.get_model_with_resize_embedding
31
+ # model:
32
+ # # _target_: src.models_clm.modeling_llama_xformer.LlamaForCausalLM.from_pretrained
33
+ # _target_: transformers.LlamaForCausalLM.from_pretrained
34
+ # pretrained_model_name_or_path: /apdcephfs_cq3/share_1290939/sijiezhao/model_hub/Llama-2-7b-hf
35
+ # low_cpu_mem_usage: True
36
+
37
+ # vocab_size: 32066
configs/data/george_sdxl.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.data.story_telling.build_multi_datapipes
2
+ _recursive_: False
3
+ datapipes:
4
+ - _target_: src.data.story_telling.build_long_story_datapipe
5
+ data_dir: data/json/george_train10
6
+ image_dir: data/image/george_full
7
+ max_length: 1280
8
+ batch_size: 4
9
+ instruction_prompt: "{instruction}"
10
+ # turn_sep: '\n'
11
+ min_aspect_ratio: 0.2
12
+ min_resolution: 128
13
+ num_img_in_tokens: 64
14
+ num_img_out_tokens: 64
15
+ cycle_count: 10000
16
+ story_len: 10
17
+
18
+ sample_weights:
19
+ - 1.0 # george long-story datapipe
configs/data/george_sft.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.data.story_telling.build_multi_datapipes
2
+ _recursive_: False
3
+ datapipes:
4
+ - _target_: src.data.story_telling.build_long_story_datapipe
5
+ data_dir: data/json/george_train10
6
+ image_dir: data/image/george_full
7
+ max_length: 1280
8
+ batch_size: 30
9
+ instruction_prompt: "{instruction}"
10
+ # turn_sep: '\n'
11
+ min_aspect_ratio: 0.2
12
+ min_resolution: 128
13
+ num_img_in_tokens: 64
14
+ num_img_out_tokens: 64
15
+ cycle_count: 10000
16
+ story_len: 10
17
+
18
+ sample_weights:
19
+ - 1.0 # george long-story datapipe
configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.models_ipa.adapter_modules.SDXLAdapter.from_pretrained
2
+
3
+ resampler:
4
+ _target_: src.models_ipa.resampler.ResamplerXLV2
5
+ dim: 1024
6
+ depth: 4
7
+ dim_head: 64
8
+ heads: 16
9
+ num_queries: 64
10
+ embedding_dim: 4096
11
+ output1_dim: 768
12
+ output2_dim: 1280
13
+ ff_mult: 4
14
+
15
+ pretrained_model_path: pretrained/detokenizer/detokenizer_george_adapted/checkpoint-4000/pytorch_model.bin
configs/detokenizer/detokenizer_sdxl_qwen_vit_pretrained.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.models_ipa.adapter_modules.SDXLAdapter.from_pretrained
2
+
3
+ resampler:
4
+ _target_: src.models_ipa.resampler.ResamplerXLV2
5
+ dim: 1024
6
+ depth: 4
7
+ dim_head: 64
8
+ heads: 16
9
+ num_queries: 64
10
+ embedding_dim: 4096
11
+ output1_dim: 768
12
+ output2_dim: 1280
13
+ ff_mult: 4
14
+
15
+ pretrained_model_path: pretrained/detokenizer_pretrained/checkpoint-55000/pytorch_model.bin
configs/discrete_model/discrete_identity.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ _target_: src.models.discrete_models.DiscreteModleIdentity
configs/processer/qwen_448_transform.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ _target_: src.processer.transforms.get_transform
2
+ type: clip
3
+ image_size: 448
4
+ keep_ratio: False
configs/processer/qwen_448_transform_keep_ratio.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ _target_: src.processer.transforms.get_transform
2
+ type: clip
3
+ image_size: 448
4
+ keep_ratio: True
configs/processer/sd_transform_1024.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ _target_: src.processer.transforms.get_transform
2
+ type: sd
3
+ image_size: 1024
4
+ keep_ratio: True
configs/tokenizer/clm_llama_tokenizer.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ _target_: transformers.LlamaTokenizer.from_pretrained
2
+ pretrained_model_name_or_path: pretrained/cvlm_llama2_tokenizer
configs/visual_tokenizer/qwen_vitg_448.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.models.qwen_visual.VisionTransformerWithAttnPool.from_pretrained
2
+ heads: 16
3
+ image_size: 448
4
+ image_start_id: 151857
5
+ layers: 48
6
+ mlp_ratio: 4.9231
7
+ output_dim: 4096
8
+ patch_size: 14
9
+ width: 1664
10
+ pretrained_model_path: /dataset/syang/pretrained/qwen_vit_G.pt
pretrained/cvlm_llama2_tokenizer/added_tokens.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</img>": 32065,
3
+ "<img>": 32064,
4
+ "<img_00000>": 32000,
5
+ "<img_00001>": 32001,
6
+ "<img_00002>": 32002,
7
+ "<img_00003>": 32003,
8
+ "<img_00004>": 32004,
9
+ "<img_00005>": 32005,
10
+ "<img_00006>": 32006,
11
+ "<img_00007>": 32007,
12
+ "<img_00008>": 32008,
13
+ "<img_00009>": 32009,
14
+ "<img_00010>": 32010,
15
+ "<img_00011>": 32011,
16
+ "<img_00012>": 32012,
17
+ "<img_00013>": 32013,
18
+ "<img_00014>": 32014,
19
+ "<img_00015>": 32015,
20
+ "<img_00016>": 32016,
21
+ "<img_00017>": 32017,
22
+ "<img_00018>": 32018,
23
+ "<img_00019>": 32019,
24
+ "<img_00020>": 32020,
25
+ "<img_00021>": 32021,
26
+ "<img_00022>": 32022,
27
+ "<img_00023>": 32023,
28
+ "<img_00024>": 32024,
29
+ "<img_00025>": 32025,
30
+ "<img_00026>": 32026,
31
+ "<img_00027>": 32027,
32
+ "<img_00028>": 32028,
33
+ "<img_00029>": 32029,
34
+ "<img_00030>": 32030,
35
+ "<img_00031>": 32031,
36
+ "<img_00032>": 32032,
37
+ "<img_00033>": 32033,
38
+ "<img_00034>": 32034,
39
+ "<img_00035>": 32035,
40
+ "<img_00036>": 32036,
41
+ "<img_00037>": 32037,
42
+ "<img_00038>": 32038,
43
+ "<img_00039>": 32039,
44
+ "<img_00040>": 32040,
45
+ "<img_00041>": 32041,
46
+ "<img_00042>": 32042,
47
+ "<img_00043>": 32043,
48
+ "<img_00044>": 32044,
49
+ "<img_00045>": 32045,
50
+ "<img_00046>": 32046,
51
+ "<img_00047>": 32047,
52
+ "<img_00048>": 32048,
53
+ "<img_00049>": 32049,
54
+ "<img_00050>": 32050,
55
+ "<img_00051>": 32051,
56
+ "<img_00052>": 32052,
57
+ "<img_00053>": 32053,
58
+ "<img_00054>": 32054,
59
+ "<img_00055>": 32055,
60
+ "<img_00056>": 32056,
61
+ "<img_00057>": 32057,
62
+ "<img_00058>": 32058,
63
+ "<img_00059>": 32059,
64
+ "<img_00060>": 32060,
65
+ "<img_00061>": 32061,
66
+ "<img_00062>": 32062,
67
+ "<img_00063>": 32063
68
+ }
pretrained/cvlm_llama2_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<img>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</img>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": {
19
+ "content": "<s>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "eos_token": {
26
+ "content": "</s>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "pad_token": "<unk>",
33
+ "unk_token": {
34
+ "content": "<unk>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ }
40
+ }
pretrained/cvlm_llama2_tokenizer/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
pretrained/cvlm_llama2_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "32000": {
30
+ "content": "<img_00000>",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "32001": {
38
+ "content": "<img_00001>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "32002": {
46
+ "content": "<img_00002>",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "32003": {
54
+ "content": "<img_00003>",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "32004": {
62
+ "content": "<img_00004>",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "32005": {
70
+ "content": "<img_00005>",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "32006": {
78
+ "content": "<img_00006>",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "32007": {
86
+ "content": "<img_00007>",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "32008": {
94
+ "content": "<img_00008>",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "32009": {
102
+ "content": "<img_00009>",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "32010": {
110
+ "content": "<img_00010>",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "32011": {
118
+ "content": "<img_00011>",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "32012": {
126
+ "content": "<img_00012>",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "32013": {
134
+ "content": "<img_00013>",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "32014": {
142
+ "content": "<img_00014>",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "32015": {
150
+ "content": "<img_00015>",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "32016": {
158
+ "content": "<img_00016>",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "32017": {
166
+ "content": "<img_00017>",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "32018": {
174
+ "content": "<img_00018>",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "32019": {
182
+ "content": "<img_00019>",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "32020": {
190
+ "content": "<img_00020>",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "32021": {
198
+ "content": "<img_00021>",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "32022": {
206
+ "content": "<img_00022>",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "32023": {
214
+ "content": "<img_00023>",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "32024": {
222
+ "content": "<img_00024>",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "32025": {
230
+ "content": "<img_00025>",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "32026": {
238
+ "content": "<img_00026>",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "32027": {
246
+ "content": "<img_00027>",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "32028": {
254
+ "content": "<img_00028>",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "32029": {
262
+ "content": "<img_00029>",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "32030": {
270
+ "content": "<img_00030>",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "32031": {
278
+ "content": "<img_00031>",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "32032": {
286
+ "content": "<img_00032>",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "32033": {
294
+ "content": "<img_00033>",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "32034": {
302
+ "content": "<img_00034>",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "32035": {
310
+ "content": "<img_00035>",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "32036": {
318
+ "content": "<img_00036>",
319
+ "lstrip": false,
320
+ "normalized": true,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "32037": {
326
+ "content": "<img_00037>",
327
+ "lstrip": false,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "32038": {
334
+ "content": "<img_00038>",
335
+ "lstrip": false,
336
+ "normalized": true,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "32039": {
342
+ "content": "<img_00039>",
343
+ "lstrip": false,
344
+ "normalized": true,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "32040": {
350
+ "content": "<img_00040>",
351
+ "lstrip": false,
352
+ "normalized": true,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "32041": {
358
+ "content": "<img_00041>",
359
+ "lstrip": false,
360
+ "normalized": true,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "32042": {
366
+ "content": "<img_00042>",
367
+ "lstrip": false,
368
+ "normalized": true,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "32043": {
374
+ "content": "<img_00043>",
375
+ "lstrip": false,
376
+ "normalized": true,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "32044": {
382
+ "content": "<img_00044>",
383
+ "lstrip": false,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "32045": {
390
+ "content": "<img_00045>",
391
+ "lstrip": false,
392
+ "normalized": true,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "32046": {
398
+ "content": "<img_00046>",
399
+ "lstrip": false,
400
+ "normalized": true,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "32047": {
406
+ "content": "<img_00047>",
407
+ "lstrip": false,
408
+ "normalized": true,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "32048": {
414
+ "content": "<img_00048>",
415
+ "lstrip": false,
416
+ "normalized": true,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "32049": {
422
+ "content": "<img_00049>",
423
+ "lstrip": false,
424
+ "normalized": true,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "32050": {
430
+ "content": "<img_00050>",
431
+ "lstrip": false,
432
+ "normalized": true,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "32051": {
438
+ "content": "<img_00051>",
439
+ "lstrip": false,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "32052": {
446
+ "content": "<img_00052>",
447
+ "lstrip": false,
448
+ "normalized": true,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "32053": {
454
+ "content": "<img_00053>",
455
+ "lstrip": false,
456
+ "normalized": true,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "32054": {
462
+ "content": "<img_00054>",
463
+ "lstrip": false,
464
+ "normalized": true,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "32055": {
470
+ "content": "<img_00055>",
471
+ "lstrip": false,
472
+ "normalized": true,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "32056": {
478
+ "content": "<img_00056>",
479
+ "lstrip": false,
480
+ "normalized": true,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "32057": {
486
+ "content": "<img_00057>",
487
+ "lstrip": false,
488
+ "normalized": true,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "32058": {
494
+ "content": "<img_00058>",
495
+ "lstrip": false,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "32059": {
502
+ "content": "<img_00059>",
503
+ "lstrip": false,
504
+ "normalized": true,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "32060": {
510
+ "content": "<img_00060>",
511
+ "lstrip": false,
512
+ "normalized": true,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "32061": {
518
+ "content": "<img_00061>",
519
+ "lstrip": false,
520
+ "normalized": true,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "32062": {
526
+ "content": "<img_00062>",
527
+ "lstrip": false,
528
+ "normalized": true,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "32063": {
534
+ "content": "<img_00063>",
535
+ "lstrip": false,
536
+ "normalized": true,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "32064": {
542
+ "content": "<img>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "32065": {
550
+ "content": "</img>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ }
557
+ },
558
+ "additional_special_tokens": [
559
+ "<img>",
560
+ "</img>"
561
+ ],
562
+ "bos_token": "<s>",
563
+ "clean_up_tokenization_spaces": false,
564
+ "eos_token": "</s>",
565
+ "legacy": false,
566
+ "model_max_length": 1000000000000000019884624838656,
567
+ "pad_token": "<unk>",
568
+ "sp_model_kwargs": {},
569
+ "spaces_between_special_tokens": false,
570
+ "tokenizer_class": "LlamaTokenizer",
571
+ "unk_token": "<unk>",
572
+ "use_default_system_prompt": false
573
+ }
pretrained/detokenizer/detokenizer_george_adapted/checkpoint-4000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:785d4e670ccfdce33b493d0aada60ee5c116918468098b2ed82ae2c28f31e423
3
+ size 6471628187
pretrained/qwen_vit_G.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d951083fc79b07bdb84be61944eb263b8e14572fe2dc4fa80b0447f83064463c
3
+ size 3871440281
pretrained/seed_story/george_sft/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46db6f1beb672085204ca9f7d542f6b62063cbe9970933ca702bccc72f00a4f6
3
+ size 14709979626