omni-research committed on
Commit dcd4560 · 1 Parent(s): aa4801f

update to tarsier2-7b-0115

app.py CHANGED
@@ -13,19 +13,22 @@
 # limitations under the License.
 
 # copy and modify from: https://github.com/OpenGVLab/Ask-Anything/blob/main/video_chat2/demo/demo.py
-import spaces
+
+import spaces # for deploying on huggingface ZeroGPU
 from copy import deepcopy
 import gradio as gr
 from gradio.themes.utils import colors, fonts, sizes
 from tools.conversation import Chat, conv_templates
 from tools.utils import load_model_and_processor, file_to_base64
-from dataset.processor import Processor
+from dataset.tarsier_datamodule import init_processor
 import os
 import torch
+import yaml
 
 # huggingface-cli login
 
-model_path = os.getenv("MODEL_PATH", "omni-research/Tarsier2-7b")
+model_path = os.getenv("MODEL_PATH", "omni-research/Tarsier2-7b-0115")
+config_path = "configs/tarser2_default_config.yaml"
 max_n_frames = int(os.getenv("MAX_N_FRAMES", 16))
 debug = False
 device = 'cuda' if not debug else 'cpu'
@@ -34,13 +37,14 @@ device = 'cuda' if not debug else 'cpu'
 # Model Initialization
 # ========================================
 def init_model():
+    config = yaml.safe_load(open(config_path, 'r'))
     print("Start Initialization...")
     # if torch.cuda.is_available():
     if not debug:
-        model, processor = load_model_and_processor(model_path, max_n_frames)
+        model, processor = load_model_and_processor(model_path, config)
     else:
         print(f"No Valid GPU! Lauch in debug mode!")
-        processor = Processor(model_path, max_n_frames)
+        processor = init_processor(model_path, config)
         model = None
     chat = Chat(model, processor, device, debug)
     print('Initialization Finished')
@@ -50,13 +54,11 @@ def init_model():
 # ========================================
 # Gradio Setting
 # ========================================
-def gradio_reset(chat_state, img_file, img_list):
+def gradio_reset(chat_state, img_file):
     if chat_state is not None:
         chat_state.messages = []
     img_file = None
-    if img_list is not None:
-        img_list = []
-    return None, gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your video first', interactive=False),gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_file, img_list
+    return None, gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your video first', interactive=False),gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_file
 
 
 def upload_img(gr_img, gr_video, gr_gif, chat_state, num_frames):
@@ -64,24 +66,24 @@ def upload_img(gr_img, gr_video, gr_gif, chat_state, num_frames):
     conv_type = ''
     if 'tarsier2-7b' in model_path.lower():
         conv_type = 'tarsier2-7b'
-    elif '7b' in model_path.lower():
-        conv_type = 'tarsier-7b'
-    elif '13b' in model_path.lower():
-        conv_type = 'tarsier-13b'
-    elif '34b' in model_path.lower():
-        conv_type = 'tarsier-34b'
+    # elif '7b' in model_path.lower():
+    #     conv_type = 'tarsier-7b'
+    # elif '13b' in model_path.lower():
+    #     conv_type = 'tarsier-13b'
+    # elif '34b' in model_path.lower():
+    #     conv_type = 'tarsier-34b'
     else:
         raise ValueError(f"Unknow model: {model_path}")
     chat_state = deepcopy(conv_templates[conv_type])
 
-    img_list = []
     if gr_img is None and gr_video is None and gr_gif is None:
         return None, None, None, gr.update(interactive=True), gr.update(interactive=True, placeholder='Please upload video/image first!'), chat_state, None, None
     if gr_video or gr_img or gr_gif:
         for img_file in [gr_video, gr_img, gr_gif]:
             if img_file is not None:
                 break
-    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_file, img_list
+    chat_state.messages.append([chat_state.roles[0], {"type": "video", "text": img_file}])
+    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_file
 
 
 def gradio_ask(user_message, chatbot, chat_state):
@@ -91,13 +93,13 @@ def gradio_ask(user_message, chatbot, chat_state):
     chatbot = chatbot + [[user_message, None]]
     return '', chatbot, chat_state
 
-@spaces.GPU(duration=120)
-def gradio_answer(chatbot, chat_state, img_file, img_list, top_p, temperature, n_frames=None):
-    llm_message, chat_state, img_list = chat.answer(conv=chat_state, visual_data_file=img_file, images=img_list, n_frames=n_frames, max_new_tokens=256, num_beams=1, temperature=temperature, top_p=top_p)
+@spaces.GPU(duration=120) # for deploying on huggingface ZeroGPU
+def gradio_answer(chatbot, chat_state, img_file, top_p, temperature, n_frames=None):
+    llm_message, chat_state = chat.answer(conv=chat_state, n_frames=n_frames, max_new_tokens=256, num_beams=1, temperature=temperature, top_p=top_p)
     chatbot[-1][1] = llm_message
     print(chat_state)
     print(f"Answer: {llm_message}")
-    return chatbot, chat_state, img_list
+    return chatbot, chat_state
 
 
 class OpenGVLab(gr.themes.base.Base):
@@ -203,7 +205,6 @@ with gr.Blocks(title="Tarsier",theme=gvlabtheme,css="#chatbot {overflow:auto; he
 
     with gr.Column(visible=True) as input_raws:
         chat_state = gr.State()
-        img_list = gr.State()
         img_file = gr.State()
         chatbot = gr.Chatbot(elem_id="chatbot",label='VideoChat')
         with gr.Row():
@@ -216,19 +217,19 @@ with gr.Blocks(title="Tarsier",theme=gvlabtheme,css="#chatbot {overflow:auto; he
     gr.Examples(examples=[
         [f"examples/test1.mp4", "Describe the video in detail."],
         [f"examples/test2.mp4", "Are they having a pleasant conversation?"],
-        ], inputs=[up_video, text_input])
+    ], inputs=[up_video, text_input])
 
     chat = init_model()
-    upload_button.click(upload_img, [up_image, up_video, up_gif, chat_state, num_frames], [up_image, up_video, up_gif, text_input, upload_button, chat_state, img_file, img_list])
+    upload_button.click(upload_img, [up_image, up_video, up_gif, chat_state, num_frames], [up_image, up_video, up_gif, text_input, upload_button, chat_state, img_file])
 
     text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
-        gradio_answer, [chatbot, chat_state, img_file, img_list, top_p, temperature, num_frames], [chatbot, chat_state, img_list]
+        gradio_answer, [chatbot, chat_state, img_file, top_p, temperature, num_frames], [chatbot, chat_state]
    )
    run.click(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
-        gradio_answer, [chatbot, chat_state, img_file, img_list, top_p, temperature, num_frames], [chatbot, chat_state, img_list]
+        gradio_answer, [chatbot, chat_state, img_file, top_p, temperature, num_frames], [chatbot, chat_state]
    )
    run.click(lambda: "", None, text_input)
-    clear.click(gradio_reset, [chat_state, img_file, img_list], [chatbot, up_image, up_video, up_gif, text_input, upload_button, chat_state, img_file, img_list], queue=False)
+    clear.click(gradio_reset, [chat_state, img_file], [chatbot, up_image, up_video, up_gif, text_input, upload_button, chat_state, img_file], queue=False)
 
 
 demo.launch()
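
Note: with this change the uploaded video travels inside the conversation state (see the new chat_state.messages.append in upload_img), and chat.answer no longer receives visual_data_file/images. A minimal sketch of exercising the updated flow outside Gradio, assuming the repo modules above (tools.conversation, dataset.tarsier_datamodule) and the debug branch of init_model (model=None); paths and sampling values are illustrative:

import yaml
from copy import deepcopy
from dataset.tarsier_datamodule import init_processor
from tools.conversation import Chat, conv_templates

config = yaml.safe_load(open("configs/tarser2_default_config.yaml"))
processor = init_processor("omni-research/Tarsier2-7b-0115", config)  # as in init_model() above
chat = Chat(None, processor, "cpu", True)  # model=None, device='cpu', debug=True (debug branch)

conv = deepcopy(conv_templates["tarsier2-7b"])
# The video reference is attached to the conversation itself, replacing the old img_list state.
conv.messages.append([conv.roles[0], {"type": "video", "text": "examples/test1.mp4"}])
llm_message, conv = chat.answer(conv=conv, n_frames=16, max_new_tokens=256,
                                num_beams=1, temperature=0.0, top_p=1.0)
print(llm_message)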
configs/tarser2_default_config.yaml ADDED
@@ -0,0 +1,14 @@
+ max_n_frames: 256
+ n_frames: 16
+ max_pixels: 460800 # 1280 * 720 // 2
+ min_pixels: 0
+ max_seq_len: 16384
+ is_training: false # affects: (1) frame sampling, which differs between training and testing; (2) the response, which is ignored at test time.
+ print_data_error: true
+ is_training: false
+ do_image_padding: false
+ do_image_crop: false
+ do_image_resize: false
+ video_sampling_strategy: {'video_sampler_version': 'v1', 'force_frames_n_divisible': 1, 'use_multi_images_for_video': true}
+ prompt: ""
+ train_task: sft
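
The file declares is_training twice; with PyYAML's safe_load the later occurrence overwrites the earlier one, so the effective value is still false. app.py reads this file once at startup, as sketched below (the override is purely illustrative):

import yaml

with open("configs/tarser2_default_config.yaml") as f:
    config = yaml.safe_load(f)  # duplicate keys: the last value wins

print(config["video_sampling_strategy"]["video_sampler_version"])  # 'v1'
config["n_frames"] = 8  # hypothetical override before passing the dict to init_processor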
dataset/custom_data_parsers/multi_images_parser.py ADDED
@@ -0,0 +1,199 @@
1
+ from typing import Dict, List
2
+ import random
3
+ import re
4
+ from PIL import Image
5
+
6
+ from .utils import sample_video, read_image
7
+
8
+ class MultiImagesParser:
9
+ def __init__(
10
+ self,
11
+ n_frames=8,
12
+ is_training=True,
13
+ ):
14
+ self.n_frames = n_frames
15
+ self.is_training = is_training
16
+ # fmt: off
17
+ self.data_temp = {
18
+ "text": [
19
+ [{
20
+ "prompt": "Describe the image in short.",
21
+ "response": "A rollerblader rides high in a full pipe while others watch"
22
+ }],
23
+ [{
24
+ "prompt": "Describe the image in short.",
25
+ "response": "A woman in winter clothes is on the sidewalk with a phone."
26
+ }]
27
+ ],
28
+ "image": [
29
+ {
30
+ "image_file": "/mnt/bn/videonaslq/images/flickr30k/images/3371533654.jpg"
31
+ },
32
+ {
33
+ "image_file": "/mnt/bn/videonaslq/images/coco/train2014/COCO_train2014_000000177950.jpg"
34
+ },
35
+ {
36
+ "video_file": "/mnt/bn/llmdatalq/jiangnan/video_generation/webvid_10M_download/20230609/videos/011851_011900/1047443473.mp4",
37
+ "frame_indices": [0, 85, 171, 256, 342, 427, 513, 598]
38
+ }
39
+ ],
40
+ "dataset": "coco",
41
+ "task": "multi_images",
42
+ "image_processing_config": {},
43
+ }
44
+ # fmt: on
45
+
46
+ def check_format(self, data_dict: Dict, image_processing_config: Dict):
47
+ assert data_dict['dataset'] in ['coco', 'sharegpt4v_cap100k', 'sharegpt4v_mix665k', 'webvid', 'movie'], data_dict
48
+
49
+ # Multi-image data presumably contains no coordinate annotations at this point.
50
+ if image_processing_config.get('has_coordinates', False):
51
+ raise ValueError(f'do_crop and has_coordinates cannot be True at the same time in MultiImagesParser!')
52
+
53
+ # Check whether any coordinates can be matched in the text.
54
+ texts = data_dict['text']
55
+ for text in texts:
56
+ match = re.search(r'\[(\d+(\.\d+)?,\s*)+\d+(\.\d+)?\]', text['prompt'] + text['response'])
57
+ if match:
58
+ print(f'[Warning] Suspected coordinate-containing data detected: {data_dict}')
59
+
60
+
61
+ def transform(self, data_dict: Dict, image_processing_config: Dict = None) -> Dict:
62
+ self.check_format(data_dict, image_processing_config)
63
+
64
+ # shuffle
65
+ texts = data_dict['text']
66
+ images = data_dict['image']
67
+ images = self.load_images(images)
68
+ idxs = list(range(len(texts)))
69
+ random.shuffle(idxs)
70
+ texts = [texts[i] for i in idxs]
71
+ images = [images[i] for i in idxs]
72
+
73
+ # sample n_frames
74
+ if isinstance(self.n_frames, int):
75
+ n_frames = random.choice(list(range(1, self.n_frames + 1)))
76
+ else:
77
+ n_frames = random.choice(self.n_frames)
78
+ texts = texts[: n_frames]
79
+ images = images[: n_frames]
80
+
81
+ dataset = data_dict['dataset']
82
+ if dataset in ['coco', 'sharegpt4v_cap100k', 'webvid', 'movie']:
83
+ prompt, response = self.transform_for_caption_task(texts, dataset, images)
84
+ else:
85
+ prompt, response = self.transform_for_qa_task(texts, dataset, images)
86
+
87
+ messages = [
88
+ {
89
+ "role": "user",
90
+ "content": [
91
+ *[{"type": "image", "image": img} for img in images],
92
+ {"type": "text", "text": prompt},
93
+ ]
94
+ },
95
+ {
96
+ "role": "assistant",
97
+ "content": [
98
+ {"type": "text", "text": response}
99
+ ]
100
+ }
101
+ ]
102
+
103
+ return messages
104
+
105
+ def transform_for_caption_task(self, texts, dataset, images):
106
+ idx = random.choice(list(range(len(texts))))
107
+
108
+ if dataset == 'coco':
109
+ if len(texts) == 1:
110
+ prompt = 'Describe the image in short.'
111
+ else:
112
+ prompt = f'Describe the images starting from frame {idx + 1} in short in order.'
113
+ elif dataset == 'sharegpt4v_cap100k':
114
+ if len(texts) == 1:
115
+ prompt = 'Describe the image in detail.'
116
+ else:
117
+ prompt = f'Describe the images starting from frame {idx + 1} in detail in order.'
118
+ else:
119
+ if len(texts) == 1:
120
+ prompt = 'Describe the image.'
121
+ else:
122
+ prompt = f'Describe the images starting from frame {idx + 1} in order.'
123
+ response = ''
124
+ for i, text in enumerate(texts):
125
+ if i < idx:
126
+ continue
127
+ if not isinstance(text, dict):
128
+ text = random.choice(text)
129
+ resp = text['response']
130
+ response += f'{resp}\n'
131
+ return prompt, response
132
+
133
+ def transform_for_qa_task(self, texts, dataset, images):
134
+ prompt, response = '', ''
135
+ for i, text in enumerate(texts):
136
+ if not isinstance(text, dict):
137
+ text = random.choice(text)
138
+ if len(texts) > 1:
139
+ prompt += f'Question for frame {i+1}:\n' + text['prompt'] + '\n'
140
+ response += f'Answer to question of frame {i+1}:\n' + text['response'] + '\n'
141
+ else:
142
+ prompt += text['prompt'] + '\n'
143
+ response += text['response'] + '\n'
144
+ return prompt, response
145
+
146
+
147
+ def load_images(self, image_items: List[Dict]) -> List[Image.Image]:
148
+ """
149
+ image_items: List[Dict]. each item like:
150
+ {"video_file": "path/to/video", "frame_indices": [1]}
151
+ or
152
+ {"image_file": "path/to/image"}
153
+ """
154
+ if image_items is None:
155
+ raise ValueError(f'image_items is None!')
156
+
157
+ if isinstance(image_items, dict):
158
+ image_items = [image_items]
159
+
160
+ images = []
161
+
162
+ for image_item in image_items:
163
+
164
+ if 'video_file' in image_item:
165
+ file_key = 'video_file'
166
+ elif 'image_file' in image_item:
167
+ file_key = 'image_file'
168
+ else:
169
+ raise KeyError(f'video_file or image_file not in {image_item}')
170
+
171
+ file_path = image_item[file_key]
172
+ if file_key == 'video_file':
173
+ frame_indices = image_item.get('frame_indices', None)
174
+ if frame_indices is None:
175
+ raise ValueError(f'read 0 frame: {image_item}')
176
+ if isinstance(frame_indices, int):
177
+ frame_indices = [frame_indices]
178
+ frames = sample_video(file_path, frame_indices = frame_indices)
179
+ images.extend(frames)
180
+ else:
181
+ if isinstance(file_path, str):
182
+ file_path = [file_path]
183
+ images.extend([read_image(f) for f in file_path])
184
+
185
+ return images
186
+
187
+ if __name__ == '__main__':
188
+ # python3 -m xenon_generation.data.custom_data_parsers.multi_images_parser
189
+
190
+ from tqdm import tqdm
191
+ from tools.rw_utils import read_jsonlines
192
+
193
+ lines = read_jsonlines('/mnt/bn/videonaslq/VideoCaption/datasets_1009/sharegpt4v_cap100k/part_36.jsonl')
194
+ lines = lines[:10]
195
+ parser = MultiImagesParser(n_frames=8)
196
+ for i, l in tqdm(enumerate(lines)):
197
+ l_image_processing_config = l.get('image_processing_config', {})
198
+ messages = parser.transform(l, l_image_processing_config)
199
+ print(messages)
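
A usage sketch for MultiImagesParser, following the data_temp layout documented in the class; the image paths are placeholders, and transform() will try to open them with PIL:

from dataset.custom_data_parsers.multi_images_parser import MultiImagesParser

record = {
    "text": [
        [{"prompt": "Describe the image in short.", "response": "A dog runs along the beach."}],
        [{"prompt": "Describe the image in short.", "response": "A child flies a red kite."}],
    ],
    "image": [
        {"image_file": "/path/to/image_1.jpg"},   # placeholder path
        {"image_file": "/path/to/image_2.jpg"},   # placeholder path
    ],
    "dataset": "coco",                            # must be one of the datasets allowed in check_format
    "task": "multi_images",
    "image_processing_config": {},
}

parser = MultiImagesParser(n_frames=2, is_training=False)
messages = parser.transform(record, record["image_processing_config"])
# messages is a chat-style list: a user turn holding the PIL images plus the prompt,
# and an assistant turn holding the caption(s).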
dataset/custom_data_parsers/object_tracking_parser.py ADDED
@@ -0,0 +1,160 @@
1
+ from typing import Dict
2
+ import random
3
+ import re
4
+
5
+ from torchvision import transforms
6
+
7
+ from .utils import sample_video
8
+
9
+ def return_same(x):
10
+ return x
11
+
12
+ def _bbox_transform_for_padding(bbox, frame):
13
+ w1, h1, w2, h2 = bbox
14
+ width, height = frame.size
15
+ if width == height:
16
+ pass
17
+ elif width > height:
18
+ h1 += (width - height) // 2
19
+ h2 += (width - height) // 2
20
+ height = width
21
+ else:
22
+ w1 += (height - width) // 2
23
+ w2 += (height - width) // 2
24
+ width = height
25
+ new_bbox = [w1 / width, h1 / height, w2 / width, h2 / height]
26
+ new_bbox = [round(i, 2) for i in new_bbox]
27
+ return new_bbox
28
+
29
+ def _bbox_transform_for_resize(bbox, frame):
30
+ w1, h1, w2, h2 = bbox
31
+ width, height = frame.size
32
+ new_bbox = [w1 / width, h1 / height, w2 / width, h2 / height]
33
+ new_bbox = [round(i, 2) for i in new_bbox]
34
+ return new_bbox
35
+
36
+ class InAndOutCropAndResize(object):
37
+ """Crop and resize for in_and_out boxes data according to yuchen
38
+ Args:
39
+ size: tuple of (width, height)
40
+ """
41
+
42
+ def __init__(self, size):
43
+ self.size = size
44
+
45
+ def __call__(self, img):
46
+ """
47
+ Args:
48
+ img (PIL Image): PIL Image
49
+ Returns:
50
+ PIL Image: PIL image.
51
+ """
52
+ w = img.width
53
+ h = img.height
54
+ x0 = int(w * 0.5 - h * 0.375)
55
+ y0 = int(h * 0.125)
56
+ x1 = int(w * 0.5 + h * 0.375)
57
+ y1 = int(h * 0.875)
58
+ img = img.crop((x0, y0, x1, y1)).resize(self.size)
59
+ return img
60
+
61
+
62
+ class ObjectTrackingParser:
63
+ def __init__(
64
+ self,
65
+ n_frames = 8,
66
+ max_objects = 3,
67
+ is_training=True,
68
+ ):
69
+ self.n_frames = n_frames
70
+ self.max_objects = max_objects
71
+ self.is_training = is_training
72
+ self.img_transform = self.get_img_transform()
73
+ # fmt: off
74
+ self.data_temp = {
75
+ "video_file": "/mnt/bn/llmdatalq/jiaxin/hdvila/20230926/saved/saved_video_clips/0076/lOjn__YCec4.624.1104.mp4",
76
+ "frame_indices": [154, 157, 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193, 196, 199, 202],
77
+ "objects": {
78
+ "0": {
79
+ "phrase": "person",
80
+ "all_frame_bounding_boxes": [[2, 0, 255, 250], [17, 0, 255, 251], [35, 0, 255, 253], [44, 0, 255, 255], [52, 0, 255, 255], [54, 0, 255, 255], [63, 0, 255, 255], [60, 0, 255, 255], [54, 0, 253, 255], [43, 0, 250, 255], [36, 1, 249, 255], [36, 0, 252, 254], [41, 0, 252, 254], [61, 0, 255, 253], [68, 4, 255, 255], [74, 8, 255, 255], [91, 3, 255, 255]]
81
+ }
82
+ },
83
+ "task": "object_tracking",
84
+ "dataset": "hdvila"
85
+ }
86
+ # fmt: on
87
+
88
+ def check_format(self, data_dict: Dict, image_processing_config: Dict):
89
+ # Box-tracking data does not support do_crop!!!
90
+ if image_processing_config.get('do_crop', False):
91
+ raise ValueError(f'do_crop is not supported in ObjectTrackingParser!')
92
+
93
+ def transform(self, data_dict: Dict, image_processing_config: Dict = None) -> Dict:
94
+ self.check_format(data_dict, image_processing_config)
95
+
96
+ bbox_transform = _bbox_transform_for_padding if image_processing_config['do_padding'] else _bbox_transform_for_resize
97
+
98
+ # sample n_frames
99
+ if isinstance(self.n_frames, int):
100
+ n_frames = self.n_frames
101
+ else:
102
+ n_frames = random.choice(self.n_frames)
103
+ total_frames = list(range(len(data_dict['frame_indices'])))
104
+ idxs = random.sample(total_frames, min(n_frames, len(total_frames)))
105
+ idxs.sort()
106
+
107
+ frame_indices = [data_dict['frame_indices'][i] for i in idxs]
108
+ frames = sample_video(data_dict['video_file'], frame_indices=frame_indices)
109
+ img_transform = self.img_transform[data_dict['dataset']]
110
+ frames = [img_transform(f) for f in frames]
111
+
112
+ objects = []
113
+ for _, o in data_dict['objects'].items():
114
+ if o is None:
115
+ continue
116
+ all_frame_bounding_boxes = [o['all_frame_bounding_boxes'][i] for i in idxs]
117
+ all_frame_bounding_boxes_t = []
118
+ for bbox, frame in zip(all_frame_bounding_boxes, frames):
119
+ all_frame_bounding_boxes_t.append(bbox_transform(bbox, frame))
120
+ objects.append(all_frame_bounding_boxes_t)
121
+ if len(objects) >= self.max_objects:
122
+ break
123
+
124
+ prompt = "Given the bounding box coordinates of these objects in the first frame, output the bounding box coordinates in the following frames.\n{}"
125
+ response = ''
126
+
127
+ object_info = ''
128
+ for i, o in enumerate(objects):
129
+ object_info += f'object {i+1}: {o[0]}\n'
130
+ response += f'object {i+1}: {o[1:]}\n'
131
+ response = response.strip()
132
+ prompt = prompt.format(object_info)
133
+
134
+ messages = [
135
+ {
136
+ "role": "user",
137
+ "content": [
138
+ {"type": "video", "video": frames},
139
+ {"type": "text", "text": prompt}
140
+ ]
141
+ },
142
+ {
143
+ "role": "assistant",
144
+ "content": [
145
+ {"type": "text", "text": response}
146
+ ]
147
+ }
148
+ ]
149
+
150
+ return messages
151
+
152
+ def get_img_transform(self):
153
+ return {
154
+ 'webvid': return_same,
155
+ 'hdvila': transforms.Compose([
156
+ transforms.Resize(size=256),
157
+ transforms.CenterCrop(size=(256, 256))
158
+ ]),
159
+ 'hdvila_in_and_out_boxes': InAndOutCropAndResize(size=(256, 256))
160
+ }
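
A quick, self-contained check of the two bounding-box normalizers defined above (pixel boxes mapped to relative coordinates, either against the raw frame or as if the frame were padded to a square). The synthetic frame size is arbitrary, and importing the module assumes its own dependencies (torchvision, decord) are installed:

from PIL import Image
from dataset.custom_data_parsers.object_tracking_parser import (
    _bbox_transform_for_padding, _bbox_transform_for_resize)

frame = Image.new("RGB", (1280, 720))
bbox = [100, 50, 600, 700]                       # x1, y1, x2, y2 in pixels

print(_bbox_transform_for_resize(bbox, frame))   # [0.08, 0.07, 0.47, 0.97]
print(_bbox_transform_for_padding(bbox, frame))  # [0.08, 0.26, 0.47, 0.77]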
dataset/custom_data_parsers/standard_vision_parser.py ADDED
@@ -0,0 +1,255 @@
1
+ from typing import Dict, List
2
+ from PIL import Image
3
+ import random
4
+
5
+ from .utils import sample_video, read_image, adjust_bbox, filter_ocr_polygon
6
+
7
+
8
+ class VisionParser:
9
+ def __init__(
10
+ self,
11
+ n_frames=8,
12
+ max_n_frames=256,
13
+ is_training=True,
14
+ video_sampling_strategy={},
15
+ ):
16
+ self.n_frames = n_frames
17
+ self.max_n_frames = max_n_frames
18
+ self.is_training = is_training
19
+ self.video_sampling_strategy = video_sampling_strategy
20
+
21
+ # fmt: off
22
+ self.data_temp = {
23
+ "messages": [
24
+ {
25
+ "role": "user",
26
+ "content": [
27
+ {"type": "text", "text": "Describe the image and the video."},
28
+ # Supported image formats:
29
+ {"type": "image", "image": {"image_file": "/path/to/image"}},
30
+ {"type": "image", "image": {"video_file": "/path/to/video", "frame_indices": 0}},
31
+ # Supported video formats:
32
+ {"type": "video", "video": {"video_file": "/path/to/video"}},
33
+ {"type": "video", "video": {"video_file": "/path/to/video", "frame_indices": [0, 1, 2]}},
34
+ {"type": "video", "video": {"video_file": "/path/to/video", "start_frame": 0, "end_frame": 100}},
35
+ {"type": "video", "video": {"video_file": "/path/to/video", "time_indices": [0, 1, 2]}},
36
+ {"type": "video", "video": {"video_file": "/path/to/video", "start_time": 0, "end_time": 100}},
37
+ {"type": "video", "video": {"image_file": ["/path/to/image"]}, "frame_indices": [0, 1, 2]},
38
+ ]
39
+ },
40
+ {
41
+ "role": "assistant",
42
+ "content": [
43
+ {"type": "text","text": "xxx"}
44
+ ]
45
+ }
46
+ ],
47
+ "dataset": "LSMDC",
48
+ "task": "video/caption"
49
+ }
50
+ # fmt: on
51
+
52
+ def check_format(self, data_dict: Dict, image_processing_config: Dict):
53
+ if image_processing_config.get('do_crop', False) and image_processing_config.get('has_coordinates', False):
54
+ raise ValueError(f'do_crop and has_coordinates cannot be True at the same time!')
55
+
56
+ """
57
+ 1. Replace each image/video in messages with the corresponding PIL.Image / List[PIL.Image].
58
+ 2. Special handling of text: adjust bounding boxes; filter out OCR regions whose area is too small.
59
+ """
60
+ def transform(self, data_dict: Dict, image_processing_config: Dict = None) -> Dict:
61
+ self.check_format(data_dict, image_processing_config)
62
+
63
+ self.set_n_frames(data_dict)
64
+
65
+ first_image = None # ugly! Only image tasks need box adjustment / filtering of tiny OCR regions.
66
+
67
+ for msg in data_dict['messages']:
68
+ if isinstance(msg['content'], dict):
69
+ msg['content'] = [msg['content']]
70
+ for content in msg['content']:
71
+
72
+ if content['type'] == 'image':
73
+ content['image'] = self.load_image_item(content['image'])
74
+ if first_image is None:
75
+ first_image = content['image']
76
+ elif content['type'] == 'video':
77
+ video = self.load_video_item(content['video'])
78
+ content['video'] = video.pop('frames')
79
+ if video:
80
+ data_dict['extra_info']['frame_disturb_info'] = video.pop('video_info', {})
81
+ elif content['type'] == 'text':
82
+ pass
83
+ else:
84
+ raise ValueError(f"content['type']={content['type']} MUST be one of ['image', 'video', 'text']")
85
+ for msg in data_dict['messages']:
86
+ for content in msg['content']:
87
+ if content['type'] == 'text':
88
+ self.postprocess_text(content, data_dict, image_processing_config, first_image)
89
+
90
+ return data_dict['messages']
91
+
92
+ # set n_frames for each vision item.
93
+ def set_n_frames(self, data_dict):
94
+
95
+ if isinstance(self.n_frames, int):
96
+ n_frames = self.n_frames
97
+ else:
98
+ n_frames = random.choice(self.n_frames)
99
+
100
+ assert n_frames <= self.max_n_frames
101
+
102
+ curr_n_frames = 0
103
+ has_dynamic = False
104
+ for msg in data_dict['messages']:
105
+ if isinstance(msg['content'], dict):
106
+ msg['content'] = [msg['content']]
107
+
108
+ for content in msg['content']:
109
+
110
+ if content['type'] == 'image':
111
+ curr_n_frames += 1
112
+ elif content['type'] == 'video':
113
+ if 'frame_indices' in content['video']:
114
+ curr_n_frames += len(content['video']['frame_indices'])
115
+ content['video']['n_frames'] = len(content['video']['frame_indices'])
116
+ elif 'time_indices' in content['video']:
117
+ curr_n_frames += len(content['video']['time_indices'])
118
+ content['video']['n_frames'] = len(content['video']['time_indices'])
119
+ elif 'min_n_frames' in content['video']:
120
+ content['video']['min_n_frames'] = int(content['video']['min_n_frames'])
121
+ curr_n_frames += content['video']['min_n_frames']
122
+ content['video']['n_frames'] = content['video']['min_n_frames']
123
+ has_dynamic = True
124
+ elif 'fps' in content['video']:
125
+ content['video']['n_frames'] = self.max_n_frames
126
+ curr_n_frames += self.max_n_frames
127
+ has_dynamic = True
128
+ else:
129
+ content['video']['n_frames'] = 0
130
+ has_dynamic = True
131
+
132
+ while curr_n_frames < n_frames and has_dynamic:
133
+ for msg in data_dict['messages']:
134
+ for content in msg['content']:
135
+ if content['type'] == 'video':
136
+ if 'frame_indices' in content['video']:
137
+ pass
138
+ elif 'time_indices' in content['video']:
139
+ pass
140
+ else:
141
+ if curr_n_frames < n_frames:
142
+ content['video']['n_frames'] += 1
143
+ curr_n_frames += 1
144
+
145
+ while curr_n_frames > self.max_n_frames and has_dynamic:
146
+ for msg in data_dict['messages']:
147
+ for content in msg['content']:
148
+ if content['type'] == 'video':
149
+ if 'frame_indices' in content['video']:
150
+ pass
151
+ elif 'time_indices' in content['video']:
152
+ pass
153
+ else:
154
+ if curr_n_frames > self.max_n_frames:
155
+ content['video']['n_frames'] -= 1
156
+ curr_n_frames -= 1
157
+
158
+
159
+ for msg in data_dict['messages']:
160
+ for content in msg['content']:
161
+ if content['type'] == 'video':
162
+ if 'frame_indices' in content['video']:
163
+ pass
164
+ elif 'time_indices' in content['video']:
165
+ pass
166
+ else:
167
+ n = self.video_sampling_strategy.get('force_frames_n_divisible', 1)
168
+ if n > 1 and content['video']['n_frames'] % n != 0:
169
+ content['video']['n_frames'] += n - content['video']['n_frames'] % n
170
+
171
+ def load_image_item(self, image_item) -> Image.Image:
172
+ """
173
+ image_item:
174
+ {"image_file": {"lq": "/path/to/image"}}
175
+ {"video_file": {"lq": "/path/to/video"}, "frame_indices": 0}
176
+ """
177
+
178
+ # check format
179
+ if ("image_file" not in image_item) and ("video_file" not in image_item):
180
+ raise KeyError(f"Key 'image_file' or 'video_file' not found in image_item")
181
+ if 'image_file' in image_item:
182
+ if not isinstance(image_item['image_file'], str):
183
+ raise ValueError(f"{image_item['image_file']} is not a str!")
184
+ if 'video_file' in image_item:
185
+ if not isinstance(image_item['frame_indices'], int):
186
+ raise ValueError(f"{image_item['frame_indices']} is not a int!")
187
+
188
+ if 'image_file' in image_item:
189
+ image = read_image(image_item['image_file'])
190
+ else:
191
+ frame_indices = [image_item['frame_indices']]
192
+ image = sample_video(image_item['video_file'], frame_indices = frame_indices)[0]
193
+
194
+ return image
195
+
196
+ def load_video_item(self, video_item) -> List[Image.Image]:
197
+ """
198
+ video_item:
199
+ {"video_file": {"lq": "/path/to/video"}, "n_frames": 8}
200
+ {"video_file": {"lq": "/path/to/video"}, "frame_indices": [0, 1, 2], "n_frames": 3}
201
+ {"video_file": {"lq": "/path/to/video"}, "start_frame": 0, "end_frame": 100, "n_frames": 8}
202
+ {"video_file": {"lq": "/path/to/video"}, "time_indices": [0, 1, 2], "n_frames": 3}
203
+ {"video_file": {"lq": "/path/to/video"}, "start_time": 0, "end_time": 100, "n_frames": 8}
204
+ {"image_file": {"lq": ["/path/to/image"]}, "frame_indices": [0, 1, 2], "n_frames": 3}
205
+ """
206
+
207
+ # check format
208
+ if ("image_file" not in video_item) and ("video_file" not in video_item):
209
+ raise KeyError(f"Key 'image_file' or 'video_file' not found in video_item")
210
+
211
+ video_path = video_item.get('video_file', video_item.get('image_file'))
212
+ n_frames = video_item.get('n_frames', None)
213
+ frame_indices = video_item.get('frame_indices', None)
214
+ start_frame = video_item.get('start_frame', None)
215
+ end_frame = video_item.get('end_frame', None)
216
+ time_indices = video_item.get('time_indices', None)
217
+ start_time = video_item.get('start_time', None)
218
+ end_time = video_item.get('end_time', None)
219
+ mask_boxes = video_item.get('mask_boxes', None)
220
+ fps = video_item.get('fps', None)
221
+
222
+ frames, frame_indices = sample_video(
223
+ video_path=video_path,
224
+ frame_indices=frame_indices,
225
+ start_frame=start_frame,
226
+ end_frame=end_frame,
227
+ n_frames=n_frames,
228
+ time_indices=time_indices,
229
+ start_time=start_time,
230
+ end_time=end_time,
231
+ sampling_fps=fps,
232
+ mask_boxes=mask_boxes,
233
+ is_training=self.is_training,
234
+ video_sampling_strategy=self.video_sampling_strategy,
235
+ return_frame_ids=True,
236
+ )
237
+
238
+ if self.video_sampling_strategy.get('use_multi_images_for_video', False):
239
+ new_frames = []
240
+ for f in frames:
241
+ new_frames.extend([f, f])
242
+ frames = new_frames
243
+
244
+ if isinstance(frame_indices, dict):
245
+ return {
246
+ 'frames': frames,
247
+ 'video_info': frame_indices
248
+ }
249
+ return {'frames': frames}
250
+
251
+ def postprocess_text(self, content, data_dict, image_processing_config, first_image):
252
+ if image_processing_config.get('has_coordinates') and image_processing_config.get('do_padding'):
253
+ content['text'] = adjust_bbox(content['text'], frame=first_image)
254
+ if data_dict.get('task') == 'image/OCR' and image_processing_config.get('has_coordinates'):
255
+ content['text'] = filter_ocr_polygon(content['text'])
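
A usage sketch for VisionParser with a single video message, mirroring the data_temp layout above; the video path is a placeholder and sample_video() will try to decode it:

from dataset.custom_data_parsers.standard_vision_parser import VisionParser

record = {
    "messages": [
        {"role": "user", "content": [
            {"type": "video", "video": {"video_file": "/path/to/video.mp4"}},  # placeholder path
            {"type": "text", "text": "Describe the video in detail."},
        ]},
        {"role": "assistant", "content": [{"type": "text", "text": ""}]},
    ],
    "dataset": "demo",
    "task": "video/caption",
    "extra_info": {},    # only consulted when do_frame_disturb is enabled
}

parser = VisionParser(
    n_frames=16, max_n_frames=256, is_training=False,
    video_sampling_strategy={"video_sampler_version": "v1", "force_frames_n_divisible": 1},
)
messages = parser.transform(record, image_processing_config={})
# After transform, the "video" entry holds a list of PIL.Image frames instead of the file reference.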
dataset/custom_data_parsers/utils.py ADDED
@@ -0,0 +1,452 @@
1
+ from typing import List, Dict, Union
2
+ import os
3
+ import random
4
+ import tempfile
5
+ from PIL import Image, ImageSequence
6
+ import base64
7
+ import io
8
+ import re
9
+ import uuid
10
+ import json
11
+ import numpy as np
12
+ import pyarrow.fs as pf
13
+ import func_timeout
14
+ from func_timeout import func_set_timeout
15
+ import math
16
+
17
+ # fmt: on
18
+ import decord
19
+ # fmt: off
20
+
21
+
22
+ def denorm_box(points, height, width):
23
+ new_points = []
24
+ for p in points:
25
+ new_points.append((round(p[0] * width), round(p[1] * height)))
26
+ return new_points
27
+
28
+ def process_image_for_tiktok(frames: List[Image.Image], mask_boxes):
29
+ mask_boxes = mask_boxes[:len(frames)]
30
+ frames = [np.array(f) for f in frames]
31
+ # assert len(mask_boxes) == len(frames)
32
+ height, width = frames[0].shape[:2]
33
+
34
+ new_frames = []
35
+ for boxes, frame in zip(mask_boxes, frames):
36
+ left, top, right, bottom = 0, 0, width, height
37
+ for box in boxes:
38
+ pts = np.array(denorm_box(box, height, width), np.int32)
39
+ upper_bound = max([p[1] for p in pts]) + 30
40
+ if bottom > upper_bound:
41
+ bottom = upper_bound
42
+ frame[pts[0][1]: pts[2][1], pts[0][0]: pts[1][0]] = 0
43
+
44
+ new_frames.append(Image.fromarray(frame[top: bottom, left: right]))
45
+ return new_frames
46
+
47
+ # Split the video into n_frames chunks. During training, sample one random frame per chunk; at test time, take the middle frame of each chunk.
48
+ def _sample_frame_indices_v2(
49
+ total_frames: int,
50
+ n_frames: int,
51
+ is_training=False,
52
+ video_sampling_strategy = {},
53
+ ):
54
+ total_frames_idxs = list(range(total_frames))
55
+ if total_frames <= n_frames:
56
+ return total_frames_idxs
57
+ k, m = divmod(total_frames, n_frames)
58
+ frame_splits = [total_frames_idxs[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in list(range(n_frames))]
59
+ if is_training:
60
+ sample_ids = [random.choice(i) for i in frame_splits]
61
+ else:
62
+ sample_ids = [i[(len(i)+1)//2-1] for i in frame_splits]
63
+ return sample_ids
64
+
65
+ # Sample frames uniformly, always including the first and last frame.
66
+ def _sample_frame_indices_v1(total_frames: int, n_frames: int, is_training=False, video_sampling_strategy = {}):
67
+ if n_frames == 1:
68
+ return [0] # sample first frame in default
69
+ if total_frames <= n_frames:
70
+ return list(range(total_frames))
71
+ sample_ids = [round(i * (total_frames - 1) / (n_frames - 1)) for i in range(n_frames)]
72
+ return sample_ids
73
+
74
+ def conduct_disturb_frame(frame_indices):
75
+ disturb_type = random.choice(['exchange', 'crop', 'reverse', 'discard'])
76
+ n_frames = len(frame_indices)
77
+ frame_indices_new = []
78
+ if disturb_type == 'exchange':
79
+ # Split evenly into 4 segments and randomly swap two of them.
80
+ seg_len = math.ceil(n_frames / 4)
81
+ seg_idxs = list(range(0, n_frames, seg_len))
82
+ target_idxs = random.sample(range(0, 4), 2)
83
+ seg_idxs[target_idxs[0]], seg_idxs[target_idxs[1]] = seg_idxs[target_idxs[1]], seg_idxs[target_idxs[0]]
84
+ for idx in seg_idxs:
85
+ frame_indices_new += frame_indices[idx: idx+seg_len]
86
+ elif disturb_type == 'crop':
87
+ # 随机截取出3/4时长,再采均匀n_frames帧
88
+ crop_len = math.ceil(n_frames / 4)
89
+ idx_s = random.choice(range(0, crop_len+1))
90
+ idx_e = n_frames - 1 - (crop_len - idx_s)
91
+ frame_indices_new = np.linspace(frame_indices[idx_s], frame_indices[idx_e], n_frames, dtype=int).tolist()
92
+ elif disturb_type == 'reverse':
93
+ # 随机选择长度为[1/2, 1]时长的片段进行顺序颠倒
94
+ reverse_len = math.ceil(random.uniform(0.5,1) * n_frames)
95
+ idx_s = random.choice(range(0, n_frames-reverse_len+1))
96
+ idx_e = idx_s + reverse_len - 1
97
+ frame_indices_new = frame_indices[:idx_s] + list(reversed(frame_indices[idx_s: idx_e+1])) + frame_indices[idx_e+1:]
98
+ elif disturb_type == 'discard':
99
+ # 随机丢弃一半帧
100
+ frame_indices_new = random.sample(frame_indices, n_frames//2)
101
+ frame_indices_new.sort()
102
+ return disturb_type, frame_indices_new
103
+
104
+ @func_set_timeout(60)
105
+ def _download_file(path):
106
+ if path.startswith("hdfs"):
107
+ local_path = os.path.join(tempfile.gettempdir(), f'{uuid.uuid4()}_' + os.path.basename(path))
108
+
109
+ fs = pf.HadoopFileSystem.from_uri(uri="hdfs://harunava")
110
+ hdfs_file = fs.open_input_file(path)
111
+ file_size = hdfs_file.size()
112
+ if file_size > 1024 * 1024 * 1024: # 1G
113
+ os.system(f"hadoop fs -get --ct 8 -c 512 '{path}' '{local_path}' > /dev/null 2>&1")
114
+ elif file_size > 1024 * 1024 * 100: # 100M
115
+ os.system(f"hadoop fs -get '{path}' '{local_path}' > /dev/null 2>&1")
116
+ else:
117
+ local_fs = pf.LocalFileSystem()
118
+ with local_fs.open_output_stream(local_path) as local_file:
119
+ while True:
120
+ chunk = hdfs_file.read(1024 * 1024 * 100) # Read in 100 MB chunks; adjust as needed
121
+ if not chunk:
122
+ break
123
+ local_file.write(chunk)
124
+ else:
125
+ local_path = path
126
+
127
+ if not os.path.exists(local_path):
128
+ raise FileNotFoundError(f'{local_path}')
129
+
130
+ return local_path
131
+
132
+ def download_file(path):
133
+ try:
134
+ # with timer(f'Download {path}'):
135
+ return _download_file(path)
136
+ except func_timeout.exceptions.FunctionTimedOut as e:
137
+ raise ValueError(e)
138
+
139
+ class VideoReader:
140
+ def __init__(self, path: str) -> None:
141
+ self.path = path
142
+ self.local_path = self.preprocess()
143
+ self.vr = decord.VideoReader(self.local_path, num_threads=1, ctx=decord.cpu(0), fault_tol=1)
144
+ self.vr.seek(0)
145
+ self._length = len(self.vr)
146
+ self._fps = self.vr.get_avg_fps()
147
+
148
+ @property
149
+ def length(self):
150
+ return self._length
151
+
152
+ @property
153
+ def fps(self):
154
+ return self._fps
155
+
156
+ def sample(self, frame_indices) -> List[Image.Image]:
157
+ frames = self.vr.get_batch(frame_indices).asnumpy()
158
+ frames = [Image.fromarray(f).convert('RGB') for f in frames]
159
+ return frames
160
+
161
+ def preprocess(self):
162
+ return download_file(self.path)
163
+
164
+ def postprocess(self):
165
+ if self.path.startswith("hdfs"):
166
+ os.remove(self.local_path)
167
+
168
+ class ImageSeqReader:
169
+ def __init__(self, path: List[str]) -> None:
170
+ self.path = path
171
+ self.local_path = self.preprocess()
172
+ self._length = len(self.local_path)
173
+ self._fps = None
174
+
175
+ @property
176
+ def length(self):
177
+ return self._length
178
+
179
+ @property
180
+ def fps(self):
181
+ return self._fps
182
+
183
+ def sample(self, frame_indices):
184
+ return [read_image(self.local_path[i]) for i in frame_indices]
185
+
186
+ def preprocess(self):
187
+ local_paths = []
188
+ for p in self.path:
189
+ local_paths.append(p)
190
+ return local_paths
191
+
192
+ def postprocess(self):
193
+ pass
194
+
195
+ class GIFReader:
196
+ def __init__(self, path: str) -> None:
197
+ self.path = path
198
+ self.local_path = self.preprocess()
199
+ self.gif = Image.open(self.local_path)
200
+ self._length = self.gif.n_frames
201
+ duration = self.gif.info.get('duration', 0) / 1000 # convert to seconds
202
+ if duration > 0:
203
+ self._fps = 1 / duration
204
+ else:
205
+ self._fps = None
206
+
207
+ @property
208
+ def length(self):
209
+ return self._length
210
+
211
+ @property
212
+ def fps(self):
213
+ return self._fps
214
+
215
+ def sample(self, frame_indices):
216
+ frames = []
217
+ i = 0
218
+ for frame in ImageSequence.Iterator(self.gif):
219
+ if i in frame_indices:
220
+ frames.append(frame.convert('RGB'))
221
+ i += 1
222
+ return frames
223
+
224
+ def preprocess(self):
225
+ return download_file(self.path)
226
+
227
+ def postprocess(self):
228
+ if self.path.startswith("hdfs"):
229
+ os.remove(self.local_path)
230
+
231
+ def check_frame_indices(frame_indices, total_frames, video_path):
232
+ if frame_indices[-1] == total_frames:
233
+ frame_indices[-1] = total_frames - 1
234
+
235
+ valid_frame_indices = [i for i in frame_indices if i >= 0 and i < total_frames]
236
+
237
+ if len(valid_frame_indices) != len(frame_indices):
238
+ print(f'[Error] frame out of index. video_path={video_path}, frame_indices={frame_indices}, total_frames={total_frames}', flush=True)
239
+
240
+ return valid_frame_indices
241
+
242
+
243
+ def sample_video(
244
+ video_path: Union[str, List[str]],
245
+ frame_indices: List[int] = None,
246
+ start_frame:int=None,
247
+ end_frame:int=None,
248
+ n_frames:int = None,
249
+ time_indices: List[float] = None,
250
+ start_time:int=None,
251
+ end_time:int=None,
252
+ sampling_fps:float=None,
253
+ mask_boxes=None,
254
+ is_training:bool=False,
255
+ video_sampling_strategy={'video_sampler_version': 'v1'},
256
+ return_frame_ids: bool=False,
257
+ ) -> List[Image.Image]:
258
+
259
+ do_frame_disturb = video_sampling_strategy.get('do_frame_disturb', False)
260
+
261
+ if isinstance(video_path, str):
262
+ if video_path.endswith('.gif'):
263
+ reader = GIFReader(video_path)
264
+ else:
265
+ reader = VideoReader(video_path)
266
+ else:
267
+ reader = ImageSeqReader(video_path)
268
+
269
+ total_frames = reader.length
270
+ fps = reader.fps
271
+
272
+ if sampling_fps is not None:
273
+ frame_indices = list(range(0, total_frames, round(fps / sampling_fps)))
274
+ if len(frame_indices) > n_frames:
275
+ frame_indices = None
276
+
277
+ if time_indices is not None:
278
+ frame_indices = [round(float(i) * fps) for i in time_indices]
279
+
280
+ if start_time is not None and end_time is not None:
281
+ start_frame = round(start_time * fps)
282
+ end_frame = round(end_time * fps)
283
+
284
+ if frame_indices is None:
285
+ start_frame = 0 if start_frame is None else round(start_frame)
286
+ end_frame = total_frames - 1 if end_frame is None else round(end_frame)
287
+
288
+ if end_frame == total_frames:
289
+ end_frame -= 1
290
+
291
+ if video_sampling_strategy['video_sampler_version'] == 'v1':
292
+ # Sample frames uniformly, always including the first and last frame.
293
+ frame_indices = _sample_frame_indices_v1(end_frame - start_frame + 1, n_frames, is_training, video_sampling_strategy)
294
+ elif video_sampling_strategy['video_sampler_version'] == 'v2':
295
+ frame_indices = _sample_frame_indices_v2(end_frame - start_frame + 1, n_frames, is_training, video_sampling_strategy)
296
+ else:
297
+ raise ValueError(f"video_sampler_version={video_sampling_strategy['video_sampler_version']} must be 'v1' or 'v2'")
298
+ frame_indices = [i + start_frame for i in frame_indices]
299
+
300
+ frame_indices = check_frame_indices(frame_indices, total_frames, video_path)
301
+
302
+ if do_frame_disturb:
303
+ frame_disturb_type, frame_indices_new = conduct_disturb_frame(frame_indices)
304
+ frame_indices_raw = frame_indices[:]
305
+ frame_indices = frame_indices_new
306
+
307
+ frames = reader.sample(frame_indices)
308
+ if mask_boxes is not None:
309
+ frames = process_image_for_tiktok(frames, mask_boxes)
310
+
311
+ n = video_sampling_strategy.get('force_frames_n_divisible', 1)
312
+ if n > 1 and len(frames) % n != 0:
313
+ new_n = n - len(frames) % n
314
+ frames.extend([Image.new(mode='RGB', size=frames[-1].size) for _ in range(new_n)])
315
+
316
+ reader.postprocess()
317
+
318
+ if do_frame_disturb:
319
+ return frames, {"frame_indices": frame_indices, "disturb_type": frame_disturb_type, "frame_indices_raw": frame_indices_raw}
320
+ if return_frame_ids:
321
+ return frames, frame_indices
322
+ return frames
323
+
324
+
325
+
326
+ def load_image_from_base64String(img_path):
327
+ img = base64.b64decode(open(img_path, "rb").read())
328
+ buf = io.BytesIO(img)
329
+ img = Image.open(buf)
330
+ return img
331
+
332
+ def read_image(image_path):
333
+ local_file = download_file(image_path)
334
+
335
+ if local_file.endswith('.dat'):
336
+ image = load_image_from_base64String(local_file)
337
+ else:
338
+ image = Image.open(local_file).convert('RGB')
339
+ if image_path.startswith("hdfs"):
340
+ os.remove(local_file)
341
+ return image
342
+
343
+
344
+ def adjust_bbox(text, frame):
345
+
346
+ width, height = frame.size
347
+ new_text = []
348
+ start_idx = 0
349
+ for match in re.finditer(r'\[(\d+(\.\d+)?,\s*)+\d+(\.\d+)?\]', text):
350
+ coordinate_matches = re.findall(r"([0-9.]+)", match.group(0))
351
+ xys = [float(coord) for coord in coordinate_matches]
352
+
353
+ new_xys = []
354
+ for i in range(len(xys)):
355
+ p = xys[i]
356
+
357
+ if width == height:
358
+ pass
359
+
360
+ if width > height and i % 2 != 0:
361
+ p = xys[i] * height
362
+ p += (width - height) // 2
363
+ p = round(p / width, 2)
364
+
365
+ if height > width and i % 2 == 0:
366
+ p = xys[i] * width
367
+ p += (height - width) // 2
368
+ p = round(p / height, 2)
369
+
370
+ new_xys.append(p)
371
+
372
+ new_text.append(text[start_idx: match.span()[0]])
373
+ new_text.append(str(new_xys))
374
+ start_idx = match.span()[1]
375
+ new_text.append(text[start_idx: ])
376
+ text = ''.join(new_text)
377
+
378
+
379
+ return text
380
+
381
+ def bbox_area(vertices, convert_format = True):
382
+ if convert_format:
383
+ vertices = list(zip(vertices[::2], vertices[1::2]))
384
+ x0, y0 = vertices[0]
385
+ x1, y1 = vertices[1]
386
+ return abs((x1 - x0) * (y1 - y0))
387
+
388
+ def polygon_area(vertices, convert_format = True):
389
+ if convert_format:
390
+ vertices = list(zip(vertices[::2], vertices[1::2]))
391
+ n = len(vertices) # 多边形顶点的数量
392
+ if n == 2:
393
+ return bbox_area(vertices, convert_format=False)
394
+ area = 0
395
+ for i in range(n):
396
+ x1, y1 = vertices[i]
397
+ x2, y2 = vertices[(i + 1) % n]
398
+ area += x1 * y2 - x2 * y1
399
+ return abs(area) / 2
400
+
401
+ def get_text_len(text_line):
402
+ l = 0
403
+ for c in text_line:
404
+ if '\u4e00' <= c <= '\u9fff':
405
+ l += 1
406
+ else:
407
+ l += 0.5
408
+ return l
409
+
410
+ def filter_ocr_polygon(response, area_threshold=0.0005):
411
+ try:
412
+ resp = json.loads(response)
413
+ except:
414
+ return response
415
+ new_resp = []
416
+ for coords, text_line in resp:
417
+ area = polygon_area(coords, convert_format=True)
418
+ text_len = get_text_len(text_line)
419
+ if text_len == 0:
420
+ continue
421
+ if area / text_len < area_threshold:
422
+ continue
423
+ new_resp.append([coords, text_line])
424
+ new_resp = json.dumps(new_resp, ensure_ascii=False)
425
+
426
+ return new_resp
427
+
428
+ def put_pred_to_data_dict(prediction, data_dict):
429
+ msg = data_dict['messages'][-1]
430
+ if msg['role'] == 'assistant':
431
+ msg['content'][-1]['text'] = prediction
432
+ else:
433
+ data_dict['messages'].append({
434
+ "role": "assistant",
435
+ "content": [{"type": "text", "text": prediction}]
436
+ })
437
+
438
+ def get_prompt_from_data_dict(data_dict):
439
+ prompt = ""
440
+ for msg in data_dict['messages']:
441
+ role = msg['role']
442
+ assert role in {'system', 'user', 'assistant'}
443
+ for content in msg['content']:
444
+ if content['type'] == 'text':
445
+ if content['text']:
446
+ prompt += f"[{role}]: {content['text']}"
447
+ elif content['type'] == 'image':
448
+ prompt += f"[{role}]: <image>"
449
+ elif content['type'] == 'video':
450
+ prompt += f"[{role}]: <video>"
451
+ prompt += '\n'
452
+ return prompt
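
The two frame samplers are pure functions and easy to sanity-check; note that importing this module pulls in decord, pyarrow, and func_timeout at the top level:

from dataset.custom_data_parsers.utils import (
    _sample_frame_indices_v1, _sample_frame_indices_v2)

# v1: uniform sampling that always keeps the first and last frame.
print(_sample_frame_indices_v1(total_frames=100, n_frames=5))
# [0, 25, 50, 74, 99]

# v2: split into n_frames chunks; at test time take the middle frame of each chunk.
print(_sample_frame_indices_v2(total_frames=100, n_frames=5, is_training=False))
# [9, 29, 49, 69, 89]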
dataset/custom_data_parsers/utils_visualize.py ADDED
@@ -0,0 +1,54 @@
1
+ import re
2
+ from typing import Dict, List, Optional
3
+ from PIL import Image, ImageDraw, ImageFont
4
+
5
+
6
+ def scale_polygon(polygon, w, h):
7
+ new_polygon = []
8
+ for (x, y) in polygon:
9
+ new_polygon.append((x * w, y * h))
10
+ return new_polygon
11
+
12
+ def draw_polygon(image: Image.Image, points: List[List[int]], label: Optional[str] = None):
13
+ draw = ImageDraw.Draw(image)
14
+ if len(points) > 2:
15
+ draw.polygon(points, outline="red", width=3)
16
+ elif len(points) == 2:
17
+ draw.rectangle(points, outline="red", width=3)
18
+ else:
19
+ raise ValueError(f'points={points} only has one point!')
20
+
21
+ if label is not None:
22
+ font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 20)
23
+ draw.text(points[0], label, font=font, fill=(0, 0, 255))
24
+ return image
25
+
26
+ def visualize_image_bbox(data_dict, image_processing_config, processor):
27
+ if image_processing_config.get('has_coordinates') != True:
28
+ return
29
+
30
+ messages = data_dict['messages']
31
+
32
+ polygons = []
33
+ first_image_content = None
34
+
35
+ for msg in messages:
36
+ for content in msg['content']:
37
+ if content['type'] == 'text':
38
+ for match in re.finditer(r'\[(\d+(\.\d+)?,\s*)+\d+(\.\d+)?\]', content["text"]):
39
+ coordinate_matches = re.findall(r"([0-9.]+)", match.group(0))
40
+ coords = [float(coord) for coord in coordinate_matches]
41
+ polygons.append(list(zip(coords[::2], coords[1::2])))
42
+ elif first_image_content is None and content['type'] == 'image':
43
+ first_image_content = content
44
+
45
+ first_image = first_image_content['image']
46
+ first_image = processor.preprocess_image(first_image, image_processing_config)
47
+ w, h = first_image.size
48
+
49
+ if len(polygons) > 0:
50
+ for i, polygon in enumerate(polygons):
51
+ polygon = scale_polygon(polygon, w, h)
52
+ first_image = draw_polygon(first_image, polygon, label=str(i))
53
+
54
+ first_image_content['image'] = first_image
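
A minimal example of draw_polygon above; passing exactly two points draws a rectangle, and leaving label=None avoids the hard-coded DejaVuSans font path:

from PIL import Image
from dataset.custom_data_parsers.utils_visualize import draw_polygon

img = Image.new("RGB", (256, 256), "white")
img = draw_polygon(img, [(32, 32), (200, 180)])  # red rectangle, 3 px outline
img.save("bbox_preview.png")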
dataset/custom_data_parsers/video_permutation_parser.py ADDED
@@ -0,0 +1,137 @@
1
+ from typing import Dict, List
2
+ import random
3
+ from PIL import Image, ImageDraw, ImageFont
4
+
5
+ from .utils import sample_video
6
+
7
+
8
+ class VideoPermutationParser:
9
+ def __init__(
10
+ self,
11
+ n_frames=8,
12
+ is_training=True,
13
+ frame_nums = list(range(8, 25)),
14
+ video_sampling_strategy={},
15
+ ):
16
+ self.n_frames = n_frames
17
+ self.is_training = is_training
18
+ self.frame_nums = frame_nums
19
+ self.video_sampling_strategy = video_sampling_strategy
20
+ # fmt: off
21
+ self.data_temp = {
22
+ "text": [{
23
+ "prompt": "<video>",
24
+ "response": ""
25
+ }],
26
+ "video": [{
27
+ "video_file": {
28
+ "yg": "/mnt/bn/videonasyg/videos/webvid_10M_download/011851_011900/1047443473.mp4",
29
+ "lq": "/mnt/bn/llmdatalq/jiangnan/video_generation/webvid_10M_download/20230609/videos/011851_011900/1047443473.mp4"
30
+ },
31
+ "frame_indices": [0, 85, 171, 256, 342, 427, 513, 598]
32
+ }],
33
+ }
34
+ # fmt: on
35
+
36
+ def check_format(self, data_dict: Dict):
37
+ pass
38
+ # for k in self.data_temp.keys():
39
+ # assert k in data_dict
40
+
41
+ def transform(self, data_dict: Dict, image_processing_config: Dict = None) -> Dict:
42
+ self.check_format(data_dict)
43
+
44
+ frames = self.load_video_item(data_dict['video'][0])
45
+
46
+ # frames = self.add_text_to_frames(frames) # for debug
47
+
48
+ idxs = list(range(1, len(frames) + 1))
49
+ random.shuffle(idxs)
50
+
51
+ prefix_len = int(3/8*len(idxs))
52
+
53
+ shuffled_frames = [frames[i-1] for i in idxs]
54
+
55
+ prompt = f'Output the correct chronological order of scrambled video frames. The order of the first {prefix_len} ones are:\n'
56
+ prompt += '\n'.join([str(i) for i in idxs[: prefix_len]]) + '\nOutput the order of the following frames:'
57
+ response = '\n'.join([str(i) for i in idxs[prefix_len: ]])
58
+
59
+ messages = [
60
+ {
61
+ "role": "user",
62
+ "content": [
63
+ {"type": "video", "video": shuffled_frames},
64
+ {"type": "text", "text": prompt},
65
+ ]
66
+ },
67
+ {
68
+ "role": "assistant",
69
+ "content": [
70
+ {"type": "text", "text": response}
71
+ ]
72
+ }
73
+ ]
74
+
75
+ return messages
76
+
77
+
78
+ def load_video_item(self, video_item) -> List[Image.Image]:
79
+ """
80
+ video_item:
81
+ {"video_file": "/path/to/video", "n_frames": 8}
82
+ {"video_file": "/path/to/video", "frame_indices": [0, 1, 2], "n_frames": 3}
83
+ {"video_file": "/path/to/video", "start_frame": 0, "end_frame": 100, "n_frames": 8}
84
+ {"video_file": "/path/to/video", "time_indices": [0, 1, 2], "n_frames": 3}
85
+ {"video_file": "/path/to/video", "start_time": 0, "end_time": 100, "n_frames": 8}
86
+ {"image_file": ["/path/to/image"], "frame_indices": [0, 1, 2], "n_frames": 3}
87
+ """
88
+
89
+ # check format
90
+ if ("image_file" not in video_item) and ("video_file" not in video_item):
91
+ raise KeyError(f"Key 'image_file' or 'video_file' not found in video_item")
92
+
93
+ video_path = video_item.get('video_file', video_item.get('image_file'))
94
+ n_frames = video_item.get('n_frames', None)
95
+ frame_indices = video_item.get('frame_indices', None)
96
+ start_frame = video_item.get('start_frame', None)
97
+ end_frame = video_item.get('end_frame', None)
98
+ time_indices = video_item.get('time_indices', None)
99
+ start_time = video_item.get('start_time', None)
100
+ end_time = video_item.get('end_time', None)
101
+ mask_boxes = video_item.get('mask_boxes', None)
102
+
103
+ n_frames = random.choice(self.frame_nums)
104
+ n = self.video_sampling_strategy.get('force_frames_n_divisible', 1)
105
+ if n > 1 and n_frames % n != 0:
106
+ n_frames += n - n_frames % n
107
+
108
+ frames, frame_indices = sample_video(
109
+ video_path=video_path,
110
+ frame_indices=frame_indices,
111
+ start_frame=start_frame,
112
+ end_frame=end_frame,
113
+ n_frames=n_frames,
114
+ time_indices=time_indices,
115
+ start_time=start_time,
116
+ end_time=end_time,
117
+ mask_boxes=mask_boxes,
118
+ is_training=self.is_training,
119
+ video_sampling_strategy=self.video_sampling_strategy,
120
+ return_frame_ids=True,
121
+ )
122
+ return frames
123
+
124
+
125
+ def add_text_to_frames(self, frames: List[Image.Image]):
126
+ new_frames = []
127
+ for i, image in enumerate(frames):
128
+ draw = ImageDraw.Draw(image)
129
+
130
+ font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 100)
131
+ text_position = (50, 50)
132
+ text_content = f'{i+1}'
133
+ text_color = (255, 0, 0)
134
+ draw.text(text_position, text_content, font=font, fill=text_color)
135
+ new_frames.append(image)
136
+ return new_frames
137
+
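For reference, a minimal, self-contained sketch of the frame-permutation objective that `VideoPermutationParser.transform` builds above (the frame count is assumed for illustration; the 3/8 revealed prefix mirrors the code, and no video loading is involved):

```python
import random

n_frames = 8                                   # assumed frame count for illustration
idxs = list(range(1, n_frames + 1))            # 1-based frame indices
random.shuffle(idxs)                           # the "scrambled" presentation order
prefix_len = int(3 / 8 * len(idxs))            # the first 3/8 of the order is revealed

prompt = (
    "Output the correct chronological order of scrambled video frames. "
    f"The order of the first {prefix_len} frames is:\n"
    + "\n".join(str(i) for i in idxs[:prefix_len])
    + "\nOutput the order of the following frames:"
)
response = "\n".join(str(i) for i in idxs[prefix_len:])
print(prompt)
print(response)
```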
dataset/mm_dataset.py DELETED
@@ -1,62 +0,0 @@
1
- # Copyright (2024) Bytedance Ltd. and/or its affiliates
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from dataset.utils import get_visual_type, sample_frame_indices
15
- from .processor import Processor
16
- from tools.rw_utils import read_jsonlines
17
-
18
- class MMDataset(object):
19
- def __init__(self, ann_path="", anns=None, processor:Processor=None):
20
- self.processor = processor
21
- if anns is None:
22
- self.anns = []
23
- if isinstance(ann_path, str):
24
- ann_path = [ann_path]
25
- for path in ann_path:
26
- self.anns.extend(read_jsonlines(path))
27
- else:
28
- self.anns = anns
29
-
30
- def __len__(self):
31
- return len(self.anns)
32
-
33
- def __getitem__(self, index):
34
- try:
35
- ann = self.anns[index]
36
-
37
- prompt = ann['text']['prompt']
38
-
39
- video_file = ann['video_file']
40
- visual_files = []
41
- start_time = ann.get("start_time", 0)
42
- end_time = ann.get("end_time", -1)
43
- if isinstance(video_file, list):
44
- # This is for MVBench/Episodic Reasoning
45
- # The video_file are a list of sorted frames extract from the target video
46
- for img_file in video_file:
47
- if get_visual_type(img_file) == 'image':
48
- visual_files.append(img_file)
49
- frame_indices = sample_frame_indices(start_frame=0, total_frames=len(visual_files), n_frames=min(len(visual_files), self.processor.max_n_frames))
50
- visual_files = [v for i,v in enumerate(visual_files) if i in frame_indices]
51
- else:
52
- if get_visual_type(video_file) in ['image', 'video', 'gif']:
53
- visual_files.append(video_file)
54
- assert len(visual_files) >= 0, f"Failed to load valid visual file from anns[{index}]!"
55
- images = []
56
- for v_f in visual_files:
57
- images.extend(self.processor.load_images(v_f, start_time=start_time, end_time=end_time))
58
- model_inputs = self.processor(prompt, images=images, edit_prompt=True, return_prompt=True)
59
- except Exception as e:
60
- print(f"Load data error: {e}")
61
- return ann, None
62
- return ann, model_inputs
dataset/processor.py DELETED
@@ -1,164 +0,0 @@
1
- # Copyright (2024) Bytedance Ltd. and/or its affiliates
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from PIL import Image
15
- from typing import List
16
- import torch
17
- from transformers import DataCollatorForSeq2Seq
18
- from transformers.models.llava import LlavaProcessor
19
- import re
20
- import os
21
-
22
- from .utils import sample_image, sample_video, sample_gif, get_visual_type
23
-
24
- HF_TOKEN = os.environ.get('HF_TOKEN', '')
25
-
26
- ext2sampler = {
27
- 'image': sample_image,
28
- 'gif': sample_gif,
29
- 'video': sample_video
30
- }
31
-
32
- class CustomImageProcessor:
33
- def __init__(self, processor) -> None:
34
- self.processor = processor
35
-
36
- def __call__(self, images: List[Image.Image], do_padding=False) -> torch.Tensor:
37
- if do_padding:
38
- images = [self.expand2square(
39
- img,
40
- tuple(int(x * 255) for x in self.processor.image_processor.image_mean)
41
- ) for img in images]
42
- else:
43
- images = [self.resize2square(img) for img in images]
44
- images_pixel = self.processor(text="", images=images, return_tensors="pt")['pixel_values']
45
- return images_pixel # [num_images, 3, 336, 336]
46
-
47
- def expand2square(self, pil_img, background_color):
48
- width, height = pil_img.size
49
- if width == height:
50
- return pil_img
51
- elif width > height:
52
- result = Image.new(pil_img.mode, (width, width), background_color)
53
- result.paste(pil_img, (0, (width - height) // 2))
54
- return result
55
- else:
56
- result = Image.new(pil_img.mode, (height, height), background_color)
57
- result.paste(pil_img, ((height - width) // 2, 0))
58
- return result
59
-
60
- def resize2square(self, pil_img: Image.Image):
61
- width, height = pil_img.size
62
- pil_img = pil_img.resize((max(width, height), max(width, height)))
63
- return pil_img
64
-
65
- class Processor(object):
66
- def __init__(
67
- self,
68
- model_name_or_path,
69
- max_n_frames=8,
70
- max_seq_len=None,
71
- add_sep=False,
72
- do_image_padding=False,
73
- ):
74
- self.max_n_frames = max_n_frames
75
- self.max_seq_len = max_seq_len,
76
- self.add_sep = add_sep
77
- self.do_image_padding = do_image_padding
78
- if not self.do_image_padding:
79
- print(f"### do_image_padding is set as False, images will be resized directly!")
80
-
81
- self.setup(model_name_or_path)
82
-
83
-
84
- def setup(self, model_name_or_path):
85
- sub_processor = LlavaProcessor.from_pretrained(
86
- model_name_or_path,
87
- padding_side='left',
88
- trust_remote_code=True,
89
- token=HF_TOKEN,
90
- )
91
- self.processor = CustomImageProcessor(sub_processor)
92
- self.tokenizer = sub_processor.tokenizer
93
- # self.pad_collator = DataCollatorForSeq2Seq(self.tokenizer, padding='longest')
94
- self.sep_id = self.tokenizer.sep_token_id
95
- self.pad_id = self.tokenizer.pad_token_id
96
- self.eos_id = self.tokenizer.eos_token_id
97
-
98
- if self.sep_id is None:
99
- self.add_sep = False
100
- if not self.max_seq_len:
101
- self.max_seq_len = self.tokenizer.model_max_length
102
-
103
- def process_prompt(self, prompt, images: List[Image.Image]=None):
104
- if not images:
105
- prompt = prompt.replace("<image>", "").replace("<video>", "")
106
- elif images is not None:
107
- prompt = prompt.replace("<video>", "<image>"*len(images))
108
- image_token_num = len(re.findall('<image>', prompt, re.S))
109
- if image_token_num == 0:
110
- prompt_parts = re.findall(r'USER:(.*)ASSISTANT:(.*)', prompt, re.S)
111
- if prompt_parts and len(prompt_parts) == 2:
112
- p1, p2 = prompt_parts
113
- else:
114
- p1 = prompt
115
- p2 = ''
116
- prompt = f"USER: {'<image>'*len(images) + ' ' + p1.strip()} ASSISTANT: {p2.strip()}"
117
- assert image_token_num == len(images)
118
-
119
- if not re.findall(r'USER:(.*)ASSISTANT:(.*)', prompt, re.S):
120
- prompt = f'USER: {prompt} ASSISTANT: '
121
- return prompt
122
-
123
- def select_frames_sampler(self, visual_data_path):
124
- visual_type = get_visual_type(visual_data_path)
125
- if visual_type in ext2sampler:
126
- return ext2sampler[visual_type]
127
- else:
128
- raise ValueError(f"Unsupported data format: {visual_data_path}")
129
-
130
- def load_images(self, visual_data_path, n_frames=None, start_time=0, end_time=-1):
131
- sampler = self.select_frames_sampler(visual_data_path)
132
- return sampler(visual_data_path, n_frames=min(n_frames, self.max_n_frames) if n_frames else self.max_n_frames, start_time=start_time, end_time=end_time)
133
-
134
- def get_pixel_values(self, images):
135
- if images is not None and len(images) > 0:
136
- pixel_values = self.processor(images=images, do_padding=self.do_image_padding)
137
- else:
138
- pixel_values = None
139
- return pixel_values
140
-
141
- def get_text_inputs(self, text):
142
- prompt_ids = self.tokenizer.encode(text, add_special_tokens=True) # will add <s>
143
- if self.add_sep:
144
- prompt_ids = prompt_ids + [self.sep_id]
145
- prompt_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(dim=0)
146
- return prompt_ids
147
-
148
- def get_inputs(self, prompt, visual_data_file=None, images=None, n_frames=None, edit_prompt=False, return_prompt=False):
149
- if images is None:
150
- images = self.load_images(visual_data_file, n_frames) if visual_data_file else None
151
- if edit_prompt:
152
- prompt = self.process_prompt(prompt, images)
153
- text_inputs = self.get_text_inputs(prompt)
154
- pixel_values = self.get_pixel_values(images)
155
- inputs = {
156
- "input_ids": text_inputs,
157
- "pixel_values": pixel_values
158
- }
159
- if return_prompt:
160
- inputs['prompt'] = prompt
161
- return inputs
162
-
163
- def __call__(self, prompt, visual_data_file=None, images=None, n_frames=None, edit_prompt=False, return_prompt=False):
164
- return self.get_inputs(prompt, visual_data_file, images, n_frames, edit_prompt, return_prompt)
dataset/tarsier_datamodule.py ADDED
@@ -0,0 +1,284 @@
1
+ """Datamodule for Llava Pretraining and Finetuning"""
2
+ import os
3
+ import re
4
+ from PIL import Image
5
+ import numpy as np
6
+ import re
7
+ import tempfile
8
+ from typing import Dict, List, Union, Tuple
9
+ import traceback
10
+ import json
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from transformers import DataCollatorForSeq2Seq
15
+
16
+ from tools.rw_utils import read_jsonlines
17
+ from torch.utils.data import Dataset, DataLoader
18
+
19
+ np_str_obj_array_pattern = re.compile(r"[SaUO]")
20
+
21
+ default_collate_err_msg_format = (
22
+ "default_collate: batch must contain tensors, numpy arrays, numbers, "
23
+ "dicts or lists; found {}"
24
+ )
25
+
26
+ from .custom_data_parsers.standard_vision_parser import VisionParser
27
+ from .custom_data_parsers.object_tracking_parser import ObjectTrackingParser
28
+ from .custom_data_parsers.multi_images_parser import MultiImagesParser
29
+ from .custom_data_parsers.video_permutation_parser import VideoPermutationParser
30
+ from .custom_data_parsers.utils_visualize import visualize_image_bbox
31
+
32
+ from .tarsier_processor import TarsierProcessor
33
+
34
+ from tools.rw_utils import NumpyArrayEncoder
35
+ from .utils import DictToObject
36
+ import os
37
+
38
+ HF_TOKEN = os.environ.get('HF_TOKEN', '')
39
+
40
+ class TarsierDataProcessor:
41
+ def __init__(
42
+ self,
43
+ processor: TarsierProcessor,
44
+ n_frames: Union[int, list],
45
+ max_n_frames=256,
46
+ max_pixels=int(1280 * 720 // 2),
47
+ min_pixels=0,
48
+ max_seq_len=None,
49
+ is_training=True, # affects: 1. frames are sampled differently in training vs. evaluation; 2. the response is ignored at evaluation time.
50
+ print_data_error=True,
51
+ do_image_padding=False,
52
+ do_image_crop=False,
53
+ do_image_resize=True,
54
+ video_sampling_strategy={},
55
+ prompt='',
56
+ train_task='sft',
57
+ **kwargs
58
+ ):
59
+ self.kwargs = kwargs
60
+
61
+ self.processor = processor
62
+ self.pad_collator = DataCollatorForSeq2Seq(processor.tokenizer, padding='longest')
63
+
64
+ self.processor.max_seq_len = processor.tokenizer.model_max_length if max_seq_len is None else max_seq_len  # read the limit from the wrapped TarsierProcessor's tokenizer
65
+
66
+ self.n_frames = n_frames
67
+ self.max_n_frames = max_n_frames
68
+ self.max_pixels = max_pixels
69
+ self.min_pixels = min_pixels
70
+
71
+ self.is_training = is_training
72
+ self.print_data_error = print_data_error
73
+ self.do_image_padding = do_image_padding
74
+ self.do_image_crop = do_image_crop
75
+ self.do_image_resize = do_image_resize
76
+ self.video_sampling_strategy = video_sampling_strategy
77
+ self.prompt = prompt
78
+ self.train_task = train_task
79
+
80
+ self.object_tracking_parser = ObjectTrackingParser(
81
+ n_frames=self.n_frames,
82
+ max_objects=4,
83
+ is_training=self.is_training,
84
+ )
85
+ self.multi_images_parser = MultiImagesParser(
86
+ n_frames=self.n_frames,
87
+ is_training=self.is_training,
88
+ )
89
+ self.video_permutation_parser = VideoPermutationParser(
90
+ n_frames=self.n_frames,
91
+ is_training=self.is_training,
92
+ video_sampling_strategy=self.video_sampling_strategy,
93
+ )
94
+ self.vision_parser = VisionParser(
95
+ n_frames=self.n_frames,
96
+ max_n_frames=self.max_n_frames,
97
+ is_training=self.is_training,
98
+ video_sampling_strategy=self.video_sampling_strategy
99
+ )
100
+
101
+ def select_parser(self, data_dict):
102
+ if data_dict.get('task', None) == 'video/object_tracking':
103
+ return self.object_tracking_parser
104
+ elif data_dict.get('task', None) == 'multi_images':
105
+ return self.multi_images_parser
106
+ elif data_dict.get('dataset', None) == 'video_permutation':
107
+ return self.video_permutation_parser
108
+ else:
109
+ return self.vision_parser
110
+
111
+ def parse_image_processing_config(self, data_dict):
112
+ image_processing_config=data_dict.get('image_processing_config', {})
113
+
114
+ do_padding = image_processing_config.get('do_padding', self.do_image_padding)
115
+ do_crop = image_processing_config.get('do_crop', self.do_image_crop)
116
+ do_resize = image_processing_config.get('do_resize', self.do_image_resize)
117
+ max_pixels = image_processing_config.get('max_pixels', self.max_pixels)
118
+ min_pixels = image_processing_config.get('min_pixels', self.min_pixels)
119
+
120
+ assert min_pixels <= max_pixels
121
+
122
+ image_processing_config['do_padding'] = do_padding
123
+ image_processing_config['do_crop'] = do_crop
124
+ image_processing_config['do_resize'] = do_resize
125
+ image_processing_config['max_pixels'] = max_pixels
126
+ image_processing_config['min_pixels'] = min_pixels
127
+
128
+ return image_processing_config
129
+
130
+
131
+ def _transform(self, raw_data_dict: Dict) -> Dict:
132
+ data_dict = json.loads(json.dumps(raw_data_dict, cls=NumpyArrayEncoder))
133
+ del raw_data_dict
134
+
135
+ if self.prompt:
136
+ for msg in data_dict['messages']:
137
+ if msg['role'] == 'user':
138
+ for content in msg['content']:
139
+ if content['type'] == 'text':
140
+ content['text'] = self.prompt
141
+
142
+ data_dict_copy = json.loads(json.dumps(data_dict, cls=NumpyArrayEncoder))
143
+
144
+ image_processing_config = self.parse_image_processing_config(data_dict)
145
+ parser = self.select_parser(data_dict)
146
+ messages = parser.transform(data_dict, image_processing_config)
147
+ data_dict_copy['extra_info'] = data_dict.pop('extra_info', {})
148
+
149
+ # visualize_image_bbox(data_dict, image_processing_config, self.processor)
150
+ outputs = self.processor(messages, image_processing_config, is_training=self.is_training)
151
+
152
+ # if not self.is_training:
153
+ outputs['raw_data_dict'] = data_dict_copy
154
+
155
+ return [outputs]
156
+
157
+ def _split_chosen_rejected(self, data_dict: Dict):
158
+ chosen_data_dict = data_dict
159
+ rejected_data_dict = json.loads(json.dumps(data_dict, cls=NumpyArrayEncoder))
160
+ for msg in chosen_data_dict['messages']:
161
+ if msg['role'] == 'assistant':
162
+ for content in msg['content']:
163
+ if content['type'] == 'text':
164
+ content['text'] = content['chosen']
165
+
166
+ for msg in rejected_data_dict['messages']:
167
+ if msg['role'] == 'assistant':
168
+ for content in msg['content']:
169
+ if content['type'] == 'text':
170
+ content['text'] = content['rejected']
171
+
172
+ return chosen_data_dict, rejected_data_dict
173
+
174
+ def transform(self, data_dict: Dict) -> Dict:
175
+ try:
176
+ if self.train_task == 'dpo':
177
+ chosen_data_dict, rejected_data_dict = self._split_chosen_rejected(data_dict)
178
+ return self._transform(chosen_data_dict) + self._transform(rejected_data_dict)
179
+ return self._transform(data_dict)
180
+ except Exception as e:
181
+ if self.print_data_error:
182
+ print(traceback.format_exc())
183
+ print(f'Error occurs when processing: \n{data_dict}')
184
+ return []
185
+
186
+ def batch_transform(self, batch_data: List[Dict]) -> Dict:
187
+ model_inputs = {}
188
+ # if not self.is_training:
189
+ raw_data_dict = [d.pop('raw_data_dict') for d in batch_data]
190
+ model_inputs['raw_data_dict'] = raw_data_dict
191
+
192
+ batch_pixel_values = [d.pop('pixel_values') for d in batch_data if 'pixel_values' in d]
193
+ batch_image_grid_thw = [d.pop('image_grid_thw') for d in batch_data if 'image_grid_thw' in d]
194
+ if len(batch_pixel_values) == 0:
195
+ vision_placeholder = self.get_vision_placeholder()
196
+ batch_pixel_values = [vision_placeholder.get('pixel_values')]
197
+ batch_image_grid_thw = [vision_placeholder.get('image_grid_thw')] if 'image_grid_thw' in vision_placeholder else []
198
+
199
+ model_inputs['pixel_values'] = torch.cat(batch_pixel_values, dim=0)
200
+ if len(batch_image_grid_thw) > 0:
201
+ model_inputs['image_grid_thw'] = torch.cat(batch_image_grid_thw, dim=0)
202
+
203
+ batch_num_images = [d.pop('num_images') for d in batch_data]
204
+ model_inputs['num_images'] = torch.tensor(batch_num_images)
205
+ model_inputs.update(self.pad_collator(batch_data))
206
+ return model_inputs
207
+
208
+ def __call__(self, batch_data: Union[Dict, List[Dict]]) -> Dict:
209
+ if isinstance(batch_data, dict):
210
+ batch_data = [batch_data]
211
+ batch = [self.transform(d)[0] for d in batch_data]
212
+ return self.batch_transform(batch)
213
+
214
+ def get_vision_placeholder(self):
215
+ messages = [{"role": "user", "content": [{"type": "image", "image": Image.new(mode='RGB', size=(336, 336))}]}]
216
+ image_processing_config = self.parse_image_processing_config({})
217
+ return self.processor(messages, image_processing_config)
218
+
219
+ def get_text_placeholder(self):
220
+ messages = [
221
+ {"role": "user", "content": [{"type": "text", "text": "Hello!"}]},
222
+ {"role": "assistant", "content": [{"type": "text", "text": "Thank you very much"}]},
223
+ ]
224
+ image_processing_config = self.parse_image_processing_config({})
225
+ return self.processor(messages, image_processing_config)
226
+
227
+ def init_processor(processor: Union[TarsierProcessor, str]=None, config: Dict=None):
228
+ config = DictToObject(config) if isinstance(config, dict) else config
229
+ if isinstance(processor, str):
230
+ sub_processor = TarsierProcessor.from_pretrained(
231
+ processor,
232
+ padding_side='left',
233
+ trust_remote_code=True,
234
+ token=HF_TOKEN,
235
+ )
236
+ else:
237
+ sub_processor = processor
238
+ processor = TarsierDataProcessor(
239
+ processor=sub_processor,
240
+ n_frames=config.n_frames,
241
+ max_n_frames=config.max_n_frames,
242
+ max_pixels=config.max_pixels,
243
+ min_pixels=config.min_pixels,
244
+ max_seq_len=config.max_seq_len,
245
+ is_training=config.is_training,
246
+ print_data_error=config.print_data_error,
247
+ do_image_padding=config.do_image_padding,
248
+ do_image_crop=config.do_image_crop,
249
+ do_image_resize=config.do_image_resize,
250
+ video_sampling_strategy=config.video_sampling_strategy,
251
+ prompt=config.prompt,
252
+ train_task=config.train_task
253
+ )
254
+ return processor
255
+
256
+ class TarsierDataset(Dataset):
257
+ def __init__(self, ann_path="", anns=None, config: Dict=None, processor: Union[TarsierDataProcessor, TarsierProcessor, str]=None):
258
+ self.config = DictToObject(config) if isinstance(config, dict) else config
259
+ if not isinstance(processor, TarsierDataProcessor):
260
+ self.processor = init_processor(processor, config)
261
+ else:
262
+ self.processor = processor
263
+ if anns is None:
264
+ self.anns = []
265
+ if isinstance(ann_path, str):
266
+ ann_path = [ann_path]
267
+ for path in ann_path:
268
+ self.anns.extend(read_jsonlines(path))
269
+ else:
270
+ self.anns = anns
271
+
272
+ def __len__(self):
273
+ return len(self.anns)
274
+
275
+ def __getitem__(self, index):
276
+ if index < 0 or index >= len(self.anns):
277
+ raise IndexError("Index out of range")
278
+ try:
279
+ ann = self.anns[index]
280
+ model_inputs = self.processor(ann)
281
+ except Exception as e:
282
+ print(f"Load data error: {e}")
283
+ return ann, None
284
+ return ann, model_inputs
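As a usage sketch: the annotation path and checkpoint path below are hypothetical, and the config keys simply mirror the `TarsierDataProcessor.__init__` arguments above.

```python
from torch.utils.data import DataLoader
from dataset.tarsier_datamodule import TarsierDataset

# Config keys mirror TarsierDataProcessor.__init__ above; values are illustrative.
config = {
    "n_frames": 16, "max_n_frames": 256,
    "max_pixels": 1280 * 720 // 2, "min_pixels": 0,
    "max_seq_len": 8192, "is_training": False, "print_data_error": True,
    "do_image_padding": False, "do_image_crop": False, "do_image_resize": True,
    "video_sampling_strategy": {}, "prompt": "", "train_task": "sft",
}

dataset = TarsierDataset(
    ann_path="data/annotations/eval.jsonl",   # hypothetical jsonlines annotation file
    config=config,
    processor="path/to/tarsier2_checkpoint",  # hypothetical; loaded via TarsierProcessor.from_pretrained
)
loader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch)
ann, model_inputs = next(iter(loader))[0]     # each item is an (annotation, model_inputs) pair
```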
dataset/tarsier_processor.py ADDED
@@ -0,0 +1,240 @@
1
+ from typing import List, Union
2
+ from PIL import Image
3
+
4
+ import torch
5
+
6
+ from transformers.feature_extraction_utils import BatchFeature
7
+ from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
8
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
9
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
10
+ from transformers.utils import logging
11
+ from transformers import Qwen2VLImageProcessor
12
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
13
+
14
+ logger = logging.get_logger(__name__)
15
+
16
+
17
+ class TarsierProcessorKwargs(ProcessingKwargs, total=False):
18
+ _defaults = {
19
+ "text_kwargs": {},
20
+ "images_kwargs": {},
21
+ }
22
+
23
+
24
+ class TarsierProcessor(ProcessorMixin):
25
+
26
+ attributes = ["image_processor", "tokenizer"]
27
+ valid_kwargs = ["chat_template", "image_token", "patch_size", "merge_size", "temporal_patch_size", "max_seq_len"]
28
+ image_processor_class = "AutoImageProcessor"
29
+ tokenizer_class = "AutoTokenizer"
30
+
31
+ def __init__(
32
+ self,
33
+ image_processor=None,
34
+ tokenizer=None,
35
+ chat_template=None,
36
+ image_token="<image>",
37
+ patch_size=None,
38
+ merge_size=1,
39
+ temporal_patch_size=1,
40
+ max_seq_len=8192,
41
+ **kwargs,
42
+ ) -> None:
43
+
44
+ self.image_token = image_token
45
+ self.patch_size = patch_size
46
+ self.merge_size = merge_size
47
+ self.temporal_patch_size = temporal_patch_size
48
+ self.max_seq_len = max_seq_len
49
+ self.max_pixels_per_sample = 128 * 384 * 384
50
+
51
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
52
+
53
+ def __call__(
54
+ self,
55
+ messages,
56
+ image_processing_config=None,
57
+ is_training=True,
58
+ ) -> torch.Tensor:
59
+
60
+ output_kwargs = self._merge_kwargs(
61
+ TarsierProcessorKwargs,
62
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
63
+ )
64
+
65
+ # Image processing
66
+ pixel_values, image_grid_thw = [], []
67
+ num_images = 0
68
+ for msg in messages:
69
+ for content in msg['content']:
70
+ if content['type'] == 'image':
71
+ num_images += self.temporal_patch_size
72
+ elif content['type'] == 'video':
73
+ num_images += len(content['video'])
74
+ if num_images > 0 and self.max_pixels_per_sample // num_images < image_processing_config['max_pixels']:
75
+ image_processing_config['max_pixels'] = self.max_pixels_per_sample // num_images
76
+ image_processing_config['min_pixels'] = min(image_processing_config['min_pixels'], image_processing_config['max_pixels'])
77
+
78
+ for msg in messages:
79
+ for content in msg['content']:
80
+ if content['type'] == 'image':
81
+ content['image'] = self.preprocess_image(content['image'], image_processing_config)
82
+ content['image'] = self.image_processor(images = content['image'], **output_kwargs["images_kwargs"], return_tensors="pt")
83
+ content['num_vision_tokens'] = self.get_num_vision_tokens(content)
84
+ pixel_values.append(content['image']['pixel_values'])
85
+ if 'image_grid_thw' in content['image']:
86
+ image_grid_thw.extend(content['image']['image_grid_thw'])
87
+ elif content['type'] == 'video':
88
+ content['video'] = self.preprocess_image(content['video'], image_processing_config)
89
+ if isinstance(self.image_processor, Qwen2VLImageProcessor):
90
+ content['video'] = self.image_processor(images = None, videos = content['video'], **output_kwargs["images_kwargs"], return_tensors="pt")
91
+ pixel_values.append(content['video']['pixel_values_videos'])
92
+ else:
93
+ content['video'] = self.image_processor(images = content['video'], **output_kwargs["images_kwargs"], return_tensors="pt")
94
+ pixel_values.append(content['video']['pixel_values'])
95
+
96
+ if 'video_grid_thw' in content['video']:
97
+ image_grid_thw.extend(content['video']['video_grid_thw'])
98
+ content['num_vision_tokens'] = self.get_num_vision_tokens(content)
99
+
100
+ # Text processing
101
+ add_generation_prompt = (not is_training and messages[-1]['role'] != 'assistant')
102
+ strip_final_eos = (not is_training and messages[-1]['role'] == 'assistant')
103
+ text_inputs = self.tokenizer.apply_chat_template(
104
+ messages,
105
+ chat_template = self.chat_template,
106
+ tokenize=True,
107
+ tokenizer_kwargs = output_kwargs["text_kwargs"],
108
+ return_assistant_tokens_mask=True,
109
+ return_dict=True,
110
+ add_generation_prompt=add_generation_prompt,
111
+ strip_final_eos=strip_final_eos,
112
+ )
113
+ labels = [-100 if j == 0 else i for i, j in zip(text_inputs['input_ids'], text_inputs['assistant_masks'])]
114
+ labels = labels[:self.max_seq_len]
115
+ input_ids = text_inputs['input_ids'][:self.max_seq_len]
116
+
117
+ image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
118
+ if image_token_id in text_inputs['input_ids'][self.max_seq_len:]:
119
+ raise ValueError(f'Too long sequence! {len(text_inputs["input_ids"])}')
120
+
121
+ outputs = {
122
+ 'input_ids': input_ids,
123
+ 'labels': labels,
124
+ 'num_images': num_images,
125
+ }
126
+ if len(pixel_values) > 0:
127
+ outputs['pixel_values'] = torch.cat(pixel_values, dim=0)
128
+ if len(image_grid_thw) > 0:
129
+ outputs['image_grid_thw'] = torch.stack(image_grid_thw)
130
+ return outputs
131
+
132
+
133
+ def preprocess_image(self, pil_img: Union[Image.Image, List[Image.Image]], image_processing_config):
134
+ if image_processing_config is None:
135
+ return pil_img
136
+ images = pil_img
137
+ if isinstance(pil_img, Image.Image):
138
+ images = [images]
139
+ if image_processing_config['do_crop']:
140
+ images = [self.centralcrop(img, rate=[4, 3]) for img in images]
141
+ if image_processing_config['do_padding']:
142
+ images = [self.expand2square(
143
+ img,
144
+ # tuple(int(x * 255) for x in self.processor.image_processor.image_mean)
145
+ tuple(int(x * 255) for x in [0, 0, 0])
146
+ ) for img in images]
147
+ if image_processing_config['do_resize']:
148
+ images = [self.resize2square(img) for img in images]
149
+ if image_processing_config.get('max_pixels'):
150
+ images = [self.resize2pixels(
151
+ img,
152
+ int(image_processing_config['max_pixels']),
153
+ int(image_processing_config['min_pixels'])
154
+ ) for img in images]
155
+ if isinstance(pil_img, Image.Image):
156
+ images = images[0]
157
+ return images
158
+
159
+ def expand2square(self, pil_img, background_color):
160
+ width, height = pil_img.size
161
+ if width == height:
162
+ return pil_img
163
+ elif width > height:
164
+ result = Image.new(pil_img.mode, (width, width), background_color)
165
+ result.paste(pil_img, (0, (width - height) // 2))
166
+ return result
167
+ else:
168
+ result = Image.new(pil_img.mode, (height, height), background_color)
169
+ result.paste(pil_img, ((height - width) // 2, 0))
170
+ return result
171
+
172
+ def resize2square(self, pil_img: Image.Image):
173
+ width, height = pil_img.size
174
+ pil_img = pil_img.resize((max(width, height), max(width, height)))
175
+ return pil_img
176
+
177
+ def centralcrop(self, pil_img: Image.Image, rate=[4, 3]):
178
+ width, height = pil_img.size
179
+ size = (width, height)
180
+ min_len = min(size)
181
+ longer_side = 0 if width >= height else 1
182
+ center = (width/2, height/2)
183
+ box = [0, 0, size[0], size[1]]
184
+
185
+ # if longer_side == 0:
186
+ # box[0] = max(0, center[0] - 1/2*min_len/rate[1]*rate[0])
187
+ # box[2] = min(width, center[0] + 1/2*min_len/rate[1]*rate[0])
188
+ # else:
189
+ # box[1] = max(0, center[1] - 1/2*min_len/rate[1]*rate[0])
190
+ # box[3] = min(height, center[1] + 1/2*min_len/rate[1]*rate[0])
191
+ box[longer_side] = max(0, center[longer_side] - 1/2*min_len/rate[1]*rate[0])
192
+ box[2 + longer_side] = min(size[longer_side], center[longer_side] + 1/2*min_len/rate[1]*rate[0])
193
+
194
+ # box = (width/2-min_len/2, height/2-min_len/2, width/2+min_len/2, height/2+min_len/2)
195
+ pil_img = pil_img.crop(box)
196
+ return pil_img
197
+
198
+ def resize2pixels(self, pil_img: Image.Image, max_pixels=None, min_pixels=None):
199
+ width, height = pil_img.size
200
+ new_height, new_width = smart_resize(height, width, factor=1, max_pixels=max_pixels, min_pixels=min_pixels)
201
+ pil_img = pil_img.resize((new_width, new_height))
202
+ return pil_img
203
+
204
+ def get_num_vision_tokens(self, content):
205
+ if isinstance(self.image_processor, Qwen2VLImageProcessor):
206
+ merge_length = self.image_processor.merge_size**2
207
+ if content['type'] == 'image':
208
+ num_image_tokens = content['image']['image_grid_thw'].prod() // merge_length
209
+ else:
210
+ num_image_tokens = content['video']['video_grid_thw'].prod() // merge_length
211
+ return num_image_tokens
212
+ else:
213
+ # Other models: image tokens (-> 2x2 compressed) -> add image_newline and image_new
214
+ k = 'image' if content['type'] == 'image' else 'video'
215
+ pixel_values = content[k]['pixel_values'][0]
216
+ n_frames = len(content[k]['pixel_values'])
217
+
218
+ height, width = get_image_size(to_numpy_array(pixel_values))
219
+ num_image_tokens = (height // (self.patch_size * self.merge_size)) * (width // (self.patch_size * self.merge_size) + 1) + 1
220
+ return num_image_tokens * n_frames
221
+
222
+ def batch_decode(self, *args, **kwargs):
223
+ """
224
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
225
+ refer to the docstring of this method for more information.
226
+ """
227
+ return self.tokenizer.batch_decode(*args, **kwargs)
228
+
229
+ def decode(self, *args, **kwargs):
230
+ """
231
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
232
+ the docstring of this method for more information.
233
+ """
234
+ return self.tokenizer.decode(*args, **kwargs)
235
+
236
+ @property
237
+ def model_input_names(self):
238
+ tokenizer_input_names = self.tokenizer.model_input_names
239
+ image_processor_input_names = self.image_processor.model_input_names
240
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
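To make the budgeting in `__call__` concrete, here is a minimal numeric sketch (the frame count is assumed) of how `max_pixels_per_sample` caps the per-frame `max_pixels` before frames reach the image processor:

```python
# Per-sample pixel budget from TarsierProcessor (128 * 384 * 384 "pixels" in total).
max_pixels_per_sample = 128 * 384 * 384   # 18_874_368
max_pixels = 1280 * 720 // 2              # default per-frame cap from TarsierDataProcessor
num_images = 48                           # assumed: a sample contributing 48 frames

if num_images > 0 and max_pixels_per_sample // num_images < max_pixels:
    max_pixels = max_pixels_per_sample // num_images

print(max_pixels)  # 393216 -> each frame is later resized to at most ~393k pixels
```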
dataset/utils.py CHANGED
@@ -126,3 +126,61 @@ def get_benchmarks(benchmarks):
126
  else:
127
  final_benchmarks.append(bm)
128
  return final_benchmarks
129
+
130
+ def check_data_format(data):
131
+ for msg in data['messages']:
132
+ if isinstance(msg['content'], dict):
133
+ msg['content'] = [msg['content']]
134
+ for content in msg['content']:
135
+ assert content['type'] in {'image', 'video', 'text'}, f"content['type']={content['type']} MUST be one of ['image', 'video', 'text']"
136
+ if content['type'] != "text":
137
+ media_path_key = f"{content['type']}_file"
138
+ media_paths = content[content['type']][media_path_key]
139
+ if isinstance(media_paths, str):
140
+ media_paths = [media_paths]
141
+ for path in media_paths:
142
+ assert os.path.exists(path), f"File not found: {path}"
143
+
144
+ def format_one_sample(media_file=None, prompt="Describe the video in detail."):
145
+ sample = {
146
+ "messages": []
147
+ }
148
+ user_content = {
149
+ "role": "user",
150
+ "content": []
151
+ }
152
+ if media_file is not None:
153
+ media_type = get_visual_type(media_file)
154
+ if media_type in ("video", "gif"):
155
+ media_type = "video"
156
+ media_path_key = f"{media_type}_file"
157
+ user_content["content"].append({
158
+ "type": media_type,
159
+ media_type: {
160
+ media_path_key: media_file,
161
+ }
162
+ })
163
+ user_content["content"].append({
164
+ "type": "text",
165
+ "text": prompt
166
+ })
167
+
168
+ assistant_content = {
169
+ "role": "assistant",
170
+ "content": []
171
+ }
172
+
173
+ sample["messages"].append(user_content)
174
+ sample["messages"].append(assistant_content)
175
+ if media_file is not None:
176
+ sample["task"] = f"{media_type}/QA"
177
+ else:
178
+ sample["task"] = 'text-only'
179
+ check_data_format(sample)
180
+ return sample
181
+
182
+
183
+ class DictToObject(object):
184
+ def __init__(self, dictionary):
185
+ for key, value in dictionary.items():
186
+ setattr(self, key, value)
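For illustration, the helpers added above can be combined as follows; the media path is hypothetical and must exist on disk, since `check_data_format` verifies it.

```python
from dataset.utils import format_one_sample

sample = format_one_sample(
    media_file="assets/examples/demo.mp4",   # hypothetical local video
    prompt="Describe the video in detail.",
)
# Expected shape of the result (abridged):
# {
#   "messages": [
#     {"role": "user", "content": [
#         {"type": "video", "video": {"video_file": "assets/examples/demo.mp4"}},
#         {"type": "text", "text": "Describe the video in detail."}]},
#     {"role": "assistant", "content": []},
#   ],
#   "task": "video/QA",
# }
```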
models/modeling_qwen2_vl_fast.py ADDED
@@ -0,0 +1,1320 @@
1
+ import os
2
+ import math
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from torch.nn import LayerNorm
10
+
11
+ from transformers.modeling_utils import PreTrainedModel
12
+ from transformers.configuration_utils import PretrainedConfig
13
+ from transformers.modeling_rope_utils import rope_config_validation, ROPE_INIT_FUNCTIONS
14
+ from transformers.cache_utils import Cache, SlidingWindowCache, StaticCache
15
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
16
+ from transformers.utils import (
17
+ add_start_docstrings,
18
+ add_start_docstrings_to_model_forward,
19
+ is_flash_attn_2_available,
20
+ is_flash_attn_greater_or_equal_2_10,
21
+ logging,
22
+ replace_return_docstrings,
23
+ )
24
+ from transformers.modeling_outputs import (
25
+ BaseModelOutputWithPast,
26
+ ModelOutput,
27
+ )
28
+ from transformers.activations import ACT2FN
29
+ from transformers.generation import GenerationMixin
30
+
31
+ if is_flash_attn_2_available():
32
+ from flash_attn import flash_attn_varlen_func
33
+
34
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
35
+ else:
36
+ flash_attn_varlen_func = None
37
+
38
+ # from apex.normalization.fused_layer_norm import fused_rms_norm_affine
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+ @dataclass
43
+ class Qwen2VLCausalLMOutputWithPast(ModelOutput):
44
+ """
45
+ Base class for Qwen2VL causal language model (or autoregressive) outputs.
46
+
47
+ Args:
48
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
49
+ Language modeling loss (for next-token prediction).
50
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
51
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
52
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
53
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
54
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
55
+
56
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
57
+ `past_key_values` input) to speed up sequential decoding.
58
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
59
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
60
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
61
+
62
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
63
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
64
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
65
+ sequence_length)`.
66
+
67
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
68
+ heads.
69
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
70
+ The rope index difference between sequence length and multimodal rope.
71
+ """
72
+
73
+ loss: Optional[torch.FloatTensor] = None
74
+ logits: torch.FloatTensor = None
75
+ past_key_values: Optional[List[torch.FloatTensor]] = None
76
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
77
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
78
+
79
+ class Qwen2VLVisionConfig(PretrainedConfig):
80
+ model_type = "qwen2_vl"
81
+
82
+ def __init__(
83
+ self,
84
+ depth=32,
85
+ embed_dim=1280,
86
+ hidden_size=3584,
87
+ hidden_act="quick_gelu",
88
+ mlp_ratio=4,
89
+ num_heads=16,
90
+ in_channels=3,
91
+ patch_size=14,
92
+ spatial_merge_size=2,
93
+ temporal_patch_size=2,
94
+ attn_implementation='flash_attention_2',
95
+ **kwargs,
96
+ ):
97
+ super().__init__(**kwargs)
98
+
99
+ self.depth = depth
100
+ self.embed_dim = embed_dim
101
+ self.hidden_size = hidden_size
102
+ self.hidden_act = hidden_act
103
+ self.mlp_ratio = mlp_ratio
104
+ self.num_heads = num_heads
105
+ self.in_channels = in_channels
106
+ self.patch_size = patch_size
107
+ self.spatial_merge_size = spatial_merge_size
108
+ self.temporal_patch_size = temporal_patch_size
109
+ self.attn_implementation = attn_implementation
110
+
111
+ @classmethod
112
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
113
+ cls._set_token_in_kwargs(kwargs)
114
+
115
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
116
+
117
+ if config_dict.get("model_type") == "qwen2_vl":
118
+ config_dict = config_dict["vision_config"]
119
+
120
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
121
+ logger.warning(
122
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
123
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
124
+ )
125
+
126
+ return cls.from_dict(config_dict, **kwargs)
127
+
128
+
129
+ class Qwen2VLConfig(PretrainedConfig):
130
+ r"""
131
+ This is the configuration class to store the configuration of a [`Qwen2VLModel`]. It is used to instantiate a
132
+ Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
133
+ with the defaults will yield a similar configuration to that of
134
+ Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
135
+
136
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
137
+ documentation from [`PretrainedConfig`] for more information.
138
+
139
+
140
+ Args:
141
+ vocab_size (`int`, *optional*, defaults to 152064):
142
+ Vocabulary size of the Qwen2VL model. Defines the number of different tokens that can be represented by the
143
+ `inputs_ids` passed when calling [`Qwen2VLModel`]
144
+ hidden_size (`int`, *optional*, defaults to 8192):
145
+ Dimension of the hidden representations.
146
+ intermediate_size (`int`, *optional*, defaults to 29568):
147
+ Dimension of the MLP representations.
148
+ num_hidden_layers (`int`, *optional*, defaults to 80):
149
+ Number of hidden layers in the Transformer encoder.
150
+ num_attention_heads (`int`, *optional*, defaults to 64):
151
+ Number of attention heads for each attention layer in the Transformer encoder.
152
+ num_key_value_heads (`int`, *optional*, defaults to 8):
153
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
154
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
155
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
156
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
157
+ by meanpooling all the original heads within that group. For more details checkout [this
158
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
159
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
160
+ The non-linear activation function (function or string) in the decoder.
161
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
162
+ The maximum sequence length that this model might ever be used with.
163
+ initializer_range (`float`, *optional*, defaults to 0.02):
164
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
165
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
166
+ The epsilon used by the rms normalization layers.
167
+ use_cache (`bool`, *optional*, defaults to `True`):
168
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
169
+ relevant if `config.is_decoder=True`.
170
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
171
+ Whether the model's input and output word embeddings should be tied.
172
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
173
+ The base period of the RoPE embeddings.
174
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
175
+ Whether to use sliding window attention.
176
+ sliding_window (`int`, *optional*, defaults to 4096):
177
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
178
+ max_window_layers (`int`, *optional*, defaults to 80):
179
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
180
+ attention_dropout (`float`, *optional*, defaults to 0.0):
181
+ The dropout ratio for the attention probabilities.
182
+ vision_config (`Dict`, *optional*):
183
+ The config for the visual encoder initialization.
184
+ rope_scaling (`Dict`, *optional*):
185
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
186
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
187
+ accordingly.
188
+ Expected contents:
189
+ `rope_type` (`str`):
190
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
191
+ 'llama3'], with 'default' being the original RoPE implementation.
192
+ `factor` (`float`, *optional*):
193
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
194
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
195
+ original maximum pre-trained length.
196
+ `original_max_position_embeddings` (`int`, *optional*):
197
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
198
+ pretraining.
199
+ `attention_factor` (`float`, *optional*):
200
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
201
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
202
+ `factor` field to infer the suggested value.
203
+ `beta_fast` (`float`, *optional*):
204
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
205
+ ramp function. If unspecified, it defaults to 32.
206
+ `beta_slow` (`float`, *optional*):
207
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
208
+ ramp function. If unspecified, it defaults to 1.
209
+ `short_factor` (`List[float]`, *optional*):
210
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
211
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
212
+ size divided by the number of attention heads divided by 2
213
+ `long_factor` (`List[float]`, *optional*):
214
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
215
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
216
+ size divided by the number of attention heads divided by 2
217
+ `low_freq_factor` (`float`, *optional*):
218
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
219
+ `high_freq_factor` (`float`, *optional*):
220
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
221
+
222
+ ```python
223
+ >>> from transformers import Qwen2VLForConditionalGeneration, Qwen2VLConfig
224
+
225
+ >>> # Initializing a Qwen2VL style configuration
226
+ >>> configuration = Qwen2VLConfig()
227
+
228
+ >>> # Initializing a model from the Qwen2-VL-7B style configuration
229
+ >>> model = Qwen2VLForConditionalGeneration(configuration)
230
+
231
+ >>> # Accessing the model configuration
232
+ >>> configuration = model.config
233
+ ```"""
234
+
235
+ model_type = "qwen2_vl"
236
+ keys_to_ignore_at_inference = ["past_key_values"]
237
+
238
+ def __init__(
239
+ self,
240
+ vocab_size=152064,
241
+ hidden_size=8192,
242
+ intermediate_size=29568,
243
+ num_hidden_layers=80,
244
+ num_attention_heads=64,
245
+ num_key_value_heads=8,
246
+ hidden_act="silu",
247
+ max_position_embeddings=32768,
248
+ initializer_range=0.02,
249
+ rms_norm_eps=1e-05,
250
+ use_cache=True,
251
+ tie_word_embeddings=False,
252
+ rope_theta=1000000.0,
253
+ use_sliding_window=False,
254
+ sliding_window=4096,
255
+ max_window_layers=80,
256
+ attention_dropout=0.0,
257
+ rope_scaling=None,
258
+ spatial_merge_size=2,
259
+ attn_implementation='flash_attention_2',
260
+ **kwargs,
261
+ ):
262
+
263
+ self.vocab_size = vocab_size
264
+ self.max_position_embeddings = max_position_embeddings
265
+ self.hidden_size = hidden_size
266
+ self.intermediate_size = intermediate_size
267
+ self.num_hidden_layers = num_hidden_layers
268
+ self.num_attention_heads = num_attention_heads
269
+ self.use_sliding_window = use_sliding_window
270
+ self.sliding_window = sliding_window
271
+ self.max_window_layers = max_window_layers
272
+
273
+ # for backward compatibility
274
+ if num_key_value_heads is None:
275
+ num_key_value_heads = num_attention_heads
276
+
277
+ self.num_key_value_heads = num_key_value_heads
278
+ self.hidden_act = hidden_act
279
+ self.initializer_range = initializer_range
280
+ self.rms_norm_eps = rms_norm_eps
281
+ self.use_cache = use_cache
282
+ self.rope_theta = rope_theta
283
+ self.attention_dropout = attention_dropout
284
+ self.rope_scaling = rope_scaling
285
+ self.spatial_merge_size = spatial_merge_size
286
+ self.attn_implementation = attn_implementation
287
+
288
+ # Validate the correctness of rotary position embeddings parameters
289
+ # BC: if there is a 'type' field, move it to 'rope_type'.
290
+ # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
291
+ # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
292
+ # TODO: @raushan update config in the hub
293
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
294
+ if self.rope_scaling["type"] == "mrope":
295
+ self.rope_scaling["type"] = "default"
296
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
297
+ rope_config_validation(self, ignore_keys={"mrope_section"})
298
+
299
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
300
+
301
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
302
+ def rotate_half(x):
303
+ """Rotates half the hidden dims of the input."""
304
+ x1 = x[..., : x.shape[-1] // 2]
305
+ x2 = x[..., x.shape[-1] // 2 :]
306
+ return torch.cat((-x2, x1), dim=-1)
307
+
308
+ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
309
+ """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
310
+
311
+ Explanation:
312
+ Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
313
+ sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
314
+ vision embedding part, we apply rotary position embedding on temporal, height and width dimensions separately.
315
+ Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
316
+ For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
317
+ height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
318
+ difference with modern LLMs.
319
+
320
+ Args:
321
+ q (`torch.Tensor`): The query tensor.
322
+ k (`torch.Tensor`): The key tensor.
323
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
324
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
325
+ position_ids (`torch.Tensor`):
326
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
327
+ used to pass offsetted position ids when working with a KV-cache.
328
+ mrope_section(`List(int)`):
329
+ Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
330
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
331
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
332
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
333
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
334
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
335
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
336
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
337
+ Returns:
338
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
339
+ """
340
+ mrope_section = mrope_section * 2
341
+ cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
342
+ unsqueeze_dim
343
+ )
344
+ sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
345
+ unsqueeze_dim
346
+ )
347
+
348
+ q_embed = (q * cos) + (rotate_half(q) * sin)
349
+ k_embed = (k * cos) + (rotate_half(k) * sin)
350
+ return q_embed, k_embed
351
+
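As a small, self-contained illustration of the channel split performed above (the `mrope_section` values are assumed for the example, not taken from a released config):

```python
import torch

mrope_section = [16, 24, 24]               # assumed sections; sums to head_dim // 2
head_dim = sum(mrope_section) * 2          # 128
# cos/sin carry three sets of position ids (temporal, height, width) in dim 0.
cos = torch.randn(3, 1, 10, head_dim)      # (3, batch, seq_len, head_dim)

chunks = cos.split(mrope_section * 2, dim=-1)                         # 6 channel chunks
merged = torch.cat([m[i % 3] for i, m in enumerate(chunks)], dim=-1)  # alternate t/h/w
print(merged.shape)                        # torch.Size([1, 10, 128])
```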
352
+ def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
353
+ orig_dtype = tensor.dtype
354
+ tensor = tensor.float()
355
+ cos = freqs.cos()
356
+ sin = freqs.sin()
357
+ cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
358
+ sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
359
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
360
+ output = output.to(orig_dtype)
361
+ return output
362
+
363
+
364
+ class VisionRotaryEmbedding(nn.Module):
365
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
366
+ super().__init__()
367
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
368
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
369
+
370
+ def forward(self, seqlen: int) -> torch.Tensor:
371
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
372
+ freqs = torch.outer(seq, self.inv_freq)
373
+ return freqs
374
+
375
+ class PatchEmbed(nn.Module):
376
+ def __init__(
377
+ self,
378
+ patch_size: int = 14,
379
+ temporal_patch_size: int = 2,
380
+ in_channels: int = 3,
381
+ embed_dim: int = 1152,
382
+ ) -> None:
383
+ super().__init__()
384
+ self.patch_size = patch_size
385
+ self.temporal_patch_size = temporal_patch_size
386
+ self.in_channels = in_channels
387
+ self.embed_dim = embed_dim
388
+
389
+ kernel_size = [temporal_patch_size, patch_size, patch_size]
390
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
391
+
392
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
393
+ target_dtype = self.proj.weight.dtype
394
+ hidden_states = hidden_states.view(
395
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
396
+ )
397
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
398
+ return hidden_states
399
+
400
+
401
+ class PatchMerger(nn.Module):
402
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
403
+ super().__init__()
404
+ self.hidden_size = context_dim * (spatial_merge_size**2)
405
+ self.ln_q = LayerNorm(context_dim, eps=1e-6)
406
+ self.mlp = nn.Sequential(
407
+ nn.Linear(self.hidden_size, self.hidden_size),
408
+ nn.GELU(),
409
+ nn.Linear(self.hidden_size, dim),
410
+ )
411
+
412
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
413
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
414
+ return x
415
+
416
+ class VisionMlp(nn.Module):
417
+ def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
418
+ super().__init__()
419
+ self.fc1 = nn.Linear(dim, hidden_dim)
420
+ self.act = ACT2FN[hidden_act]
421
+ self.fc2 = nn.Linear(hidden_dim, dim)
422
+
423
+ def forward(self, x) -> torch.Tensor:
424
+ return self.fc2(self.act(self.fc1(x)))
425
+
426
+
427
+ class VisionAttention(nn.Module):
428
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
429
+ super().__init__()
430
+ self.num_heads = num_heads
431
+ self.head_dim = dim // num_heads
432
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
433
+ self.proj = nn.Linear(dim, dim)
434
+
435
+ def forward(
436
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
437
+ ) -> torch.Tensor:
438
+ seq_length = hidden_states.shape[0]
439
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
440
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
441
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
442
+
443
+ attention_mask = torch.full(
444
+ [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
445
+ )
446
+ for i in range(1, len(cu_seqlens)):
447
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
448
+
449
+ q = q.transpose(0, 1)
450
+ k = k.transpose(0, 1)
451
+ v = v.transpose(0, 1)
452
+ attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
453
+ attn_weights = attn_weights + attention_mask
454
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
455
+ attn_output = torch.matmul(attn_weights, v)
456
+ attn_output = attn_output.transpose(0, 1)
457
+ attn_output = attn_output.reshape(seq_length, -1)
458
+ attn_output = self.proj(attn_output)
459
+ return attn_output
460
+
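The eager attention above emulates variable-length (per-frame) attention by zeroing the mask only inside each [cu_seqlens[i-1], cu_seqlens[i]) block; a small sketch with made-up boundaries:

import torch

cu_seqlens = torch.tensor([0, 2, 5])                        # two frames with lengths 2 and 3
seq_length = 5
mask = torch.full([1, seq_length, seq_length], float("-inf"))
for i in range(1, len(cu_seqlens)):
    mask[..., cu_seqlens[i - 1]:cu_seqlens[i], cu_seqlens[i - 1]:cu_seqlens[i]] = 0
print(mask[0])   # a 2x2 and a 3x3 zero block on the diagonal; cross-frame entries stay -inf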
461
+
462
+ class VisionFlashAttention2(nn.Module):
463
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
464
+ super().__init__()
465
+ self.num_heads = num_heads
466
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
467
+ self.proj = nn.Linear(dim, dim)
468
+
469
+ def forward(
470
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
471
+ ) -> torch.Tensor:
472
+ seq_length = hidden_states.shape[0]
473
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
474
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
475
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
476
+
477
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
478
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
479
+ seq_length, -1
480
+ )
481
+ attn_output = self.proj(attn_output)
482
+ return attn_output
483
+
484
+ QWEN2_VL_VISION_ATTENTION_CLASSES = {
485
+ "eager": VisionAttention,
486
+ "flash_attention_2": VisionFlashAttention2,
487
+ }
488
+
489
+
490
+ class Qwen2VLVisionBlock(nn.Module):
491
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
492
+ super().__init__()
493
+ self.norm1 = LayerNorm(config.embed_dim, eps=1e-6)
494
+ self.norm2 = LayerNorm(config.embed_dim, eps=1e-6)
495
+ mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
496
+
497
+ self.attn = QWEN2_VL_VISION_ATTENTION_CLASSES[attn_implementation](
498
+ config.embed_dim, num_heads=config.num_heads
499
+ )
500
+ self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
501
+
502
+ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
503
+ hidden_states = hidden_states + self.attn(
504
+ self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
505
+ )
506
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
507
+ return hidden_states
508
+
509
+ class Qwen2VLPreTrainedModel(PreTrainedModel):
510
+ config_class = Qwen2VLConfig
511
+ base_model_prefix = "model"
512
+ supports_gradient_checkpointing = True
513
+ _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
514
+ _skip_keys_device_placement = "past_key_values"
515
+ _supports_flash_attn_2 = True
516
+ _supports_sdpa = False
517
+ _supports_cache_class = True
518
+ _supports_static_cache = True
519
+
520
+ def _init_weights(self, module):
521
+ std = self.config.initializer_range
522
+ if isinstance(module, (nn.Linear, nn.Conv3d)):
523
+ module.weight.data.normal_(mean=0.0, std=std)
524
+ if module.bias is not None:
525
+ module.bias.data.zero_()
526
+ elif isinstance(module, nn.Embedding):
527
+ module.weight.data.normal_(mean=0.0, std=std)
528
+ if module.padding_idx is not None:
529
+ module.weight.data[module.padding_idx].zero_()
530
+
531
+ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
532
+ config_class = Qwen2VLVisionConfig
533
+ _no_split_modules = ["Qwen2VLVisionBlock"]
534
+
535
+ def __init__(self, config) -> None:
536
+ super().__init__(config)
537
+ self.spatial_merge_size = config.spatial_merge_size
538
+
539
+ self.patch_embed = PatchEmbed(
540
+ patch_size=config.patch_size,
541
+ temporal_patch_size=config.temporal_patch_size,
542
+ in_channels=config.in_channels,
543
+ embed_dim=config.embed_dim,
544
+ )
545
+
546
+ head_dim = config.embed_dim // config.num_heads
547
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
548
+
549
+ self.blocks = nn.ModuleList(
550
+ [Qwen2VLVisionBlock(config, config.attn_implementation) for _ in range(config.depth)]
551
+ )
552
+ self.merger = PatchMerger(
553
+ dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
554
+ )
555
+ # Initialize weights and apply final processing
556
+ self.gradient_checkpointing = False
557
+ self.post_init()
558
+
559
+ def get_dtype(self) -> torch.dtype:
560
+ return self.blocks[0].mlp.fc2.weight.dtype
561
+
562
+ def get_device(self) -> torch.device:
563
+ return self.blocks[0].mlp.fc2.weight.device
564
+
565
+ def rot_pos_emb(self, grid_thw):
566
+ pos_ids = []
567
+ for t, h, w in grid_thw:
568
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
569
+ hpos_ids = hpos_ids.reshape(
570
+ h // self.spatial_merge_size,
571
+ self.spatial_merge_size,
572
+ w // self.spatial_merge_size,
573
+ self.spatial_merge_size,
574
+ )
575
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
576
+ hpos_ids = hpos_ids.flatten()
577
+
578
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
579
+ wpos_ids = wpos_ids.reshape(
580
+ h // self.spatial_merge_size,
581
+ self.spatial_merge_size,
582
+ w // self.spatial_merge_size,
583
+ self.spatial_merge_size,
584
+ )
585
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
586
+ wpos_ids = wpos_ids.flatten()
587
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
588
+ pos_ids = torch.cat(pos_ids, dim=0)
589
+ max_grid_size = grid_thw[:, 1:].max()
590
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
591
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
592
+ return rotary_pos_emb
593
+
594
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
595
+ hidden_states = self.patch_embed(hidden_states)
596
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
597
+
598
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
599
+ dim=0, dtype=torch.int32
600
+ )
601
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
602
+
603
+ for blk in self.blocks:
604
+ if self.gradient_checkpointing and self.training:
605
+ hidden_states = self._gradient_checkpointing_func(
606
+ blk.__call__,
607
+ hidden_states,
608
+ cu_seqlens,
609
+ rotary_pos_emb,
610
+ )
611
+ else:
612
+ hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
613
+
614
+ return self.merger(hidden_states)
615
+
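A hedged example of the cu_seqlens construction in the forward above: each frame of each input contributes one h*w-long attention block (the grid_thw values are made up):

import torch
import torch.nn.functional as F

grid_thw = torch.tensor([[2, 4, 4], [1, 6, 6]])   # (t, h, w) per input, in patch units
cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
print(cu_seqlens)   # tensor([ 0, 16, 32, 68], dtype=torch.int32): attention-block boundaries, one per frame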
616
+ # class Qwen2RMSNorm(nn.Module):
617
+ # def __init__(self, hidden_size, eps=1e-6):
618
+ # """
619
+ # Qwen2RMSNorm is equivalent to T5LayerNorm
620
+ # """
621
+ # super().__init__()
622
+ # self.weight = nn.Parameter(torch.ones(hidden_size))
623
+ # self.variance_epsilon = eps
624
+ # self.normalized_shape = torch.Size((hidden_size, ))
625
+
626
+ # def forward(self, hidden_states):
627
+ # return fused_rms_norm_affine(input=hidden_states,
628
+ # weight=self.weight,
629
+ # normalized_shape=self.normalized_shape,
630
+ # eps=self.variance_epsilon,
631
+ # memory_efficient=True)
632
+
633
+ class Qwen2RMSNorm(nn.Module):
634
+ def __init__(self, hidden_size, eps=1e-6):
635
+ """
636
+ Qwen2RMSNorm is equivalent to T5LayerNorm
637
+ """
638
+ super().__init__()
639
+ self.weight = nn.Parameter(torch.ones(hidden_size))
640
+ self.variance_epsilon = eps
641
+
642
+ def forward(self, hidden_states):
643
+ input_dtype = hidden_states.dtype
644
+ hidden_states = hidden_states.to(torch.float32)
645
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
646
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
647
+ return self.weight * hidden_states.to(input_dtype)
648
+
649
+ def extra_repr(self):
650
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
651
+
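A minimal numerical check of the RMS normalization above (hidden_size=8 is arbitrary):

import torch

norm = Qwen2RMSNorm(hidden_size=8, eps=1e-6)
x = torch.randn(2, 5, 8)
ref = x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)   # x / RMS(x), no mean subtraction
assert torch.allclose(norm(x), norm.weight * ref, atol=1e-5)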
652
+ class Qwen2VLRotaryEmbedding(nn.Module):
653
+ def __init__(
654
+ self,
655
+ dim=None,
656
+ max_position_embeddings=2048,
657
+ base=10000,
658
+ device=None,
659
+ scaling_factor=1.0,
660
+ rope_type="default",
661
+ config: Optional[Qwen2VLConfig] = None,
662
+ ):
663
+ super().__init__()
664
+ # TODO (joao): remove the `if` below, only used for BC
665
+ self.rope_kwargs = {}
666
+ if config is None:
667
+ logger.warning_once(
668
+ "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the "
669
+ "`config` argument. All other arguments will be removed in v4.46"
670
+ )
671
+ self.rope_kwargs = {
672
+ "rope_type": rope_type,
673
+ "factor": scaling_factor,
674
+ "dim": dim,
675
+ "base": base,
676
+ "max_position_embeddings": max_position_embeddings,
677
+ }
678
+ self.rope_type = rope_type
679
+ self.max_seq_len_cached = max_position_embeddings
680
+ self.original_max_seq_len = max_position_embeddings
681
+ else:
682
+ # BC: "rope_type" was originally "type"
683
+ if config.rope_scaling is not None:
684
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
685
+ else:
686
+ self.rope_type = "default"
687
+ self.max_seq_len_cached = config.max_position_embeddings
688
+ self.original_max_seq_len = config.max_position_embeddings
689
+
690
+ self.config = config
691
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
692
+
693
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
694
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
695
+ self.original_inv_freq = self.inv_freq
696
+
697
+ def _dynamic_frequency_update(self, position_ids, device):
698
+ """
699
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
700
+ 1 - growing beyond the cached sequence length (allow scaling)
701
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
702
+ """
703
+ seq_len = torch.max(position_ids) + 1
704
+ if seq_len > self.max_seq_len_cached: # growth
705
+ inv_freq, self.attention_scaling = self.rope_init_fn(
706
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
707
+ )
708
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
709
+ self.max_seq_len_cached = seq_len
710
+
711
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
712
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
713
+ self.max_seq_len_cached = self.original_max_seq_len
714
+
715
+ @torch.no_grad()
716
+ def forward(self, x, position_ids):
717
+ position_ids = position_ids.permute(2, 0, 1)
718
+ if "dynamic" in self.rope_type:
719
+ self._dynamic_frequency_update(position_ids, device=x.device)
720
+
721
+ # Core RoPE block. In contrast to other models, Qwen2_VL has different position ids for thw grids
722
+ # So we expand the inv_freq to shape (3, ...)
723
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
724
+ position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
725
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
726
+ device_type = x.device.type
727
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
728
+ with torch.autocast(device_type=device_type, enabled=False):
729
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
730
+ emb = torch.cat((freqs, freqs), dim=-1)
731
+ cos = emb.cos()
732
+ sin = emb.sin()
733
+
734
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
735
+ cos = cos * self.attention_scaling
736
+ sin = sin * self.attention_scaling
737
+
738
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
739
+
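A shape-only sketch of what this rotary module returns: position_ids carries separate (t, h, w) indices, so cos/sin come out with a leading axis of 3 (bs, seq and head_dim are made up):

import torch

bs, seq, head_dim = 2, 7, 128
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))   # (head_dim // 2,)
position_ids = torch.arange(seq).float().expand(3, bs, seq)                     # same ids on all three axes here
freqs = (inv_freq[None, None, :, None].expand(3, bs, -1, 1) @ position_ids[:, :, None, :]).transpose(2, 3)
emb = torch.cat((freqs, freqs), dim=-1)
print(emb.cos().shape)   # torch.Size([3, 2, 7, 128]): one table per (t, h, w) axis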
740
+ # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2MLP
741
+ class Qwen2MLP(nn.Module):
742
+ def __init__(self, config):
743
+ super().__init__()
744
+ self.hidden_size = config.hidden_size
745
+ self.intermediate_size = config.intermediate_size
746
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
747
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
748
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
749
+ self.act_fn = ACT2FN[config.hidden_act]
750
+
751
+ def forward(self, hidden_state):
752
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
753
+
754
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
755
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
756
+ """
757
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
758
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
759
+ """
760
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
761
+ if n_rep == 1:
762
+ return hidden_states
763
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
764
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
765
+
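A quick equivalence check for repeat_kv (the shapes are arbitrary):

import torch

kv = torch.randn(2, 4, 10, 64)   # (batch, num_key_value_heads, seq_len, head_dim)
assert torch.equal(repeat_kv(kv, 3), torch.repeat_interleave(kv, repeats=3, dim=1))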
766
+ class Qwen2VLAttention(nn.Module):
767
+ """
768
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
769
+ and "Generating Long Sequences with Sparse Transformers".
770
+ """
771
+
772
+ def __init__(self, config: Qwen2VLConfig, layer_idx: Optional[int] = None):
773
+ super().__init__()
774
+ self.config = config
775
+ self.layer_idx = layer_idx
776
+ if layer_idx is None:
777
+ logger.warning_once(
778
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
779
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
780
+ "when creating this class."
781
+ )
782
+
783
+ self.hidden_size = config.hidden_size
784
+ self.num_heads = config.num_attention_heads
785
+ self.head_dim = self.hidden_size // self.num_heads
786
+ self.num_key_value_heads = config.num_key_value_heads
787
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
788
+ self.max_position_embeddings = config.max_position_embeddings
789
+ self.rope_theta = config.rope_theta
790
+ self.is_causal = True
791
+ self.attention_dropout = config.attention_dropout
792
+ self.rope_scaling = config.rope_scaling
793
+
794
+ if (self.head_dim * self.num_heads) != self.hidden_size:
795
+ raise ValueError(
796
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
797
+ f" and `num_heads`: {self.num_heads})."
798
+ )
799
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
800
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
801
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
802
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
803
+
804
+
805
+ class Qwen2VLFlashAttention2(Qwen2VLAttention):
806
+ """
807
+ Qwen2VL flash attention module, following Qwen2VL attention module. This module inherits from `Qwen2VLAttention`
808
+ as the weights of the module stays untouched. The only required change would be on the forward pass
809
+ where it needs to correctly call the public API of flash attention and deal with padding tokens
810
+ in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
811
+ config.max_window_layers layers.
812
+ """
813
+
814
+ def __init__(self, *args, **kwargs):
815
+ super().__init__(*args, **kwargs)
816
+
817
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
818
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
819
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
820
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
821
+
822
+ def forward(
823
+ self,
824
+ hidden_states: torch.Tensor,
825
+ attention_mask: Optional[torch.Tensor] = None,
826
+ position_ids: Optional[torch.LongTensor] = None,
827
+ past_key_value: Optional[Cache] = None,
828
+ output_attentions: bool = False,
829
+ use_cache: bool = False,
830
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
831
+ use_rmpad: Optional[bool] = False,
832
+ cu_seqlens: Optional[torch.Tensor] = None,
833
+ ):
834
+ """
835
+ Train:
836
+ unpad: (bsz, q_len) = (1, acc_seqlen)
837
+ pad: (bsz, q_len) = (bsz, q_len)
838
+ Test:
839
+ first_iter: (bsz, q_len) = (bsz, q_len)
840
+ other: (bsz, q_len) = (bsz, 1)
841
+ """
842
+ bsz, q_len, _ = hidden_states.size()
843
+
844
+ query_states = self.q_proj(hidden_states)
845
+ key_states = self.k_proj(hidden_states)
846
+ value_states = self.v_proj(hidden_states)
847
+
848
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
849
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
850
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
851
+
852
+ cos, sin = position_embeddings
853
+
854
+ query_states, key_states = apply_multimodal_rotary_pos_emb(
855
+ query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
856
+ )
857
+
858
+ if past_key_value is not None:
859
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
860
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
861
+
862
+ # repeat k/v heads if n_kv_heads < n_heads
863
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
864
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
865
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
866
+
867
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
868
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
869
+ # cast them back to float16 just to be sure everything works as expected.
870
+ input_dtype = query_states.dtype
871
+ if input_dtype == torch.float32:
872
+ if torch.is_autocast_enabled():
873
+ target_dtype = torch.get_autocast_gpu_dtype()
874
+ # Handle the case where the model is quantized
875
+ elif hasattr(self.config, "_pre_quantization_dtype"):
876
+ target_dtype = self.config._pre_quantization_dtype
877
+ else:
878
+ target_dtype = self.q_proj.weight.dtype
879
+
880
+ logger.warning_once(
881
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
882
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
883
+ f" {target_dtype}."
884
+ )
885
+
886
+ query_states = query_states.to(target_dtype)
887
+ key_states = key_states.to(target_dtype)
888
+ value_states = value_states.to(target_dtype)
889
+
890
+ # Reshape to the expected shape for Flash Attention
891
+ query_states = query_states.transpose(1, 2)
892
+ key_states = key_states.transpose(1, 2)
893
+ value_states = value_states.transpose(1, 2)
894
+
895
+ if use_rmpad:
896
+ max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1]).item()
897
+ attn_output = flash_attn_varlen_func(
898
+ query_states.squeeze(0), key_states.squeeze(0), value_states.squeeze(0),
899
+ cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
900
+ max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen,
901
+ dropout_p=dropout_rate,
902
+ causal=self.is_causal, window_size=(-1, -1),
903
+ )
904
+ else:
905
+ attn_output = _flash_attention_forward(
906
+ query_states, key_states, value_states,
907
+ attention_mask,
908
+ q_len,
909
+ dropout=dropout_rate,
910
+ sliding_window=None,
911
+ is_causal=self.is_causal,
912
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
913
+ )
914
+
915
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
916
+ attn_output = self.o_proj(attn_output)
917
+
918
+ if not output_attentions:
919
+ attn_weights = None
920
+
921
+ return attn_output, attn_weights, past_key_value
922
+
923
+ QWEN2_VL_ATTENTION_CLASSES = {
924
+ "flash_attention_2": Qwen2VLFlashAttention2,
925
+ }
926
+
927
+ class Qwen2VLDecoderLayer(nn.Module):
928
+ def __init__(self, config: Qwen2VLConfig, layer_idx: int):
929
+ super().__init__()
930
+ self.hidden_size = config.hidden_size
931
+
932
+ if config.attn_implementation != "flash_attention_2":
933
+ logger.error(
934
+ f"只支持 flash_attention_2!config.attn_implementation={config.attn_implementation}"
935
+ )
936
+ self.self_attn = QWEN2_VL_ATTENTION_CLASSES[config.attn_implementation](config, layer_idx)
937
+
938
+ self.mlp = Qwen2MLP(config)
939
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
940
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
941
+
942
+ def forward(
943
+ self,
944
+ hidden_states: torch.Tensor,
945
+ attention_mask: Optional[torch.Tensor] = None,
946
+ position_ids: Optional[torch.LongTensor] = None,
947
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
948
+ output_attentions: Optional[bool] = False,
949
+ use_cache: Optional[bool] = False,
950
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
951
+ use_rmpad: Optional[bool] = False,
952
+ cu_seqlens: Optional[torch.Tensor] = None,
953
+ **kwargs,
954
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
955
+ """
956
+ Args:
957
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
958
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
959
+ `(batch, sequence_length)` where padding elements are indicated by 0.
960
+ output_attentions (`bool`, *optional*):
961
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
962
+ returned tensors for more detail.
963
+ use_cache (`bool`, *optional*):
964
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
965
+ (see `past_key_values`).
966
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
967
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
968
+ Indices depicting the position of the input sequence tokens in the sequence.
969
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
970
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
971
+ with `head_dim` being the embedding dimension of each attention head.
972
+ kwargs (`dict`, *optional*):
973
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
974
+ into the model
975
+ """
976
+
977
+ residual = hidden_states
978
+
979
+ hidden_states = self.input_layernorm(hidden_states)
980
+
981
+ # Self Attention
982
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
983
+ hidden_states=hidden_states,
984
+ attention_mask=attention_mask,
985
+ position_ids=position_ids,
986
+ past_key_value=past_key_value,
987
+ output_attentions=output_attentions,
988
+ use_cache=use_cache,
989
+ position_embeddings=position_embeddings,
990
+ use_rmpad=use_rmpad,
991
+ cu_seqlens=cu_seqlens,
992
+ )
993
+ hidden_states = residual + hidden_states
994
+
995
+ # Fully Connected
996
+ residual = hidden_states
997
+ hidden_states = self.post_attention_layernorm(hidden_states)
998
+ hidden_states = self.mlp(hidden_states)
999
+ hidden_states = residual + hidden_states
1000
+
1001
+ outputs = (hidden_states,)
1002
+
1003
+ if output_attentions:
1004
+ outputs += (self_attn_weights,)
1005
+
1006
+ if use_cache:
1007
+ outputs += (present_key_value,)
1008
+
1009
+ return outputs
1010
+
1011
+ class Qwen2VLModel(Qwen2VLPreTrainedModel):
1012
+ def __init__(self, config: Qwen2VLConfig):
1013
+ super().__init__(config)
1014
+ self.padding_idx = config.pad_token_id
1015
+ self.vocab_size = config.vocab_size
1016
+
1017
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1018
+ self.layers = nn.ModuleList([Qwen2VLDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
1019
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1020
+ self.rotary_emb = Qwen2VLRotaryEmbedding(config=config)
1021
+
1022
+ self.gradient_checkpointing = False
1023
+ # Initialize weights and apply final processing
1024
+ self.post_init()
1025
+
1026
+ def get_input_embeddings(self):
1027
+ return self.embed_tokens
1028
+
1029
+ def set_input_embeddings(self, value):
1030
+ self.embed_tokens = value
1031
+
1032
+ def forward(
1033
+ self,
1034
+ input_ids: torch.LongTensor = None,
1035
+ attention_mask: Optional[torch.Tensor] = None,
1036
+ position_ids: Optional[torch.LongTensor] = None,
1037
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1038
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1039
+ use_cache: Optional[bool] = None,
1040
+ output_attentions: Optional[bool] = None,
1041
+ output_hidden_states: Optional[bool] = None,
1042
+ return_dict: Optional[bool] = None,
1043
+ use_rmpad: Optional[bool] = False,
1044
+ cu_seqlens: Optional[torch.Tensor] = None,
1045
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
1046
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1047
+ output_hidden_states = (
1048
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1049
+ )
1050
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1051
+
1052
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1053
+
1054
+ if (input_ids is None) ^ (inputs_embeds is not None):
1055
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1056
+
1057
+ if self.gradient_checkpointing and self.training:
1058
+ if use_cache:
1059
+ logger.warning_once(
1060
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1061
+ )
1062
+ use_cache = False
1063
+
1064
+
1065
+ hidden_states = inputs_embeds
1066
+
1067
+ # create position embeddings to be shared across the decoder layers
1068
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
1069
+
1070
+ # decoder layers
1071
+ all_hidden_states = () if output_hidden_states else None
1072
+ all_self_attns = () if output_attentions else None
1073
+ next_decoder_cache = None
1074
+
1075
+ for decoder_layer in self.layers:
1076
+ if output_hidden_states:
1077
+ all_hidden_states += (hidden_states,)
1078
+
1079
+ if self.gradient_checkpointing and self.training:
1080
+ layer_outputs = self._gradient_checkpointing_func(
1081
+ decoder_layer.__call__,
1082
+ hidden_states,
1083
+ attention_mask,
1084
+ position_ids,
1085
+ past_key_values,
1086
+ output_attentions,
1087
+ use_cache,
1088
+ position_embeddings,
1089
+ use_rmpad,
1090
+ cu_seqlens,
1091
+ )
1092
+ else:
1093
+ layer_outputs = decoder_layer(
1094
+ hidden_states,
1095
+ attention_mask=attention_mask,
1096
+ position_ids=position_ids,
1097
+ past_key_value=past_key_values,
1098
+ output_attentions=output_attentions,
1099
+ use_cache=use_cache,
1100
+ position_embeddings=position_embeddings,
1101
+ use_rmpad=use_rmpad,
1102
+ cu_seqlens=cu_seqlens,
1103
+ )
1104
+
1105
+ hidden_states = layer_outputs[0]
1106
+
1107
+ if use_cache:
1108
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1109
+
1110
+ if output_attentions:
1111
+ all_self_attns += (layer_outputs[1],)
1112
+
1113
+ hidden_states = self.norm(hidden_states)
1114
+
1115
+ # add hidden states from the last decoder layer
1116
+ if output_hidden_states:
1117
+ all_hidden_states += (hidden_states,)
1118
+
1119
+ next_cache = next_decoder_cache if use_cache else None
1120
+
1121
+ if not return_dict:
1122
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1123
+ return BaseModelOutputWithPast(
1124
+ last_hidden_state=hidden_states,
1125
+ past_key_values=next_cache,
1126
+ hidden_states=all_hidden_states,
1127
+ attentions=all_self_attns,
1128
+ )
1129
+
1130
+ class Qwen2VLForCausalLM(Qwen2VLPreTrainedModel, GenerationMixin):
1131
+ _tied_weights_keys = ["lm_head.weight"]
1132
+
1133
+ def __init__(self, config):
1134
+ super().__init__(config)
1135
+ self.model = Qwen2VLModel(config)
1136
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1137
+ self.padding_side = "left" # set it to left by default, user can use setter to change padding_sides
1138
+
1139
+ # Initialize weights and apply final processing
1140
+ self.post_init()
1141
+
1142
+ def get_input_embeddings(self):
1143
+ return self.model.embed_tokens
1144
+
1145
+ def set_input_embeddings(self, value):
1146
+ self.model.embed_tokens = value
1147
+
1148
+ def get_output_embeddings(self):
1149
+ return self.lm_head
1150
+
1151
+ def set_output_embeddings(self, new_embeddings):
1152
+ self.lm_head = new_embeddings
1153
+
1154
+ def set_decoder(self, decoder):
1155
+ self.model = decoder
1156
+
1157
+ def get_decoder(self):
1158
+ return self.model
1159
+
1160
+ def get_rope_index(
1161
+ self,
1162
+ input_ids: torch.LongTensor,
1163
+ image_grid_thw: Optional[torch.LongTensor] = None,
1164
+ attention_mask: Optional[torch.Tensor] = None,
1165
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1166
+ """
1167
+ Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
1168
+
1169
+ Explanation:
1170
+ Each embedding sequence contains vision embeddings and text embeddings, or just text embeddings.
1171
+
1172
+ For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
1173
+ Examples:
1174
+ input_ids: [T T T T T], here T is for text.
1175
+ temporal position_ids: [0, 1, 2, 3, 4]
1176
+ height position_ids: [0, 1, 2, 3, 4]
1177
+ width position_ids: [0, 1, 2, 3, 4]
1178
+
1179
+ For a vision-and-text embedding sequence, we calculate a 3D rotary position embedding for the vision part
1180
+ and a 1D rotary position embedding for the text part.
1181
+ Examples:
1182
+ Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
1183
+ input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
1184
+ vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
1185
+ vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
1186
+ vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
1187
+ text temporal position_ids: [3, 4, 5, 6, 7]
1188
+ text height position_ids: [3, 4, 5, 6, 7]
1189
+ text width position_ids: [3, 4, 5, 6, 7]
1190
+ Here we calculate the text start position_ids as the max vision position_ids plus 1.
1191
+
1192
+ Args:
1193
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1194
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1195
+ it.
1196
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1197
+ The temporal, height and width of feature shape of each image in LLM.
1198
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1199
+ The temporal, height and width of feature shape of each video in LLM.
1200
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1201
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1202
+
1203
+ - 1 for tokens that are **not masked**,
1204
+ - 0 for tokens that are **masked**.
1205
+
1206
+ Returns:
1207
+ position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
1208
+ mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
1209
+ """
1210
+ spatial_merge_size = self.config.spatial_merge_size
1211
+ vision_token_id = self.config.image_token_id
1212
+ vision_start_token_id = self.config.vision_start_token_id
1213
+ assert image_grid_thw is not None # TODO: check that pure-text inputs (no image grid) do not get stuck here
1214
+ total_input_ids = input_ids
1215
+ position_ids = torch.ones(
1216
+ 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device
1217
+ )
1218
+ vision_index = 0
1219
+ for i, input_ids in enumerate(total_input_ids):
1220
+ if attention_mask is not None:
1221
+ input_ids = input_ids[attention_mask[i] == 1]
1222
+ vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
1223
+ vision_num = (input_ids[vision_start_indices + 1] == vision_token_id).sum()
1224
+ input_tokens = input_ids.tolist()
1225
+ llm_pos_ids_list: list = []
1226
+ st = 0
1227
+ remain_vision_num = vision_num
1228
+ for _ in range(vision_num):
1229
+ if vision_token_id in input_tokens and remain_vision_num > 0:
1230
+ ed_vision = input_tokens.index(vision_token_id, st)
1231
+ else:
1232
+ ed_vision = len(input_tokens) + 1
1233
+
1234
+ t, h, w = (
1235
+ image_grid_thw[vision_index][0],
1236
+ image_grid_thw[vision_index][1],
1237
+ image_grid_thw[vision_index][2],
1238
+ )
1239
+ vision_index += 1
1240
+ remain_vision_num -= 1
1241
+ ed = ed_vision
1242
+
1243
+ llm_grid_t, llm_grid_h, llm_grid_w = (
1244
+ t.item(),
1245
+ h.item() // spatial_merge_size,
1246
+ w.item() // spatial_merge_size,
1247
+ )
1248
+ text_len = ed - st
1249
+
1250
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1251
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1252
+
1253
+ t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
1254
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
1255
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
1256
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
1257
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
1258
+
1259
+ if st < len(input_tokens):
1260
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1261
+ text_len = len(input_tokens) - st
1262
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1263
+
1264
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
1265
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
1266
+ position_ids = position_ids.permute(1, 2, 0)
1267
+ return position_ids
1268
+
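A small sketch reproducing the docstring's 3x2x2 example with the same index construction used above (llm_grid_t=3, llm_grid_h=2, llm_grid_w=2):

import torch

t, h, w = 3, 2, 2
t_index = torch.arange(t).view(-1, 1).expand(-1, h * w).flatten()
h_index = torch.arange(h).view(1, -1, 1).expand(t, -1, w).flatten()
w_index = torch.arange(w).view(1, 1, -1).expand(t, h, -1).flatten()
print(t_index.tolist())   # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
print(h_index.tolist())   # [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
print(w_index.tolist())   # [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]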
1269
+ def forward(
1270
+ self,
1271
+ input_ids: torch.LongTensor = None,
1272
+ attention_mask: Optional[torch.Tensor] = None,
1273
+ position_ids: Optional[torch.LongTensor] = None,
1274
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1275
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1276
+ labels: Optional[torch.LongTensor] = None,
1277
+ use_cache: Optional[bool] = None,
1278
+ output_attentions: Optional[bool] = None,
1279
+ output_hidden_states: Optional[bool] = None,
1280
+ return_dict: Optional[bool] = None,
1281
+ use_rmpad: Optional[bool] = False,
1282
+ cu_seqlens: Optional[torch.Tensor] = None,
1283
+ ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
1284
+
1285
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1286
+ output_hidden_states = (
1287
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1288
+ )
1289
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1290
+
1291
+
1292
+ outputs = self.model(
1293
+ input_ids=input_ids,
1294
+ attention_mask=attention_mask,
1295
+ position_ids=position_ids,
1296
+ past_key_values=past_key_values,
1297
+ inputs_embeds=inputs_embeds,
1298
+ use_cache=use_cache,
1299
+ output_attentions=output_attentions,
1300
+ output_hidden_states=output_hidden_states,
1301
+ return_dict=return_dict,
1302
+ use_rmpad=use_rmpad,
1303
+ cu_seqlens=cu_seqlens,
1304
+ )
1305
+
1306
+ hidden_states = outputs[0]
1307
+ logits = self.lm_head(hidden_states)
1308
+
1309
+ if not return_dict:
1310
+ output = (logits,) + outputs[1:]
1311
+ return output
1312
+
1313
+ return Qwen2VLCausalLMOutputWithPast(
1314
+ logits=logits,
1315
+ past_key_values=outputs.past_key_values,
1316
+ hidden_states=outputs.hidden_states,
1317
+ attentions=outputs.attentions,
1318
+ )
1319
+
1320
+
models/modeling_tarsier.py CHANGED
@@ -1,100 +1,30 @@
1
- # Copyright (2024) Bytedance Ltd. and/or its affiliates
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- # copy and modify from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
16
- """ PyTorch Llava model."""
17
  from dataclasses import dataclass
18
- from typing import List, Optional, Tuple, Union
19
  import math
20
- import numpy as np
21
 
22
- import torch
23
  import torch.utils.checkpoint
24
  from torch import nn
25
  import torch.nn.functional as F
26
 
27
- from transformers import PreTrainedModel
28
  from transformers.activations import ACT2FN
29
  from transformers.cache_utils import Cache
30
  from transformers.modeling_outputs import ModelOutput
31
- from transformers.utils import (
32
- add_start_docstrings,
33
- add_start_docstrings_to_model_forward,
34
- logging,
35
- replace_return_docstrings,
36
- )
37
- from transformers.models.auto import AutoModel, AutoModelForCausalLM, CONFIG_MAPPING
38
- from transformers import LlamaForCausalLM
39
  from transformers.configuration_utils import PretrainedConfig
 
 
 
40
 
 
 
 
 
41
 
42
  logger = logging.get_logger(__name__)
43
 
44
- LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
45
- "llava-hf/llava-v1.5-7b": "https://huggingface.co/llava-hf/llava-v1.5-7b/resolve/main/config.json",
46
- }
47
 
48
  class LlavaConfig(PretrainedConfig):
49
- r"""
50
- This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to instantiate an
51
- Llava model according to the specified arguments, defining the model architecture. Instantiating a configuration
52
- with the defaults will yield a similar configuration to that of the Llava-9B.
53
-
54
- e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b)
55
-
56
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
57
- documentation from [`PretrainedConfig`] for more information.
58
-
59
- Args:
60
- vision_config (`LlavaVisionConfig`, *optional*):
61
- Custom vision config or dict
62
- text_config (`Union[AutoConfig, dict]`, *optional*):
63
- The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
64
- ignore_index (`int`, *optional*, defaults to -100):
65
- The ignore index for the loss function.
66
- image_token_index (`int`, *optional*, defaults to 32000):
67
- The image token index to encode the image prompt.
68
- projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
69
- The activation function used by the multimodal projector.
70
- vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
71
- The feature selection strategy used to select the vision feature from the CLIP backbone.
72
- vision_feature_layer (`int`, *optional*, defaults to -2):
73
- The index of the layer to select the vision feature.
74
- vocab_size (`int`, *optional*, defaults to 32000):
75
- Vocabulary size of the Llava model. Defines the number of different tokens that can be represented by the
76
- `inputs_ids` passed when calling [`~LlavaForConditionalGeneration`]
77
-
78
- Example:
79
-
80
- ```python
81
- >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig
82
-
83
- >>> # Initializing a CLIP-vision config
84
- >>> vision_config = CLIPVisionConfig()
85
-
86
- >>> # Initializing a Llama config
87
- >>> text_config = LlamaConfig()
88
-
89
- >>> # Initializing a Llava llava-1.5-7b style configuration
90
- >>> configuration = LlavaConfig(vision_config, text_config)
91
-
92
- >>> # Initializing a model from the llava-1.5-7b style configuration
93
- >>> model = LlavaForConditionalGeneration(configuration)
94
-
95
- >>> # Accessing the model configuration
96
- >>> configuration = model.config
97
- ```"""
98
 
99
  model_type = "llava"
100
  is_composition = False
@@ -108,9 +38,9 @@ class LlavaConfig(PretrainedConfig):
108
  projector_hidden_act="gelu",
109
  vision_feature_select_strategy="default",
110
  vision_feature_layer=-2,
111
- vocab_size=32000,
112
  image_newline_idx=32002,
113
  image_new_idx=32003,
 
114
  **kwargs,
115
  ):
116
  self.ignore_index = ignore_index
@@ -118,9 +48,9 @@ class LlavaConfig(PretrainedConfig):
118
  self.projector_hidden_act = projector_hidden_act
119
  self.vision_feature_select_strategy = vision_feature_select_strategy
120
  self.vision_feature_layer = vision_feature_layer
121
- self.vocab_size = vocab_size
122
  self.image_newline_idx = image_newline_idx
123
  self.image_new_idx = image_new_idx
 
124
 
125
  self.vision_config = vision_config
126
 
@@ -128,142 +58,166 @@ class LlavaConfig(PretrainedConfig):
128
  vision_config["model_type"] = (
129
  vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
130
  )
131
- self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
132
- elif vision_config is None:
133
- self.vision_config = CONFIG_MAPPING["clip_vision_model"](
134
- intermediate_size=4096,
135
- hidden_size=1024,
136
- patch_size=14,
137
- image_size=336,
138
- num_hidden_layers=24,
139
- num_attention_heads=16,
140
- vocab_size=32000,
141
- projection_dim=768,
142
- )
143
- self.vocab_size = self.vocab_size
144
-
145
  self.text_config = text_config
146
 
147
  if isinstance(self.text_config, dict):
148
  text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
149
- self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
150
- self.vocab_size = self.text_config.vocab_size
151
- elif text_config is None:
152
- self.text_config = CONFIG_MAPPING["llama"]()
 
 
 
 
 
153
 
154
  super().__init__(**kwargs)
155
 
156
 
157
- logger = logging.get_logger(__name__)
158
-
159
- _CONFIG_FOR_DOC = "LlavaConfig"
160
-
161
- LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST = [
162
- "llava-hf/llava-1.5-7b-hf",
163
- "llava-hf/llava-1.5-13b-hf",
164
- "llava-hf/bakLlava-v1-hf",
165
- # See all Llava models at https://huggingface.co/models?filter=llava
166
- ]
167
-
168
-
169
- class Llava3DPositionalEncoding(nn.Module):
170
- def __init__(self, num_pos, dim) -> None:
171
- super().__init__()
172
- dim1, dim2, dim3 = self.split_dim(dim)
173
- frame_position_encodings = self.create_sinusoidal_positions(num_pos, dim1)
174
- height_position_encodings = self.create_sinusoidal_positions(num_pos, dim2)
175
- width_position_encodings = self.create_sinusoidal_positions(num_pos, dim3)
176
-
177
- self.register_buffer('frame_position_encodings', frame_position_encodings, persistent=False)
178
- self.register_buffer('height_position_encodings', height_position_encodings, persistent=False)
179
- self.register_buffer('width_position_encodings', width_position_encodings, persistent=False)
180
-
181
- def split_dim(self, dim):
182
- dim1 = dim // 3
183
- if dim1 % 2 != 0:
184
- dim1 -= 1
185
-
186
- dim2 = dim // 3
187
- if dim2 % 2 != 0:
188
- dim2 -= 1
189
-
190
- dim3 = dim - dim1 - dim2
191
- return dim1, dim2, dim3
192
-
193
- def create_sinusoidal_positions(self, num_pos: int, dim: int) -> torch.Tensor:
194
- inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2) / dim))
195
- sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.float), inv_freq).float()
196
- return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)
197
-
198
- def forward(self, frame_position_ids, height_position_ids, width_position_ids):
199
- frame_position_embeds = F.embedding(frame_position_ids, self.frame_position_encodings)
200
- height_position_embeds = F.embedding(height_position_ids, self.height_position_encodings)
201
- width_position_embeds = F.embedding(width_position_ids, self.width_position_encodings)
202
-
203
- return torch.cat([frame_position_embeds, height_position_embeds, width_position_embeds], dim = -1)
204
-
205
 
206
  @dataclass
207
  # Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Llava
208
  class LlavaCausalLMOutputWithPast(ModelOutput):
209
- """
210
- Base class for Llava causal language model (or autoregressive) outputs.
211
-
212
- Args:
213
- loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
214
- Language modeling loss (for next-token prediction).
215
- logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
216
- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
217
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
218
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
219
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
220
-
221
- Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
222
- `past_key_values` input) to speed up sequential decoding.
223
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
224
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
225
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
226
-
227
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
228
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
229
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
230
- sequence_length)`.
231
-
232
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
233
- heads.
234
- image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
235
- Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
236
- sequence_length, hidden_size)`.
237
-
238
- image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
239
- """
240
 
241
  loss: Optional[torch.FloatTensor] = None
242
  logits: torch.FloatTensor = None
243
  past_key_values: Optional[List[torch.FloatTensor]] = None
244
  hidden_states: Optional[Tuple[torch.FloatTensor]] = None
245
  attentions: Optional[Tuple[torch.FloatTensor]] = None
246
- image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
247
- vision_outputs: Optional[torch.FloatTensor] = None
248
- llm_attn_mask: Optional[Tuple[torch.FloatTensor]] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
 
251
  class LlavaMultiModalProjector(nn.Module):
252
  def __init__(self, config: LlavaConfig):
253
  super().__init__()
 
254
 
255
  self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
256
  self.act = ACT2FN[config.projector_hidden_act]
257
  self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
258
 
259
- def forward(self, image_features):
260
- hidden_states = self.linear_1(image_features)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  hidden_states = self.act(hidden_states)
262
  hidden_states = self.linear_2(hidden_states)
 
 
 
 
263
  return hidden_states
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
- TARSIER_START_DOCSTRING = r"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
268
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
269
  etc.)
@@ -279,23 +233,17 @@ TARSIER_START_DOCSTRING = r"""
279
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
280
  """
281
 
282
-
283
- @add_start_docstrings(
284
- "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
285
- TARSIER_START_DOCSTRING,
286
- )
287
  class TarsierPreTrainedModel(PreTrainedModel):
288
  config_class = LlavaConfig
289
- base_model_prefix = "model"
290
- supports_gradient_checkpointing = True
291
- _no_split_modules = ["LlavaVisionAttention"]
292
  _skip_keys_device_placement = "past_key_values"
293
  _supports_flash_attn_2 = True
 
 
 
294
 
295
  def _init_weights(self, module):
296
- # important: this ported version of Llava isn't meant for training from scratch - only
297
- # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
298
- # https://github.com/haotian-liu/LLaVA/tree/main/llava should serve for that purpose
299
  std = (
300
  self.config.initializer_range
301
  if hasattr(self.config, "initializer_range")
@@ -305,7 +253,7 @@ class TarsierPreTrainedModel(PreTrainedModel):
305
  if hasattr(module, "class_embedding"):
306
  module.class_embedding.data.normal_(mean=0.0, std=std)
307
 
308
- if isinstance(module, (nn.Linear, nn.Conv2d)):
309
  module.weight.data.normal_(mean=0.0, std=std)
310
  if module.bias is not None:
311
  module.bias.data.zero_()
@@ -313,98 +261,39 @@ class TarsierPreTrainedModel(PreTrainedModel):
313
  module.weight.data.normal_(mean=0.0, std=std)
314
  if module.padding_idx is not None:
315
  module.weight.data[module.padding_idx].zero_()
316
-
 
 
 
317
  @property
318
- def _supports_sdpa(self):
319
- """
320
- Retrieve language_model's attribute to check whether the model supports
321
- SDPA or not.
322
- """
323
- return self.language_model._supports_sdpa
324
-
325
-
326
- TARSIER_INPUTS_DOCSTRING = r"""
327
- Args:
328
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
329
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
330
- it.
331
-
332
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
333
- [`PreTrainedTokenizer.__call__`] for details.
334
-
335
- [What are input IDs?](../glossary#input-ids)
336
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
337
- The tensors corresponding to the input images. Pixel values can be obtained using
338
- [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses
339
- [`CLIPImageProcessor`] for processing images).
340
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
341
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
342
-
343
- - 1 for tokens that are **not masked**,
344
- - 0 for tokens that are **masked**.
345
-
346
- [What are attention masks?](../glossary#attention-mask)
347
-
348
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
349
- [`PreTrainedTokenizer.__call__`] for details.
350
-
351
- If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
352
- `past_key_values`).
353
-
354
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
355
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
356
- information on the default strategy.
357
-
358
- - 1 indicates the head is **not masked**,
359
- - 0 indicates the head is **masked**.
360
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
361
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
362
- config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
363
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
364
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
365
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
366
- `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
367
-
368
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
369
- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
370
-
371
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
372
- don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
373
- `decoder_input_ids` of shape `(batch_size, sequence_length)`.
374
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
375
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
376
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
377
- model's internal embedding lookup matrix.
378
- use_cache (`bool`, *optional*):
379
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
380
- `past_key_values`).
381
- output_attentions (`bool`, *optional*):
382
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
383
- tensors for more detail.
384
- output_hidden_states (`bool`, *optional*):
385
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
386
- more detail.
387
- return_dict (`bool`, *optional*):
388
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
389
- """
390
 
391
 
392
- @add_start_docstrings(
393
- """The LLAVA model which consists of a vision backbone and a language model.""",
394
- TARSIER_INPUTS_DOCSTRING,
395
- )
396
- class TarsierForConditionalGeneration(TarsierPreTrainedModel):
397
  def __init__(self, config: LlavaConfig):
398
  super().__init__(config)
399
  self.vision_tower = AutoModel.from_config(config.vision_config, trust_remote_code=True)
400
- self.multi_modal_projector = LlavaMultiModalProjector(config)
401
- self.vocab_size = config.vocab_size
402
- self.language_model = AutoModelForCausalLM.from_config(config.text_config, attn_implementation="flash_attention_2")
403
- image_newline_idx = torch.tensor([config.image_newline_idx], dtype=torch.long)
404
- image_new_idx = torch.tensor([config.image_new_idx], dtype=torch.long)
405
- self.register_buffer('image_newline_idx', image_newline_idx, persistent=False)
406
- self.register_buffer('image_new_idx', image_new_idx, persistent=False)
407
- self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
408
  self.post_init()
409
 
410
  def get_input_embeddings(self):
@@ -432,231 +321,81 @@ class TarsierForConditionalGeneration(TarsierPreTrainedModel):
432
  model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
433
  # update vocab size
434
  self.config.text_config.vocab_size = model_embeds.num_embeddings
435
- self.config.vocab_size = model_embeds.num_embeddings
436
- self.vocab_size = model_embeds.num_embeddings
437
  return model_embeds
438
 
439
- def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
440
- num_images, num_image_patches, embed_dim = image_features.shape
441
-
442
- batch_size, sequence_length = input_ids.shape
443
- left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
444
- # 1. Create a mask to know where special image tokens are
445
- special_image_token_mask = input_ids == self.config.image_token_index
446
- num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
447
- # Compute the maximum embed dimension
448
- max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
449
- batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)
450
-
451
- # 2. Compute the positions where text should be written
452
- # Calculate new positions for text tokens in merged image-text sequence.
453
- # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
454
- # `torch.cumsum` computes how each image token shifts subsequent text token positions.
455
- # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
456
- new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
457
- nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
458
- if left_padding:
459
- new_token_positions += nb_image_pad[:, None] # offset for left padding
460
- text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
461
-
462
- # 3. Create the full embedding, already padded to the maximum position
463
- final_embedding = torch.zeros(
464
- batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
465
- )
466
- final_attention_mask = torch.zeros(
467
- batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
468
- )
469
- if labels is not None:
470
- final_labels = torch.full(
471
- (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device
472
- )
473
- # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
474
- # set the corresponding tensors into their correct target device.
475
- target_device = inputs_embeds.device
476
- batch_indices, non_image_indices, text_to_overwrite = (
477
- batch_indices.to(target_device),
478
- non_image_indices.to(target_device),
479
- text_to_overwrite.to(target_device),
480
- )
481
- attention_mask = attention_mask.to(target_device)
482
-
483
- # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
484
- # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
485
- final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
486
- final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
487
- if labels is not None:
488
- final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
489
-
490
- # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling
491
- image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
492
- image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
493
-
494
- if image_to_overwrite.sum() != image_features.shape[:-1].numel():
495
- raise ValueError(
496
- f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
497
- f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
498
- )
499
-
500
- final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
501
- final_attention_mask |= image_to_overwrite
502
- position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
503
-
504
- if labels is None:
505
- final_labels = None
506
-
507
- return final_embedding, final_attention_mask, final_labels, position_ids
508
-
509
- def add_split_tokens(self, image_features):
510
- num_images, num_image_patches, embed_dim = image_features.shape
511
- num_height_patches, num_width_patches = int(math.sqrt(num_image_patches)), int(math.sqrt(num_image_patches))
512
-
513
- # add image_newline
514
- image_newline = self.get_input_embeddings()(self.image_newline_idx).squeeze()
515
- image_features = image_features.view(num_images, num_height_patches, num_width_patches, embed_dim)
516
- image_features = torch.cat([
517
- image_features,
518
- image_newline.expand((num_images, num_height_patches, 1, embed_dim)).to(device=image_features.device)
519
- ], dim=2)
520
- num_image_patches += num_height_patches
521
- image_features = image_features.view(num_images, num_image_patches, embed_dim)
522
-
523
- # add image_new
524
- image_new = self.get_input_embeddings()(self.image_new_idx).squeeze()
525
- image_features = torch.cat([
526
- image_features,
527
- image_new.expand((num_images, 1, embed_dim)).to(device=image_features.device)
528
- ], dim = 1)
529
-
530
- return image_features
531
-
532
- @add_start_docstrings_to_model_forward(TARSIER_INPUTS_DOCSTRING)
533
- @replace_return_docstrings(output_type=LlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
534
  def forward(
535
  self,
536
  input_ids: torch.LongTensor = None,
537
- pixel_values: torch.FloatTensor = None,
538
  attention_mask: Optional[torch.Tensor] = None,
539
  position_ids: Optional[torch.LongTensor] = None,
 
 
540
  past_key_values: Optional[List[torch.FloatTensor]] = None,
541
- inputs_embeds: Optional[torch.FloatTensor] = None,
542
- vision_feature_layer: Optional[int] = None,
543
- vision_feature_select_strategy: Optional[str] = None,
544
  labels: Optional[torch.LongTensor] = None,
 
545
  use_cache: Optional[bool] = None,
546
  output_attentions: Optional[bool] = None,
547
  output_hidden_states: Optional[bool] = None,
548
  return_dict: Optional[bool] = None,
 
549
  **kwargs,
550
  ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
551
- r"""
552
- Args:
553
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
554
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
555
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
556
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
557
-
558
- Returns:
559
-
560
- Example:
561
-
562
- ```python
563
- >>> from PIL import Image
564
- >>> import requests
565
- >>> from transformers import AutoProcessor, LlavaForConditionalGeneration
566
-
567
- >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
568
- >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
569
-
570
- >>> prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
571
- >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
572
- >>> image = Image.open(requests.get(url, stream=True).raw)
573
-
574
- >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
575
-
576
- >>> # Generate
577
- >>> generate_ids = model.generate(**inputs, max_length=30)
578
- >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
579
- "\nUSER: What's the content of the image?\nASSISTANT: The image features a stop sign on a street corner"
580
- ```"""
581
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
582
  output_hidden_states = (
583
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
584
  )
585
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
586
- vision_feature_layer = (
587
- vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
588
- )
589
- vision_feature_select_strategy = (
590
- vision_feature_select_strategy
591
- if vision_feature_select_strategy is not None
592
- else self.config.vision_feature_select_strategy
593
- )
594
 
 
 
595
  image_features = None
596
- if inputs_embeds is None:
597
- # 1. Extract the input embeddings
598
- inputs_embeds = self.get_input_embeddings()(input_ids)
599
-
600
- # 2. Merge text and images
601
- if pixel_values is not None and input_ids.shape[1] != 1:
602
- pixel_values = pixel_values.to(dtype=self.vision_tower.dtype)
603
  image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
604
- # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated.
605
- selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
606
-
607
- if vision_feature_select_strategy == "default":
608
- selected_image_feature = selected_image_feature[:, 1:]
609
- elif vision_feature_select_strategy == "full":
610
- selected_image_feature = selected_image_feature
611
- else:
612
- raise ValueError(
613
- f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
614
- )
615
-
616
- image_features = self.multi_modal_projector(selected_image_feature)
617
-
618
- special_image_token_mask = input_ids == self.config.image_token_index
619
- num_special_image_tokens = torch.sum(special_image_token_mask, dim = -1)
620
-
621
- image_features = self.add_split_tokens(image_features)
622
-
623
- if sum(num_special_image_tokens) > 0:
624
- # print(f'num_special_image_tokens: {num_special_image_tokens}')
625
- inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
626
- image_features, inputs_embeds, input_ids, attention_mask, labels
627
- )
628
- else:
629
- inputs_embeds = image_features.sum(dim=(0,1))[None, None, :] * 0. + inputs_embeds
630
-
631
- if labels is None:
632
- labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
633
  else:
634
- # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
635
- # generation with cache
636
- if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
637
- # Retrieve the first layer to inspect the logits and mask out the hidden states
638
- # that are set to 0
639
- first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
640
-
641
- # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
642
- batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
643
-
644
- # Get the target length
645
- target_seqlen = first_layer_past_key_value.shape[-1] + 1
646
- extended_attention_mask = torch.ones(
647
- (attention_mask.shape[0], target_seqlen),
648
- dtype=attention_mask.dtype,
649
- device=attention_mask.device,
650
- )
651
-
652
- extended_attention_mask[batch_index, non_attended_tokens] = 0
653
-
654
- valid_indices = torch.ones_like(attention_mask)
655
- valid_indices[:, 0] = target_seqlen - extended_attention_mask.sum(dim=-1)
656
- valid_indices = torch.cumsum(valid_indices, dim=-1)
657
- extended_attention_mask = extended_attention_mask.scatter(1, valid_indices, attention_mask)
658
- attention_mask = extended_attention_mask
659
- position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
660
  outputs = self.language_model(
661
  attention_mask=attention_mask,
662
  position_ids=position_ids,
@@ -665,27 +404,35 @@ class TarsierForConditionalGeneration(TarsierPreTrainedModel):
665
  use_cache=use_cache,
666
  output_attentions=output_attentions,
667
  output_hidden_states=output_hidden_states,
668
- # use_rmpad=kwargs.get("use_rmpad", False),
669
  return_dict=return_dict,
 
 
670
  )
671
 
672
  logits = outputs[0]
673
 
674
  loss = None
675
  if labels is not None:
676
- # Shift so that tokens < n predict n
677
- if attention_mask is not None:
678
- shift_attention_mask = attention_mask[..., 1:]
679
- shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
680
- shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
 
 
 
681
  else:
 
682
  shift_logits = logits[..., :-1, :].contiguous()
683
  shift_labels = labels[..., 1:].contiguous()
684
- # Flatten the tokens
685
- loss_fct = nn.CrossEntropyLoss()
686
- loss = loss_fct(
687
- shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
688
- )
 
 
 
689
 
690
  if not return_dict:
691
  output = (logits,) + outputs[1:]
@@ -697,61 +444,59 @@ class TarsierForConditionalGeneration(TarsierPreTrainedModel):
697
  past_key_values=outputs.past_key_values,
698
  hidden_states=outputs.hidden_states,
699
  attentions=outputs.attentions,
700
- llm_attn_mask=attention_mask
701
  )
702
 
703
  def prepare_inputs_for_generation(
704
- self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
 
 
 
 
 
 
 
 
 
705
  ):
706
  if past_key_values is not None:
707
- if isinstance(past_key_values, Cache):
708
- cache_length = past_key_values.get_seq_length()
709
- past_length = past_key_values.seen_tokens
710
- else:
711
- cache_length = past_length = past_key_values[0][0].shape[2]
712
-
713
- # Keep only the unprocessed tokens:
714
- # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
715
- # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
716
- # input)
717
- if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
718
- input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
719
- # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
720
- # input_ids based on the past_length.
721
- elif past_length < input_ids.shape[1]:
722
- input_ids = input_ids[:, past_length:]
723
- # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
724
- elif self.config.image_token_index in input_ids:
725
- input_ids = input_ids[:, input_ids.shape[1] - 1 :]
726
- # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
727
- # older attention values, as their corresponding values are not part of the input.
728
- if cache_length < past_length and attention_mask is not None:
729
- attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
730
-
731
- position_ids = kwargs.get("position_ids", None)
732
- if attention_mask is not None and position_ids is None:
733
- # create position_ids on the fly for batch generation
734
- position_ids = attention_mask.long().cumsum(-1) - 1
735
- position_ids.masked_fill_(attention_mask == 0, 1)
736
- if past_key_values:
737
- position_ids = position_ids[:, -input_ids.shape[1] :]
738
-
739
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
740
- if inputs_embeds is not None and past_key_values is None:
741
- model_inputs = {"inputs_embeds": inputs_embeds}
742
  else:
743
- model_inputs = {"input_ids": input_ids}
744
-
745
- model_inputs.update(
746
- {
747
- "position_ids": position_ids,
748
- "past_key_values": past_key_values,
749
- "use_cache": kwargs.get("use_cache"),
750
- "attention_mask": attention_mask,
751
- "pixel_values": pixel_values,
752
- }
753
- )
754
  return model_inputs
755
 
756
- def _reorder_cache(self, *args, **kwargs):
757
- return self.language_model._reorder_cache(*args, **kwargs)
1
  from dataclasses import dataclass
2
+ from typing import List, Optional, Tuple, Union, Dict, Any
3
  import math
 
4
 
 
5
  import torch.utils.checkpoint
6
  from torch import nn
7
  import torch.nn.functional as F
8
 
9
+ from transformers import PreTrainedModel, AutoConfig, AutoModel
10
  from transformers.activations import ACT2FN
11
  from transformers.cache_utils import Cache
12
  from transformers.modeling_outputs import ModelOutput
13
+ from transformers.utils import logging
 
 
 
 
 
 
 
14
  from transformers.configuration_utils import PretrainedConfig
15
+ from transformers.dynamic_module_utils import get_class_from_dynamic_module
16
+ from transformers.models.auto import AutoModel, AutoModelForCausalLM, CONFIG_MAPPING
17
+ from transformers.generation import GenerationMixin
18
 
19
+ from transformers import LlamaForCausalLM, Qwen2ForCausalLM
20
+ # from models.modeling_qwen2 import Qwen2ForCausalLM
21
+ from models.modeling_qwen2_vl_fast import Qwen2VLForCausalLM
22
+ from models.utils import _pad_input, _unpad_input
23
 
24
  logger = logging.get_logger(__name__)
25
 
 
 
 
26
 
27
  class LlavaConfig(PretrainedConfig):
28
 
29
  model_type = "llava"
30
  is_composition = False
 
38
  projector_hidden_act="gelu",
39
  vision_feature_select_strategy="default",
40
  vision_feature_layer=-2,
 
41
  image_newline_idx=32002,
42
  image_new_idx=32003,
43
+ projection_head="MLP",
44
  **kwargs,
45
  ):
46
  self.ignore_index = ignore_index
 
48
  self.projector_hidden_act = projector_hidden_act
49
  self.vision_feature_select_strategy = vision_feature_select_strategy
50
  self.vision_feature_layer = vision_feature_layer
 
51
  self.image_newline_idx = image_newline_idx
52
  self.image_new_idx = image_new_idx
53
+ self.projection_head = projection_head
54
 
55
  self.vision_config = vision_config
56
 
 
58
  vision_config["model_type"] = (
59
  vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
60
  )
61
+ if 'auto_map' in vision_config:
62
+ repo_id, class_ref = vision_config['auto_map']['AutoConfig'].split("--")
63
+ config_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs)
64
+ self.vision_config = config_class(**vision_config)
65
+ elif vision_config["model_type"] in CONFIG_MAPPING:
66
+ self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
67
+ else:
68
+ raise ValueError(f'vision_config["model_type"] = {vision_config["model_type"]} not supported!')
69
+
 
 
 
 
 
70
  self.text_config = text_config
71
 
72
  if isinstance(self.text_config, dict):
73
  text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
74
+ if 'auto_map' in text_config:
75
+ repo_id, class_ref = text_config['auto_map']['AutoConfig'].split("--")
76
+ config_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs)
77
+ self.text_config = config_class(**text_config)
78
+ elif text_config["model_type"] in CONFIG_MAPPING:
79
+ self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
80
+ else:
81
+ raise ValueError(f'text_config["model_type"] = {text_config["model_type"]} not supported!')
82
+
83
 
84
  super().__init__(**kwargs)
85
 
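The constructor above promotes plain-dict `vision_config` / `text_config` values to concrete config classes, either through `CONFIG_MAPPING` or through a dynamic `auto_map` entry. A minimal sketch of instantiating it with built-in model types follows; the hidden sizes and other field values are illustrative assumptions, not the shipped Tarsier2 config:

```python
# Illustrative only: model types resolve via CONFIG_MAPPING; field values are assumptions.
from models.modeling_tarsier import LlavaConfig

vision_config = {"model_type": "clip_vision_model", "hidden_size": 1024, "image_size": 336, "patch_size": 14}
text_config = {"model_type": "qwen2", "hidden_size": 3584, "vocab_size": 152064}

config = LlavaConfig(
    vision_config=vision_config,
    text_config=text_config,
    projection_head="MLP",  # or "Pixel_Shuffle" / "auto_map" / None
)
print(type(config.vision_config).__name__)  # CLIPVisionConfig
print(type(config.text_config).__name__)    # Qwen2Config
```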
86
 
87
 
88
  @dataclass
89
  # Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Llava
90
  class LlavaCausalLMOutputWithPast(ModelOutput):
91
 
92
  loss: Optional[torch.FloatTensor] = None
93
  logits: torch.FloatTensor = None
94
  past_key_values: Optional[List[torch.FloatTensor]] = None
95
  hidden_states: Optional[Tuple[torch.FloatTensor]] = None
96
  attentions: Optional[Tuple[torch.FloatTensor]] = None
97
+ position_ids: Optional[torch.LongTensor] = None
98
+
99
+ def add_split_tokens(image_features, image_newline_embed, image_new_embed):
100
+ num_images, num_image_patches, embed_dim = image_features.shape
101
+ num_height_patches, num_width_patches = int(math.sqrt(num_image_patches)), int(math.sqrt(num_image_patches))
102
+
103
+ # add image_newline
104
+ image_features = image_features.view(num_images, num_height_patches, num_width_patches, embed_dim)
105
+ image_features = torch.cat([
106
+ image_features,
107
+ image_newline_embed.expand((num_images, num_height_patches, 1, embed_dim))
108
+ ], dim=2)
109
+ num_image_patches += num_height_patches
110
+ image_features = image_features.view(num_images, num_image_patches, embed_dim)
111
+
112
+ # add image_new
113
+ image_features = torch.cat([
114
+ image_features,
115
+ image_new_embed.expand((num_images, 1, embed_dim))
116
+ ], dim = 1)
117
+
118
+ return image_features
119
 
120
 
121
  class LlavaMultiModalProjector(nn.Module):
122
  def __init__(self, config: LlavaConfig):
123
  super().__init__()
124
+ self.config = config
125
 
126
  self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
127
  self.act = ACT2FN[config.projector_hidden_act]
128
  self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
129
 
130
+ image_newline_idx = torch.tensor([config.image_newline_idx], dtype=torch.long)
131
+ image_new_idx = torch.tensor([config.image_new_idx], dtype=torch.long)
132
+ self.register_buffer('image_newline_idx', image_newline_idx, persistent=False)
133
+ self.register_buffer('image_new_idx', image_new_idx, persistent=False)
134
+
135
+
136
+ def forward(self, image_features, input_embeddings):
137
+
138
+ selected_image_feature = image_features[self.config.vision_feature_layer]
139
+
140
+ if self.config.vision_feature_select_strategy == "default":
141
+ selected_image_feature = selected_image_feature[:, 1:]
142
+ elif self.config.vision_feature_select_strategy == "full":
143
+ selected_image_feature = selected_image_feature
144
+ else:
145
+ raise ValueError(
146
+ f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
147
+ )
148
+
149
+ hidden_states = self.linear_1(selected_image_feature)
150
  hidden_states = self.act(hidden_states)
151
  hidden_states = self.linear_2(hidden_states)
152
+
153
+ image_newline_embed = input_embeddings(self.image_newline_idx).squeeze()
154
+ image_new_embed = input_embeddings(self.image_new_idx).squeeze()
155
+ hidden_states = add_split_tokens(hidden_states, image_newline_embed, image_new_embed)
156
  return hidden_states
157
 
158
+ class PixelShuffleMultiModalProjector(nn.Module):
159
+ def __init__(self, config: LlavaConfig):
160
+ super().__init__()
161
+ self.config = config
162
+
163
+ self.downsample_ratio = 0.5
164
+ vit_hidden_size = config.vision_config.hidden_size
165
+ llm_hidden_size = config.text_config.hidden_size
166
+
167
+ self.mlp = nn.Sequential(
168
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
169
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
170
+ nn.GELU(),
171
+ nn.Linear(llm_hidden_size, llm_hidden_size)
172
+ )
173
 
174
+ image_newline_idx = torch.tensor([config.image_newline_idx], dtype=torch.long)
175
+ image_new_idx = torch.tensor([config.image_new_idx], dtype=torch.long)
176
+ self.register_buffer('image_newline_idx', image_newline_idx, persistent=False)
177
+ self.register_buffer('image_new_idx', image_new_idx, persistent=False)
178
+
179
+ def forward(self, image_features, input_embeddings):
180
+ selected_image_feature = image_features[self.config.vision_feature_layer]
181
+
182
+ if self.config.vision_feature_select_strategy == "default":
183
+ selected_image_feature = selected_image_feature[:, 1:]
184
+ elif self.config.vision_feature_select_strategy == "full":
185
+ selected_image_feature = selected_image_feature
186
+ else:
187
+ raise ValueError(
188
+ f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
189
+ )
190
+
191
+ image_features = self.pixel_shuffle(selected_image_feature)
192
+ hidden_states = self.mlp(image_features)
193
+
194
+ image_newline_embed = input_embeddings(self.image_newline_idx).squeeze()
195
+ image_new_embed = input_embeddings(self.image_new_idx).squeeze()
196
+ hidden_states = add_split_tokens(hidden_states, image_newline_embed, image_new_embed)
197
+
198
+ return hidden_states
199
+
200
+ def pixel_shuffle(self, x, scale_factor=0.5):
201
+ if scale_factor == 1:
202
+ return x
203
+ n, wh, c = x.shape
204
+ h, w = int(math.sqrt(wh)), int(math.sqrt(wh))
205
+ x = x.view(n, h, w, c)
206
+
207
+ n, w, h, c = x.size()
208
+ # N, W, H, C --> N, W, H * scale, C // scale
209
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
210
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
211
+ x = x.permute(0, 2, 1, 3).contiguous()
212
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
213
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
214
+ int(c / (scale_factor * scale_factor)))
215
+ x = x.permute(0, 2, 1, 3).contiguous()
216
+ x = x.view(x.shape[0], -1, x.shape[-1])
217
+ return x
218
+
219
+
220
+ LLAVA_START_DOCSTRING = r"""
221
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
222
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
223
  etc.)
 
233
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
234
  """
235
 
 
 
 
 
 
236
  class TarsierPreTrainedModel(PreTrainedModel):
237
  config_class = LlavaConfig
238
+ base_model_prefix = "llm"
239
+ supports_gradient_checkpointing = True # TODO: support latest gc
 
240
  _skip_keys_device_placement = "past_key_values"
241
  _supports_flash_attn_2 = True
242
+ _supports_sdpa = False
243
+ _supports_cache_class = True # TODO: support different cache
244
+ _supports_static_cache = True
245
 
246
  def _init_weights(self, module):
 
 
 
247
  std = (
248
  self.config.initializer_range
249
  if hasattr(self.config, "initializer_range")
 
253
  if hasattr(module, "class_embedding"):
254
  module.class_embedding.data.normal_(mean=0.0, std=std)
255
 
256
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv3d)):
257
  module.weight.data.normal_(mean=0.0, std=std)
258
  if module.bias is not None:
259
  module.bias.data.zero_()
 
261
  module.weight.data.normal_(mean=0.0, std=std)
262
  if module.padding_idx is not None:
263
  module.weight.data[module.padding_idx].zero_()
264
+ elif isinstance(module, nn.LayerNorm):
265
+ module.weight.data.fill_(1.0)
266
+ if module.bias is not None:
267
+ module.bias.data.zero_()
268
  @property
269
+ def _no_split_modules(self):
270
+ return self.language_model._no_split_modules + self.vision_tower._no_split_modules
271
 
272
 
273
+ class TarsierForConditionalGeneration(TarsierPreTrainedModel, GenerationMixin):
 
 
 
 
274
  def __init__(self, config: LlavaConfig):
275
  super().__init__(config)
276
  self.vision_tower = AutoModel.from_config(config.vision_config, trust_remote_code=True)
277
+ if config.text_config.model_type == 'qwen2':
278
+ self.language_model = Qwen2ForCausalLM(config.text_config)
279
+ elif config.text_config.model_type == 'qwen2_vl':
280
+ self.language_model = Qwen2VLForCausalLM(config.text_config)
281
+ elif config.text_config.model_type == 'llama':
282
+ self.language_model = LlamaForCausalLM(config.text_config)
283
+ else:
284
+ raise ValueError(f'{config.text_config.model_type} not supported!')
285
+
286
+ if config.projection_head == 'Pixel_Shuffle':
287
+ self.multi_modal_projector = PixelShuffleMultiModalProjector(config)
288
+ elif config.projection_head == 'MLP':
289
+ self.multi_modal_projector = LlavaMultiModalProjector(config)
290
+ elif config.projection_head == 'auto_map':
291
+ repo_id, class_ref = config.auto_map['ProjectionLayer'].split("--")
292
+ model_class = get_class_from_dynamic_module(class_ref, repo_id)
293
+ self.multi_modal_projector = model_class(config)
294
+ elif config.projection_head is None:
295
+ self.multi_modal_projector = lambda x, *args, **kwargs: x
296
+
297
  self.post_init()
298
 
299
  def get_input_embeddings(self):
 
321
  model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
322
  # update vocab size
323
  self.config.text_config.vocab_size = model_embeds.num_embeddings
 
 
324
  return model_embeds
325
 
326
  def forward(
327
  self,
328
  input_ids: torch.LongTensor = None,
 
329
  attention_mask: Optional[torch.Tensor] = None,
330
  position_ids: Optional[torch.LongTensor] = None,
331
+ pixel_values: torch.FloatTensor = None,
332
+ image_grid_thw: Optional[torch.Tensor] = None,
333
  past_key_values: Optional[List[torch.FloatTensor]] = None,
 
 
 
334
  labels: Optional[torch.LongTensor] = None,
335
+ num_images: Optional[torch.Tensor] = None,
336
  use_cache: Optional[bool] = None,
337
  output_attentions: Optional[bool] = None,
338
  output_hidden_states: Optional[bool] = None,
339
  return_dict: Optional[bool] = None,
340
+ use_rmpad: Optional[bool] = False,
341
  **kwargs,
342
  ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
343
+
344
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
345
  output_hidden_states = (
346
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
347
  )
348
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
349
+
350
+
351
+ if input_ids is None:
352
+ raise ValueError("You must specify input_ids")
353
+
354
+ bsz, max_seq_len = input_ids.shape[0], input_ids.shape[1]
355
+
356
+ if max_seq_len > 1:
357
+ special_image_mask = input_ids == self.config.image_token_index
358
+ print(f'[{input_ids.device}] num_images: {num_images.tolist()} num_image_tokens: {special_image_mask.sum(-1).tolist()}', flush=True)
359
+
360
+ if position_ids is None:
361
+ if 'Qwen2VLForCausalLM' in self.language_model.__class__.__name__:
362
+ position_ids = self.language_model.get_rope_index(input_ids, image_grid_thw, attention_mask) # [bsz, seqlen, 3]
363
+ else:
364
+ position_ids = attention_mask.long().cumsum(-1) - 1 # # [bsz, seqlen]
365
+ position_ids.masked_fill_(attention_mask == 0, 1)
366
+
367
+
368
+ if use_rmpad:
369
+ input_ids, input_ids_indices, cu_seqlens, _ = _unpad_input(input_ids, attention_mask) # [bsz, seqlen] -> [1, seqlen]
370
+ position_ids, _, _, _ = _unpad_input(position_ids, attention_mask)
371
+ input_ids, position_ids = input_ids.unsqueeze(0), position_ids.unsqueeze(0)
372
+ else:
373
+ input_ids_indices, cu_seqlens = None, None
374
 
375
+ inputs_embeds = self.get_input_embeddings()(input_ids) # [1, seqlen, dim]
376
+
377
  image_features = None
378
+ if pixel_values is not None: # training / first step in generation
379
+ if 'Qwen2VLForCausalLM' in self.language_model.__class__.__name__:
380
+ pixel_values = pixel_values.type(self.vision_tower.get_dtype())
381
+ image_features = self.vision_tower(pixel_values, image_grid_thw)
382
+ else:
 
 
383
  image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
384
+ image_features = self.multi_modal_projector(
385
+ image_outputs.hidden_states,
386
+ self.get_input_embeddings(),
387
+ )
388
+
389
+ special_image_mask = input_ids == self.config.image_token_index
390
+ if special_image_mask.sum() > 0:
391
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
392
+ inputs_embeds = inputs_embeds.masked_scatter(
393
+ special_image_mask.unsqueeze(-1).expand_as(inputs_embeds),
394
+ image_features
395
+ )
396
  else:
397
+ inputs_embeds = image_features.sum(dim=(0,1)) * 0. + inputs_embeds
398
+
399
  outputs = self.language_model(
400
  attention_mask=attention_mask,
401
  position_ids=position_ids,
 
404
  use_cache=use_cache,
405
  output_attentions=output_attentions,
406
  output_hidden_states=output_hidden_states,
 
407
  return_dict=return_dict,
408
+ use_rmpad=use_rmpad,
409
+ cu_seqlens=cu_seqlens,
410
  )
411
 
412
  logits = outputs[0]
413
 
414
  loss = None
415
  if labels is not None:
416
+ loss_fct = nn.CrossEntropyLoss()
417
+ if use_rmpad:
418
+ labels = labels.view(-1)[input_ids_indices.long()]
419
+ shift_labels = torch.cat((labels[1:], labels.new_ones((1))*-100))
420
+ shift_labels.requires_grad = False
421
+ lbl_seq_lens = (cu_seqlens[1:]-1).long()
422
+ shift_labels[lbl_seq_lens] = -100
423
+ loss = loss_fct(logits.squeeze(0), shift_labels)
424
  else:
425
+ # Shift so that tokens < n predict n
426
  shift_logits = logits[..., :-1, :].contiguous()
427
  shift_labels = labels[..., 1:].contiguous()
428
+ # Flatten the tokens
429
+ shift_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
430
+ shift_labels = shift_labels.view(-1)
431
+ # Enable model parallelism
432
+ shift_labels = shift_labels.to(shift_logits.device)
433
+ loss = loss_fct(shift_logits, shift_labels)
434
+ elif use_rmpad: # during training we skip re-padding the logits here, to save GPU memory
435
+ logits = _pad_input(logits.squeeze(0), input_ids_indices, bsz, max_seq_len)
436
 
437
  if not return_dict:
438
  output = (logits,) + outputs[1:]
 
444
  past_key_values=outputs.past_key_values,
445
  hidden_states=outputs.hidden_states,
446
  attentions=outputs.attentions,
447
+ position_ids=position_ids,
448
  )
449
 
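Unlike the removed `_merge_input_ids_with_image_features`, the new forward keeps the sequence length fixed: the prompt is expected to already contain one image placeholder token per visual token, and `masked_scatter` writes the projected image features into exactly those embedding slots. A toy illustration of the mechanism (the token id, sizes and values are made up):

```python
# Toy illustration of the masked_scatter merge; ids, sizes and values are assumptions.
import torch

image_token_index = 5
embed_dim = 4

input_ids = torch.tensor([[7, 5, 5, 5, 9]])            # text, 3 image placeholders, text
inputs_embeds = torch.zeros(1, 5, embed_dim)           # stand-in for the text embeddings
image_features = torch.arange(3 * embed_dim, dtype=torch.float).view(3, embed_dim)

special_image_mask = input_ids == image_token_index    # [1, 5] boolean mask
inputs_embeds = inputs_embeds.masked_scatter(
    special_image_mask.unsqueeze(-1).expand_as(inputs_embeds),
    image_features,                                     # must provide exactly mask.sum() values
)
print(inputs_embeds[0, 1:4])                            # the three image-feature rows
```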
450
  def prepare_inputs_for_generation(
451
+ self,
452
+ input_ids,
453
+ attention_mask=None,
454
+ position_ids=None,
455
+ past_key_values=None,
456
+ cache_position=None,
457
+ use_cache=True,
458
+ pixel_values=None,
459
+ image_grid_thw=None,
460
+ **kwargs,
461
  ):
462
  if past_key_values is not None:
463
+ past_length = past_key_values.get_seq_length()
464
+ input_ids = input_ids[:, past_length:]
465
+
466
+ model_inputs = {
467
+ "input_ids": input_ids,
468
+ "attention_mask": attention_mask,
469
+ "past_key_values": past_key_values,
470
+ "use_cache": use_cache,
471
+ }
472
+ if kwargs.get('num_images') is not None:
473
+ model_inputs['num_images'] = kwargs['num_images']
474
+
475
+ if cache_position[0] == 0:
476
+ # pixel values are only needed on the prefill step (cache_position[0] == 0);
477
+ # in the cached decoding stage the input ids no longer contain image tokens, so pixel_values stays None
478
+ model_inputs["pixel_values"] = pixel_values
479
+ model_inputs["image_grid_thw"] = image_grid_thw
480
  else:
481
+ model_inputs['position_ids'] = position_ids[:, -1, ...].unsqueeze(1).to(device=input_ids.device) + 1
482
  return model_inputs
483
 
484
+
485
+ def _update_model_kwargs_for_generation(
486
+ self,
487
+ outputs: ModelOutput,
488
+ model_kwargs: Dict[str, Any],
489
+ is_encoder_decoder: bool = False,
490
+ num_new_tokens: int = 1,
491
+ ) -> Dict[str, Any]:
492
+ model_kwargs = super()._update_model_kwargs_for_generation(
493
+ outputs=outputs,
494
+ model_kwargs=model_kwargs,
495
+ is_encoder_decoder=is_encoder_decoder,
496
+ num_new_tokens=num_new_tokens,
497
+ )
498
+
499
+ if getattr(outputs, "position_ids", None) is not None:
500
+ model_kwargs["position_ids"] = outputs.position_ids
501
+
502
+ return model_kwargs
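The generation overrides work together: `forward` returns the `position_ids` it used, `_update_model_kwargs_for_generation` carries them into the next step's kwargs, and `prepare_inputs_for_generation` advances the last position by one during cached decoding (which also covers Qwen2-VL's 3-D rope positions). A toy 1-D illustration of that increment:

```python
# Toy 1-D illustration of how cached decoding advances position_ids; values are assumptions.
import torch

position_ids = torch.tensor([[0, 1, 2, 3]])              # positions used at prefill
next_position_ids = position_ids[:, -1, ...].unsqueeze(1) + 1
print(next_position_ids)                                 # tensor([[4]])
```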
models/utils.py ADDED
@@ -0,0 +1,17 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from einops import rearrange
4
+
5
+ def _unpad_input(input_ids, attention_mask):
6
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
7
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
8
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
9
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
10
+ input_ids = rearrange(input_ids, 'b s ... -> (b s) ...')[indices]
11
+ return input_ids, indices, cu_seqlens, max_seqlen_in_batch
12
+
13
+ def _pad_input(hidden_states, indices, batch, seqlen):
14
+ output = torch.zeros(batch * seqlen, *hidden_states.shape[1:], device=hidden_states.device,
15
+ dtype=hidden_states.dtype)
16
+ output[indices] = hidden_states
17
+ return rearrange(output, '(b s) ... -> b s ...', b=batch)
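`_unpad_input` / `_pad_input` implement the usual flash-attention style variable-length packing: padding tokens are dropped, the batch is flattened into one packed sequence, and `cu_seqlens` records where each sample starts. A minimal round-trip check (shapes and values are assumptions):

```python
# Minimal round-trip check for the rmpad helpers; shapes and values are assumptions.
import torch
from models.utils import _pad_input, _unpad_input

input_ids = torch.tensor([[11, 12, 13, 0], [21, 22, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

flat_ids, indices, cu_seqlens, max_len = _unpad_input(input_ids, attention_mask)
print(flat_ids.tolist())    # [11, 12, 13, 21, 22]
print(cu_seqlens.tolist())  # [0, 3, 5] -> sample boundaries in the packed sequence
print(max_len)              # 3

hidden = flat_ids.unsqueeze(-1).float()                  # pretend these are per-token outputs
restored = _pad_input(hidden, indices, batch=2, seqlen=4)
print(restored.squeeze(-1).long().tolist())              # [[11, 12, 13, 0], [21, 22, 0, 0]]
```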
requirements.txt CHANGED
@@ -19,5 +19,6 @@ torch==2.1.0
19
  torchvision==0.16.0
20
  torchaudio==2.1.0
21
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.7/flash_attn-2.5.7+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
22
- transformers==4.44.2
23
  triton==2.1.0
 
 
19
  torchvision==0.16.0
20
  torchaudio==2.1.0
21
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.7/flash_attn-2.5.7+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
22
+ transformers==4.47.0
23
  triton==2.1.0
24
+ func_timeout==4.3.5
tools/color.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (2024) Bytedance Ltd. and/or its affiliates
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ class Color:
15
+
16
+ @staticmethod
17
+ def red(x):
18
+ return '\33[31m' +x + '\033[0m'
19
+
20
+ @staticmethod
21
+ def green(x):
22
+ return '\33[32m' +x + '\033[0m'
23
+
24
+ @staticmethod
25
+ def yellow(x):
26
+ return '\33[33m' +x + '\033[0m'
27
+
28
+ @staticmethod
29
+ def blue(x):
30
+ return '\33[34m' +x + '\033[0m'
31
+
32
+ @staticmethod
33
+ def violet(x):
34
+ return '\33[35m' +x + '\033[0m'
35
+
36
+
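`Color` simply wraps its argument in ANSI escape codes, e.g.:

```python
from tools.color import Color

print(Color.green("Model loaded"))  # rendered green on ANSI-capable terminals
print(Color.red("Load failed"))
```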
tools/conversation.py CHANGED
@@ -16,12 +16,43 @@
16
  from PIL import Image
17
  import torch
18
  from transformers import StoppingCriteria, StoppingCriteriaList
 
 
 
19
 
20
  from enum import auto, Enum
21
  import os
22
- from dataset.processor import Processor
23
  import re
24
 
25
 
26
  IMAGE_TOKEN = "<image>"
27
  VIDEO_TOKEN = "<video>"
@@ -31,24 +62,48 @@ class SeparatorStyle(Enum):
31
  SINGLE = auto()
32
  TWO = auto()
33
 
34
- def get_prompt(conv):
35
- ret = ""
36
- if conv.system:
37
- ret = conv.system + conv.sep1
38
  for i, (role, message) in enumerate(conv.messages):
39
  if message:
40
- # In current version, the image should be add at the first conversation round.
41
- # So we need to remove the special image tokens in following user input.
42
- if i > 0:
43
- message = re.sub(f"({IMAGE_TOKEN}|{VIDEO_TOKEN})\n*", "", message)
44
- ret += role + ": " + message
45
- if i % 2:
46
- ret += conv.sep2
47
  else:
48
- ret += conv.sep1
49
- else:
50
- ret += role + ": "
51
- return ret
52
 
53
 
54
  class StoppingCriteriaSub(StoppingCriteria):
@@ -64,53 +119,36 @@ class StoppingCriteriaSub(StoppingCriteria):
64
 
65
 
66
  class Chat:
67
- def __init__(self, model, processor: Processor, device='cuda', debug=False):
68
  self.model = model
69
  self.processor = processor
70
  self.device = device
71
  self.debug = debug
72
- stop_words_ids = [torch.tensor([self.processor.tokenizer.eos_token_id]).to(device)]
73
  self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
74
 
75
  def ask(self,text,conv):
76
- conv.messages.append([conv.roles[0], text])
77
  return conv
78
 
79
- def prepare_model_inputs(self, conv, visual_data_file=None, images=None, n_frames=None):
80
- conv.messages.append([conv.roles[1], None])
81
- print(conv.messages)
82
- conv.messages[0][1] = re.sub(f"({IMAGE_TOKEN}|{VIDEO_TOKEN})\n*", "", conv.messages[0][1])
83
-
84
- if images is None or isinstance(images, list) and len(images) == 0:
85
- if isinstance(visual_data_file, str) and os.path.exists(visual_data_file):
86
- images = self.processor.load_images(visual_data_file, n_frames)
87
- elif isinstance(visual_data_file, Image.Image):
88
- images = [visual_data_file]
89
- elif visual_data_file is None or visual_data_file == "":
90
- images = None
91
- else:
92
- raise NotImplementedError
93
-
94
- # os.system("rm tmp_images/*")
95
- # for i, img in enumerate(images):
96
- # img.save(f"tmp_images/{i+1}.jpg")
97
-
98
- if isinstance(images, list) and len(images) > 0:
99
- conv.messages[0][1] = IMAGE_TOKEN*len(images) + '\n' + conv.messages[0][1]
100
-
101
- prompt = get_prompt(conv)
102
  if self.debug:
103
- print(f"visual_data_file: {visual_data_file}")
104
- print(f"Prompt: {prompt}", flush=True)
105
-
106
- inputs = self.processor(prompt, images=images, edit_prompt=False, return_prompt=False)
107
- # print(self.processor.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))
108
- inputs = {k:v.to(self.device) for k,v in inputs.items() if v is not None}
109
- return inputs, conv, images
110
-
111
- def answer(self, conv, visual_data_file=None, images=None, n_frames=None, max_new_tokens=256, num_beams=1, min_length=1, top_p=1.0,
 
 
 
112
  repetition_penalty=1.0, length_penalty=1, temperature=0):
113
- inputs, conv, images = self.prepare_model_inputs(conv, visual_data_file, images, n_frames)
114
  if self.model is not None:
115
  outputs = self.model.generate(
116
  **inputs,
@@ -124,11 +162,13 @@ class Chat:
124
  length_penalty=length_penalty,
125
  temperature=temperature,
126
  )
127
- output_text = self.processor.tokenizer.decode(outputs[0][inputs['input_ids'][0].shape[0]:], skip_special_tokens=True)
128
  else:
129
  output_text = "Fake respone as launched in debug mode!"
130
- conv.messages[-1][1] = output_text
131
- return output_text, conv, images
 
 
132
 
133
  class EasyDict(dict):
134
  """
@@ -204,19 +244,13 @@ conv_tarsier_yi = EasyDict({
204
  }
205
  )
206
 
207
- conv_tarsier_qwen2 = EasyDict({
208
  "system": "",
209
- "roles": ("USER", "ASSISTANT"),
210
  "messages": [],
211
- "sep1": " ",
212
- "sep2": "<|endoftext|>",
213
  }
214
  )
215
 
216
  conv_templates = {
217
- "tarsier-7b": conv_tarsier,
218
- "tarsier-13b": conv_tarsier,
219
- "tarsier-34b": conv_tarsier_yi,
220
- "tarsier2-7b": conv_tarsier_qwen2
221
  }
222
-
 
16
  from PIL import Image
17
  import torch
18
  from transformers import StoppingCriteria, StoppingCriteriaList
19
+ from dataset.custom_data_parsers.utils import put_pred_to_data_dict, get_prompt_from_data_dict
20
+ from dataset.tarsier_datamodule import TarsierDataProcessor
21
+ from dataset.utils import *
22
 
23
  from enum import auto, Enum
24
  import os
 
25
  import re
26
 
27
+ data_dict_tmp = {
28
+ "messages": [
29
+ {
30
+ "role": "user",
31
+ "content": [
32
+ {
33
+ "type": "video",
34
+ "video": {
35
+ "video_file": "/mnt/hdfs/vlm/videos/movies_aligned_0523/tt8266310/tt8266310_1.50.24-1.50.29.mp4"}
36
+ },
37
+ {
38
+ "type": "text",
39
+ "text": "Describe the video in detail."
40
+ }
41
+ ]
42
+ },
43
+ {
44
+ "role": "assistant",
45
+ "content": [
46
+ {
47
+ "type": "text",
48
+ "text": "A man in the driver's seat, wearing a black jacket with a maroon shirt, fastens his seatbelt while smiling at the man in the passenger seat, who is adjusting his position. The passenger, also wearing a black jacket with a maroon shirt, turns to look forward and smiles. The driver then leans forward to start the car and leans back in his seat. In the background, a beige car is visible through the window."
49
+ }]}
50
+ ],
51
+ "dataset": "video_caption",
52
+ "task": "video/caption",
53
+ "idx": 0,
54
+ }
55
+
56
 
57
  IMAGE_TOKEN = "<image>"
58
  VIDEO_TOKEN = "<video>"
 
62
  SINGLE = auto()
63
  TWO = auto()
64
 
65
+ def get_data_dict(conv, max_n_frames=None):
66
+ data_dict = {
67
+ "messages": []
68
+ }
69
  for i, (role, message) in enumerate(conv.messages):
70
  if message:
71
+ text = message["text"]
72
+ content_type = message["type"]
73
+ content = {}
74
+ if content_type == "text":
75
+ content['type'] = 'text'
76
+ content['text'] = text
77
+ task = "text-only"
78
+ elif content_type == "video":
79
+ content['type'] = 'video'
80
+ content['video'] = {
81
+ "video_file": text
82
+ }
83
+ if max_n_frames is not None:
84
+ content['video']['n_frames'] = max_n_frames
85
+ task = "video/QA"
86
+ elif content_type == "image":
87
+ content['type'] = 'image'
88
+ content['image'] = {
89
+ "image_file": text
90
+ }
91
+ task = "image/QA"
92
  else:
93
+ content['type'] = 'text'
94
+ content['text'] = text
95
+ task = "text-only"
96
+ if data_dict['messages'] and data_dict['messages'][-1]['role'] == role:
97
+ data_dict['messages'][-1]['content'].append(content)
98
+ else:
99
+ data_dict['messages'].append({
100
+ "role": role,
101
+ "content": [content]
102
+ })
103
+ data_dict['dataset'] = task
104
+ data_dict['task'] = task
105
+ check_data_format(data_dict)
106
+ return data_dict
107
 
108
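`get_data_dict` converts the Gradio conversation state into the message format expected by `TarsierDataProcessor` (the same shape as `data_dict_tmp` above). For one user turn with a video plus a question it produces roughly the following; the file path and frame count are placeholders:

```python
# Rough shape of get_data_dict output for one video + one question; path and n_frames are placeholders.
{
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": {"video_file": "example.mp4", "n_frames": 16}},
                {"type": "text", "text": "Describe the video in detail."},
            ],
        }
    ],
    # "dataset" / "task" mirror whichever content type was processed last
    # ("text-only" for the turn above, "video/QA" if the video were the final item).
    "dataset": "text-only",
    "task": "text-only",
}
```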
 
109
  class StoppingCriteriaSub(StoppingCriteria):
 
119
 
120
 
121
  class Chat:
122
+ def __init__(self, model, processor: TarsierDataProcessor, device='cuda', debug=False):
123
  self.model = model
124
  self.processor = processor
125
  self.device = device
126
  self.debug = debug
127
+ stop_words_ids = [torch.tensor([self.processor.processor.tokenizer.eos_token_id]).to(device)]
128
  self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
129
 
130
  def ask(self,text,conv):
131
+ conv.messages.append([conv.roles[0], {"text": text, "type": "text"}])
132
  return conv
133
 
134
+ def prepare_model_inputs(self, conv, n_frames=None):
135
+ # print(conv.messages)
136
+ data_dict = get_data_dict(conv, n_frames)
137
  if self.debug:
138
+ # print(f"visual_data_file: {visual_data_file}", flush=True)
139
+ print(f"###Prompt:\n{get_prompt_from_data_dict(data_dict)}")
140
+
141
+ batch_data = self.processor(data_dict)
142
+ model_inputs = {}
143
+ for k, v in batch_data.items():
144
+ if not isinstance(v, torch.Tensor):
145
+ continue
146
+ model_inputs[k] = v.to(self.device)
147
+ return model_inputs, conv
148
+
149
+ def answer(self, conv, n_frames=None, max_new_tokens=256, num_beams=1, min_length=1, top_p=1.0,
150
  repetition_penalty=1.0, length_penalty=1, temperature=0):
151
+ inputs, conv = self.prepare_model_inputs(conv, n_frames)
152
  if self.model is not None:
153
  outputs = self.model.generate(
154
  **inputs,
 
162
  length_penalty=length_penalty,
163
  temperature=temperature,
164
  )
165
+ output_text = self.processor.processor.tokenizer.decode(outputs[0][inputs['input_ids'][0].shape[0]:], skip_special_tokens=True)
166
  else:
167
  output_text = "Fake respone as launched in debug mode!"
168
+ conv.messages.append(
169
+ [conv.roles[1], {"text": output_text, "type": "text"}]
170
+ )
171
+ return output_text, conv
172
 
173
  class EasyDict(dict):
174
  """
 
244
  }
245
  )
246
 
247
+ conv_tarsier_qwen2_vl = EasyDict({
248
  "system": "",
249
+ "roles": ("user", "assistant"),
250
  "messages": [],
 
 
251
  }
252
  )
253
 
254
  conv_templates = {
255
+ "tarsier2-7b": conv_tarsier_qwen2_vl
 
 
 
256
  }
 
tools/rw_utils.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (2024) Bytedance Ltd. and/or its affiliates
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import json
15
+ from json import JSONEncoder
16
+ import numpy
17
+ import pandas as pd
18
+
19
+ class NumpyArrayEncoder(JSONEncoder):
20
+ def default(self, obj):
21
+ if isinstance(obj, numpy.ndarray):
22
+ return obj.tolist()
23
+ return JSONEncoder.default(self, obj)
24
+
25
+ def write_txt(data, path):
26
+ with open(path, 'w', encoding='utf-8')as f:
27
+ for d in data:
28
+ f.write(f'{d}\n')
29
+
30
+ def read_txt(path):
31
+ with open(path, 'r', encoding='utf-8', errors='ignore') as f:
32
+ lines = [l.strip('\n') for l in f.readlines()]
33
+ return lines
34
+
35
+ def read_jsonlines(path):
36
+ objs = []
37
+ with open(path) as f:
38
+ for line in f:
39
+ line = json.loads(line)
40
+ objs.append(line)
41
+ return objs
42
+
43
+ def write_jsonlines(data, path, cls=None, ensure_ascii=False):
44
+ with open(path, 'w') as f:
45
+ for d in data:
46
+ d = json.dumps(d, ensure_ascii=ensure_ascii, cls=cls)
47
+ f.write(d)
48
+ f.write('\n')
49
+
50
+ def read_parquet(path):
51
+ data = pd.read_parquet(path)
52
+ return data.to_dict('records')
53
+
54
+ def write_parquet(data, path):
55
+ data = pd.DataFrame(data)
56
+ data.to_parquet(path)
57
+
58
+ def read_csv(path):
59
+ data = pd.read_csv(path)
60
+ return data.to_dict(orient='records')
61
+
62
+ def write_csv(data, path):
63
+ data = pd.DataFrame(data)
64
+ data.to_csv(path, index=False, sep='\t')
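`tools/rw_utils.py` is a small grab-bag of read/write helpers; a typical jsonlines round trip (the file name is arbitrary):

```python
# Example round trip with the jsonlines helpers; the file name is arbitrary.
from tools.rw_utils import read_jsonlines, write_jsonlines

preds = [
    {"idx": 0, "prediction": "A man fastens his seatbelt and starts the car."},
    {"idx": 1, "prediction": "A beige car is visible through the window."},
]
write_jsonlines(preds, "predictions.jsonl")
assert read_jsonlines("predictions.jsonl") == preds
```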
tools/utils.py CHANGED
@@ -12,46 +12,21 @@
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
  from models.modeling_tarsier import TarsierForConditionalGeneration, LlavaConfig
15
- from dataset.processor import Processor
 
16
  import torch
17
  import base64
 
 
18
  import os
19
 
20
  HF_TOKEN = os.environ.get('HF_TOKEN', '')
21
 
22
- class Color:
23
-
24
- @staticmethod
25
- def red(x):
26
- return '\33[31m' +x + '\033[0m'
27
-
28
- @staticmethod
29
- def green(x):
30
- return '\33[32m' +x + '\033[0m'
31
-
32
- @staticmethod
33
- def yellow(x):
34
- return '\33[33m' +x + '\033[0m'
35
-
36
- @staticmethod
37
- def blue(x):
38
- return '\33[34m' +x + '\033[0m'
39
-
40
- @staticmethod
41
- def violet(x):
42
- return '\33[35m' +x + '\033[0m'
43
-
44
- def file_to_base64(img_path):
45
- with open(img_path, 'rb') as video_file:
46
- video_b64_str = base64.b64encode(video_file.read()).decode()
47
- return video_b64_str
48
-
49
- def load_model_and_processor(model_name_or_path, max_n_frames=8):
50
- print(Color.red(f"Load model and processor from: {model_name_or_path}; with max_n_frames={max_n_frames}"), flush=True)
51
- processor = Processor(
52
- model_name_or_path,
53
- max_n_frames=max_n_frames,
54
- )
55
  model_config = LlavaConfig.from_pretrained(
56
  model_name_or_path,
57
  trust_remote_code=True,
@@ -68,3 +43,8 @@ def load_model_and_processor(model_name_or_path, max_n_frames=8):
68
  model.eval()
69
  return model, processor
70
 
 
 
 
 
 
 
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
  from models.modeling_tarsier import TarsierForConditionalGeneration, LlavaConfig
15
+ # from dataset.processor import Processor
16
+ from dataset.tarsier_datamodule import init_processor
17
  import torch
18
  import base64
19
+ from tools.color import Color
20
+ import yaml
21
  import os
22
 
23
  HF_TOKEN = os.environ.get('HF_TOKEN', '')
24
 
25
+ def load_model_and_processor(model_name_or_path, data_config):
26
+ print(Color.red(f"Load model and processor from: {model_name_or_path}"), flush=True)
27
+ if isinstance(data_config, str):
28
+ data_config = yaml.safe_load(open(data_config, 'r'))
29
+ processor = init_processor(model_name_or_path, data_config)
30
  model_config = LlavaConfig.from_pretrained(
31
  model_name_or_path,
32
  trust_remote_code=True,
 
43
  model.eval()
44
  return model, processor
45
 
46
+ def file_to_base64(img_path):
47
+ with open(img_path, 'rb') as video_file:
48
+ video_b64_str = base64.b64encode(video_file.read()).decode()
49
+ return video_b64_str
50
+
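With the refactored loader, demo startup reduces to something like the sketch below; the default model id and the YAML data-config path are assumptions, not values confirmed by this commit:

```python
# Hedged sketch of the new loading path; the default model id and YAML path are assumptions.
import os
from tools.utils import load_model_and_processor

model_path = os.getenv("MODEL_PATH", "omni-research/Tarsier2-7b-0115")
model, processor = load_model_and_processor(
    model_path,
    data_config="configs/tarsier2_default_config.yaml",  # hypothetical config consumed by init_processor
)
```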