yogkul2000 committed
Commit ee17189 · verified · 1 Parent(s): e9514e6

Update README.md

Files changed (1):
1. README.md +214 -278

README.md CHANGED
The previous revision — a checkpoint overview plus a LLaVA-NeXT-based inference and batch evaluation script — is removed by this commit; the updated README follows.
 
---
license: apache-2.0
---

This repository contains the weights for the **VideoSAVi (Self-Aligned Video Language Model)** introduced in the paper [VideoSAVi: Self-Aligned Video Language Models without Human Supervision](https://arxiv.org/abs/2412.00624).

- **Project Page:** [https://people-robots.github.io/VideoSAVi/](https://people-robots.github.io/VideoSAVi/)

## Usage Instructions

We provide sample inference code below.
```python
#!/usr/bin/env python3

import argparse
import json

import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# ImageNet normalization statistics applied to every frame tile.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    # Per-tile preprocessing: convert to RGB, resize, and normalize.
    transform = T.Compose(
        [
            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
    return transform


def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    # Uniformly sample num_segments frame indices, optionally restricted to a [start, end] time bound.
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])
    return frame_indices


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    best_ratio_diff = float("inf")
    best_ratio = (1, 1)
    area = orig_width * orig_height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio

    # calculate the target width and height
    target_width = image_size * best_ratio[0]
    target_height = image_size * best_ratio[1]
    blocks = best_ratio[0] * best_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = ((i % (target_width // image_size)) * image_size, (i // (target_width // image_size)) * image_size, ((i % (target_width // image_size)) + 1) * image_size, ((i // (target_width // image_size)) + 1) * image_size)
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_video(video_path, bound=None, input_size=448, max_num=12, num_segments=8):
    # Decode the video, sample frames, tile each frame, and stack the preprocessed tensors.
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)

    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)

    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list


def parse_args():
    parser = argparse.ArgumentParser(description="Inference Script")
    parser.add_argument("--video_path", type=str, required=True, help="Path to the input video file")
    parser.add_argument("--model_path", type=str, default="yogkul2000/VideoSAVi", help="Path to the VideoSAVi model")
    parser.add_argument("--num_segments", type=int, default=8, help="Number of video segments to sample (default: 8)")
    parser.add_argument("--max_patches", type=int, default=12, help="Maximum patches per frame (default: 12)")
    parser.add_argument("--input_size", type=int, default=448, help="Input image size (default: 448)")
    parser.add_argument("--max_new_tokens", type=int, default=1024, help="Maximum number of tokens to generate (default: 1024)")
    parser.add_argument("--do_sample", action="store_true", default=False, help="Whether to use sampling for generation")
    parser.add_argument("--temperature", type=float, default=0, help="Sampling temperature (default: 0)")
    parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling parameter (default: 1.0)")
    parser.add_argument("--question", type=str, default="What is happening in this video?", help="Question to ask about the video")
    parser.add_argument("--output_file", type=str, default=None, help="Optional output file to save results")
    parser.add_argument("--device", type=str, default="cuda", help="Device to use for inference (default: cuda)")
    parser.add_argument("--torch_dtype", type=str, default="bfloat16", choices=["float16", "bfloat16", "float32"], help="Torch dtype for model (default: bfloat16)")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
    parser.add_argument("--no_follow_up", action="store_true", help="Skip follow-up question")
    return parser.parse_args()


def main():
    args = parse_args()

    if args.verbose:
        print(f"Loading model from: {args.model_path}")
        print(f"Processing video: {args.video_path}")
        print(f"Video segments: {args.num_segments}")
        print(f"Max patches per frame: {args.max_patches}")

    torch_dtype_map = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
    torch_dtype = torch_dtype_map[args.torch_dtype]

    try:
        model = AutoModel.from_pretrained(args.model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_flash_attn=True, trust_remote_code=True).eval()

        if args.device == "cuda" and torch.cuda.is_available():
            model = model.cuda()

        tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True, use_fast=False)

        if args.verbose:
            print("Model and tokenizer loaded successfully!")

    except Exception as e:
        print(f"Error loading model: {e}")
        return

    try:
        if args.verbose:
            print("Loading and processing video...")

        pixel_values, num_patches_list = load_video(args.video_path, num_segments=args.num_segments, max_num=args.max_patches, input_size=args.input_size)

        pixel_values = pixel_values.to(torch_dtype)
        if args.device == "cuda" and torch.cuda.is_available():
            pixel_values = pixel_values.cuda()

        if args.verbose:
            print(f"Video processed: {len(num_patches_list)} segments, {pixel_values.shape[0]} total patches")

    except Exception as e:
        print(f"Error processing video: {e}")
        return

    # Create video prefix for frames
    video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])

    # Generation config
    generation_config = {"max_new_tokens": args.max_new_tokens, "do_sample": args.do_sample, "temperature": args.temperature, "top_p": args.top_p}

    results = {}

    try:
        question = video_prefix + args.question

        if args.verbose:
            print(f"\nAsking question: {args.question}")

        response, history = model.chat(tokenizer, pixel_values, question, generation_config, num_patches_list=num_patches_list, history=None, return_history=True)

        print(f"\nUser: {args.question}")
        print(f"VideoSAVi: {response}")

        results["question_1"] = {"question": args.question, "response": response}

        # Clear GPU cache
        if args.device == "cuda" and torch.cuda.is_available():
            torch.cuda.empty_cache()

    except Exception as e:
        print(f"Error during first inference: {e}")
        return

    # Save results if output file specified
    if args.output_file:
        try:
            results["video_path"] = args.video_path
            results["model_path"] = args.model_path
            results["config"] = {"num_segments": args.num_segments, "max_patches": args.max_patches, "input_size": args.input_size, "generation_config": generation_config}

            with open(args.output_file, "w") as f:
                json.dump(results, f, indent=2)

            print(f"\nResults saved to: {args.output_file}")

        except Exception as e:
            print(f"Error saving results: {e}")

    if args.verbose:
        print("\nInference completed successfully!")


if __name__ == "__main__":
    main()
```
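
The script above is meant to be run from the command line, e.g. `python videosavi_inference.py --video_path demo.mp4 --question "What is happening in this video?"` (the file name is whatever you saved the script as). If you prefer to call the model programmatically, the following minimal sketch reproduces the same flow; it assumes the `load_video` helper above is importable from a hypothetical `videosavi_inference.py`, that a CUDA GPU is available, and that the checkpoint exposes the `model.chat(...)` interface used by the script.

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Hypothetical module name: the inference script above saved as videosavi_inference.py.
from videosavi_inference import load_video

MODEL_PATH = "yogkul2000/VideoSAVi"  # default checkpoint path used by the script

# Load model and tokenizer; trust_remote_code supplies the chat() method.
model = AutoModel.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, use_fast=False)

# Sample 8 segments and tile each frame into 448x448 patches ("demo.mp4" is a placeholder path).
pixel_values, num_patches_list = load_video("demo.mp4", num_segments=8, max_num=12)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

# One <image> placeholder per sampled frame, exactly as in the script above.
video_prefix = "".join(f"Frame{i + 1}: <image>\n" for i in range(len(num_patches_list)))
question = video_prefix + "What is happening in this video?"

response, history = model.chat(
    tokenizer,
    pixel_values,
    question,
    {"max_new_tokens": 1024, "do_sample": False},
    num_patches_list=num_patches_list,
    history=None,
    return_history=True,
)
print(response)
```

Greedy decoding (`do_sample=False`) mirrors the script's defaults; pass `do_sample=True` together with a `temperature` and `top_p` of your choice to sample instead.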