In [1]:
%%capture
!pip install gradio transformers pillow opencv-python
!pip install accelerate torchvision torch huggingface_hub
!pip install hf_xet qwen-vl-utils gradio_client
!pip install transformers-stream-generator spaces

In [None]:
import os
import uuid
import time
from threading import Thread

import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Prefer the first CUDA device; fall back to CPU when no GPU is visible.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Half precision halves GPU memory use, but float16 is poorly supported on CPU
# (slow, and some ops are unimplemented in older PyTorch releases), so the CPU
# fallback loads the weights in float32 instead.
MODEL_DTYPE = torch.float16 if device.type == "cuda" else torch.float32

# Load the Imgscope-OCR multimodal (Qwen2-VL based) model and its processor.
# NOTE: trust_remote_code=True executes code shipped with the model repo.
MODEL_ID = "prithivMLmods/Imgscope-OCR-2B-0527"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=MODEL_DTYPE,
).to(device).eval()

# Constants
# Upper bound (in tokens) applied when the chat prompt is tokenized.
MAX_INPUT_TOKEN_LENGTH = 4096


def downsample_video(video_path: str, num_frames: int = 10):
    """
    Extract up to 'num_frames' evenly spaced frames from the video.

    Args:
        video_path: Path of the video file to open with OpenCV.
        num_frames: Number of sample positions spread over the whole clip.

    Returns:
        A list of (PIL.Image, timestamp_seconds) tuples in playback order.
        The list is shorter than 'num_frames' when the clip has fewer frames
        than requested or when individual frames cannot be decoded, and empty
        when the video cannot be opened or reports no frames.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        # An unreadable file or a stream without a frame count has nothing
        # to sample; bail out before building nonsensical (negative) indices.
        if not vidcap.isOpened() or total <= 0:
            return frames

        # Some containers report 0 FPS; fall back to 1 so the division is safe.
        fps = vidcap.get(cv2.CAP_PROP_FPS) or 1

        # linspace repeats indices when the clip has fewer frames than
        # requested; np.unique keeps each frame once, in ascending order.
        indices = np.unique(np.linspace(0, total - 1, num_frames, dtype=int))

        for idx in indices:
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = vidcap.read()
            if not ret:
                # Best effort: skip frames the decoder cannot deliver.
                continue
            # OpenCV decodes to BGR, PIL expects RGB.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            timestamp = round(float(idx) / fps, 2)
            frames.append((Image.fromarray(rgb), timestamp))
    finally:
        # Always free the capture handle, even if decoding raised.
        vidcap.release()
    return frames


def generate(video_file: str):
    """
    Describe an uploaded video by sampling frames and prompting the model.

    Frames are sampled with downsample_video, written to a temporary
    directory, referenced from a chat-style prompt together with their
    timestamps, and passed to the model. The generated text is collected
    through a streamer and returned as a single string.

    Args:
        video_file: Path of the uploaded video file.

    Returns:
        The full generated description as one string.

    Raises:
        gr.Error: If no video was provided or no frame could be read.
        Exception: Any error raised by model.generate is re-raised here.
    """
    import tempfile
    from transformers import TextIteratorStreamer

    if not video_file:
        raise gr.Error("Please upload a video before submitting.")

    # Step 1: extract frames
    frames = downsample_video(video_file)
    if not frames:
        raise gr.Error("Could not read any frames from the uploaded video.")

    # Step 2: build chat-like messages
    user_content = [
        {"type": "text", "text": "Please describe the content of the following video frames:"}
    ]
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant, for video understanding."}]},
        {"role": "user", "content": user_content},
    ]

    # The frame files are only needed while the processor loads them, so keep
    # them in a temporary directory that is removed right after tokenization
    # instead of leaving PNGs behind in the working directory.
    with tempfile.TemporaryDirectory(prefix="frames_") as tmp_dir:
        for img, ts in frames:
            path = os.path.join(tmp_dir, f"frame_{uuid.uuid4().hex}.png")
            img.save(path)
            user_content.append({"type": "text", "text": f"Frame at {ts}s:"})
            user_content.append({"type": "image", "url": path})

        # Step 3: tokenize with truncation
        # NOTE(review): truncating a prompt that contains image placeholders
        # may cut image tokens for large frames — confirm against the
        # processor's behavior.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_TOKEN_LENGTH
        ).to(device)

    # Step 4: use streamer to collect output
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.7,
    }

    errors = []

    def _run_generation():
        # If generation fails, the streamer would never receive its end
        # signal and the consumer loop below would block forever; record the
        # error and close the stream so the caller can re-raise it.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    # collect all chunks, dropping any end-of-turn marker that slips through
    chunks = [chunk.replace("<|im_end|>", "") for chunk in streamer]
    thread.join()

    if errors:
        raise errors[0]

    # return full concatenated response
    return "".join(chunks)


def launch_app():
    """
    Build the Gradio interface around generate and start serving it.

    The interface takes one video upload and shows the returned text in a
    textbox. Request queuing is enabled and the server runs in debug mode.
    """
    video_input = gr.Video(label="Upload Video")
    description_output = gr.Textbox(label="Video Description")

    interface = gr.Interface(
        fn=generate,
        inputs=video_input,
        outputs=description_output,
        title="Video Understanding with Imgscope-OCR-2B-0527",
        description="Upload a video and get an OCR-based description of its frames.",
        allow_flagging="never",
    )

    queued_interface = interface.queue()
    queued_interface.launch(debug=True)


# Start the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    launch_app()