# generate.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import itertools
import sys
import time
from pathlib import Path
from typing import Optional, Tuple

import torch
import torch._dynamo.config
import torch._inductor.config

def device_sync(device):
    """Wait for outstanding work on ``device`` before continuing.

    Used around timing measurements so that wall-clock numbers include the
    work that was launched on the accelerator.

    Args:
        device: Device identifier such as ``"cuda"``, ``"cuda:0"`` or
            ``"cpu"``. A ``torch.device`` is accepted too; it is converted to
            its string form before matching.

    Returns:
        None. For a device that is neither CUDA nor CPU a notice is printed
        and nothing else happens.
    """
    # str() lets callers pass torch.device objects, which do not support `in`.
    device_name = str(device)
    if "cuda" in device_name:
        torch.cuda.synchronize()
    elif "cpu" in device_name:
        # Nothing to synchronize in the CPU branch.
        pass
    else:
        print(f"device={device} is not yet supported")

from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image, ImageDraw, ImageFont
import requests
import torch
import numpy as np
import cv2

# Inductor settings, applied as import-time side effects so they are in place
# before any torch.compile call made later in this file.
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.triton.unique_kernel_names = True
torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future

# support running without installing as a package
# The directory two levels above this file is appended to sys.path
# (presumably so the project-local imports that follow resolve -- confirm
# against the repository layout).
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from sentencepiece import SentencePieceProcessor

from model import Transformer


def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
    """Draw one index per row of ``probs_sort`` without calling ``torch.multinomial``.

    Each probability is divided by independent Exponential(1) noise and the
    position of the largest ratio along the last dimension is returned.

    Args:
        probs_sort: Tensor of non-negative weights; sampling happens along the
            last dimension.

    Returns:
        ``torch.int`` tensor with the last dimension reduced to size 1.
    """
    noise = torch.empty_like(probs_sort).exponential_(1)
    ratios = probs_sort / noise
    winner = ratios.argmax(dim=-1, keepdim=True)
    return winner.to(dtype=torch.int)

def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    """Turn logits into a probability distribution over the last dimension.

    Args:
        logits: Tensor of unnormalised scores.
        temperature: Divisor applied to the logits; clamped from below at
            ``1e-5`` so a temperature of zero does not divide by zero.
        top_k: When given, only the ``top_k`` largest logits (capped at the
            size of the last dimension) keep non-zero probability.

    Returns:
        Tensor shaped like ``logits`` holding softmax probabilities.
    """
    scaled = logits / max(temperature, 1e-5)

    if top_k is None:
        return torch.nn.functional.softmax(scaled, dim=-1)

    k = min(top_k, scaled.size(-1))
    top_values, _ = torch.topk(scaled, k)
    # Smallest of the k largest values: everything strictly below it is masked.
    threshold = top_values.select(-1, -1).unsqueeze(-1)
    masked = torch.where(scaled < threshold, -float("Inf"), scaled)
    return torch.nn.functional.softmax(masked, dim=-1)

def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    """Pick the next token greedily from the last position of ``logits``.

    The token is the argmax of the raw logits at ``logits[0, -1]``, so
    ``temperature`` and ``top_k`` do not influence which token is chosen;
    they only shape the probability vector that is returned alongside it.

    This keeps the selection the previous implementation effectively made
    (it drew a multinomial sample and then overwrote it with the argmax),
    but no longer performs the discarded random draw and no longer moves
    the result to a hard-coded ``'cuda:0'``: the token stays on the device
    of ``logits``.

    Args:
        logits: Model output indexed as ``logits[0, -1]`` to obtain the
            scores of the last position of the first batch element.
        temperature: Forwarded to ``logits_to_probs``.
        top_k: Forwarded to ``logits_to_probs``.

    Returns:
        Tuple ``(idx_next, probs)``: a one-element int64 tensor with the
        chosen token id, and the probability vector for the last position.
    """
    last_logits = logits[0, -1]
    probs = logits_to_probs(last_logits, temperature, top_k)
    # keepdim=True yields a 1-element tensor, matching the shape callers
    # concatenate and reshape with .view(1, -1).
    idx_next = torch.argmax(last_logits, dim=-1, keepdim=True)

    return idx_next, probs

def prefill(model: Transformer, x: torch.Tensor, embeds: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs) -> torch.Tensor:
    """Run the model over the whole prompt and return the first new token.

    Args:
        model: Callable invoked as ``model(x, input_pos, embeds=embeds)``.
        x: Prompt token ids.
        embeds: Embeddings handed to the model through its ``embeds`` keyword.
        input_pos: Positions of the prompt tokens.
        **sampling_kwargs: Forwarded to ``sample``.

    Returns:
        The token tensor produced by ``sample`` (its probabilities are dropped).
    """
    # input_pos: [B, S]
    prompt_logits = model(x, input_pos, embeds=embeds)
    first_token, _ = sample(prompt_logits, **sampling_kwargs)
    return first_token

def decode_one_token(model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
    """Advance generation by a single position.

    Args:
        model: Callable invoked as ``model(x, input_pos)``.
        x: Current token.
        input_pos: Position tensor; its last dimension must have size 1.
        **sampling_kwargs: Forwarded to ``sample``.

    Returns:
        The ``(token, probs)`` pair produced by ``sample``.
    """
    # input_pos: [B, 1]
    assert input_pos.shape[-1] == 1
    step_logits = model(x, input_pos)
    token_and_probs = sample(step_logits, **sampling_kwargs)
    return token_and_probs

def decode_n_tokens(model: Transformer, cur_token: torch.Tensor, input_pos: torch.Tensor, num_new_tokens: int, callback=lambda _: _, **sampling_kwargs):
    """Generate ``num_new_tokens`` tokens one at a time.

    Args:
        model: Model handed to ``decode_one_token``.
        cur_token: Token fed to the first decoding step.
        input_pos: Position tensor; it is incremented IN PLACE after every
            step, so the caller's tensor is advanced as well.
        num_new_tokens: Number of decoding steps to run.
        callback: Called with each newly produced token (a clone).
        **sampling_kwargs: Forwarded to ``decode_one_token``.

    Returns:
        Tuple ``(tokens, probs)`` of lists holding one cloned tensor per step.
    """
    tokens_out = []
    probs_out = []
    for _ in range(num_new_tokens):
        with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): # Actually better for Inductor to codegen attention here
            step_token, step_prob = decode_one_token(
                model, cur_token, input_pos, **sampling_kwargs
            )
            input_pos += 1
            # Clone before storing; the original does the same for both outputs.
            emitted = step_token.clone()
            tokens_out.append(emitted)
            callback(emitted)
            probs_out.append(step_prob.clone())
            cur_token = step_token.view(1, -1)

    return tokens_out, probs_out


def model_forward(model, x, input_pos):
    """Call ``model`` on ``x`` and ``input_pos`` and return its output.

    Kept as a separate module-level function so it can be rebound to a
    compiled version (``main`` does this when speculative decoding is
    compiled).
    """
    output = model(x, input_pos)
    return output

def speculative_decode(
    model: Transformer,
    draft_model: Transformer,
    cur_token: torch.Tensor,
    input_pos: int,
    speculate_k: int,
    **sampling_kwargs
) -> torch.Tensor:
    """Propose ``speculate_k`` tokens with the draft model and verify them with the target model.

    The draft model decodes ``speculate_k`` tokens sequentially starting from
    ``cur_token``. The target model is then run once over ``cur_token``
    followed by the draft tokens. Each draft token is accepted with
    probability ``min(1, q / p)`` where ``q`` is the target probability and
    ``p`` the draft probability of that token.

    Args:
        model: Target model.
        draft_model: Draft model used to propose tokens.
        cur_token: Last accepted token; reshaped with ``view(1, -1)`` /
            ``view(1)`` below, so it must hold a single element.
        input_pos: Position of ``cur_token`` as a Python int.
        speculate_k: Number of draft tokens to propose.
        **sampling_kwargs: Forwarded to ``decode_n_tokens`` and
            ``logits_to_probs``.

    Returns:
        1-D tensor of new tokens: either all ``speculate_k`` draft tokens plus
        one token sampled from the target distribution, or the accepted prefix
        of the draft tokens plus one resampled token.
    """
    # draft model inference sequentially
    device = cur_token.device
    orig_input_pos = torch.tensor([input_pos], dtype=torch.int64, device=cur_token.device)
    # A clone is passed because decode_n_tokens increments its position tensor in place.
    draft_tokens, draft_probs = decode_n_tokens(draft_model, cur_token.view(1, -1), orig_input_pos.clone(), speculate_k, **sampling_kwargs)

    draft_tokens = torch.cat(draft_tokens)
    # parallel inference on target model using draft tokens
    target_logits = model_forward(
        model,
        torch.cat([cur_token.view(1), draft_tokens]).view(1, -1),
        torch.arange(input_pos, input_pos + speculate_k + 1, device=cur_token.device)
    )
    target_probs = logits_to_probs(target_logits[0], **sampling_kwargs)
    draft_probs = torch.stack(draft_probs)
    # q: target prob, p: draft prob
    # q >= p: always accept draft token
    # q < p: q/p prob to accept draft token
    p = draft_probs[torch.arange(0, speculate_k, device=device), draft_tokens]
    q = target_probs[torch.arange(0, speculate_k, device=device), draft_tokens]
    accept_draft_prob = torch.minimum(torch.ones(()), q[:speculate_k]/ p)
    # Indices where a uniform draw exceeds the acceptance probability are rejections.
    rejected_locations = (torch.rand_like(accept_draft_prob) > accept_draft_prob).nonzero()

    if rejected_locations.shape[0] == 0: # All draft tokens have been accepted
        # NOTE(review): accept_length is assigned but not read in this branch.
        accept_length = speculate_k + 1
        # Bonus token drawn from the target distribution at the final position.
        last_token = multinomial_sample_one_no_sync(target_probs[-1])
        # fill last token into draft model
        model_forward(
            draft_model,
            draft_tokens[-1].view(1, -1),
            orig_input_pos + speculate_k,
        )
        return torch.cat([draft_tokens, last_token])
    else:
        # Index of the first rejected draft token; .item() copies it to the host.
        accept_length = rejected_locations[0].item()
        p = draft_probs[accept_length]
        q = target_probs[accept_length]
        # Resample from the normalised positive part of (target - draft).
        new = q - p
        new = torch.where(new > 0, new, 0.0)
        new = new / new.sum()
        next_token = multinomial_sample_one_no_sync(new)
        return torch.cat([draft_tokens[:accept_length], next_token])

@torch.no_grad()
def generate(
    model: Transformer,
    prompt: torch.Tensor,
    embeds: torch.Tensor,
    max_new_tokens: int,
    
    *,
    interactive: bool,
    draft_model: Transformer,
    speculate_k: Optional[int] = 8,
    callback = lambda x: x,
    **sampling_kwargs
) -> Tuple[torch.Tensor, dict]:
    """
    Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.

    Args:
        model: Target model; its KV caches are (re)initialised here via
            ``setup_caches``.
        prompt: 1-D tensor of prompt token ids.
        embeds: Embeddings passed to ``prefill`` together with the prompt.
        max_new_tokens: Number of tokens to append to the prompt.
        interactive: When true the cache length is fixed at 350 instead of
            being derived from the prompt length.
        draft_model: Draft model for speculative decoding, or ``None`` to
            decode token by token with ``model`` only.
        speculate_k: Draft tokens proposed per speculative step. May be
            ``None`` when no draft model is used.
        callback: Called with every token as it is produced.
        **sampling_kwargs: Forwarded to the sampling helpers.

    Returns:
        Tuple ``(seq, generate_stats)``: ``seq`` is the prompt followed by the
        generated tokens, ``generate_stats['accept_counts']`` is a histogram
        of how many tokens each speculative step contributed.
    """

    is_speculative = draft_model is not None
    # create an empty tensor of the expected final shape and fill in the current tokens
    T = prompt.size(0)
    T_new = T + max_new_tokens
    if interactive:
        max_seq_length = 350
    else:
        max_seq_length = min(T_new, model.config.block_size)

    device, dtype = prompt.device, prompt.dtype
    # Speculative steps may run up to speculate_k + 1 positions past the end.
    max_seq_length = max_seq_length + speculate_k + 1 if is_speculative else max_seq_length
    with torch.device(device):
        model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
        if is_speculative and draft_model is not model:
            draft_model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)

    seq = torch.empty(T_new, dtype=dtype, device=device)
    seq[:T] = prompt
    input_pos = torch.arange(0, T, device=device)

    print("prefill")
    next_token = prefill(model, prompt.view(1, -1), embeds, input_pos, **sampling_kwargs)
    if is_speculative:
        # The draft model needs the same arguments as the target model; the
        # previous call left out `embeds`, which shifted `input_pos` into the
        # `embeds` slot and raised a TypeError.
        prefill(draft_model, prompt.view(1, -1), embeds, input_pos, **sampling_kwargs)
    seq[T] = next_token

    input_pos = torch.tensor([T], device=device, dtype=torch.int)
    # speculate_k may be None when decoding without a draft model.
    accept_counts = [0] * ((speculate_k or 0) + 1)

    if is_speculative:
        input_pos = input_pos.item()  # for speculative decoding easier to keep on host
        while input_pos < T_new - 1:
            cur_token = next_token.view(())

            next_tokens = speculative_decode(
                model, draft_model, cur_token, input_pos, speculate_k, **sampling_kwargs
            )

            accept_counts[len(next_tokens) - 1] += 1
            # Never write past the end of the preallocated sequence.
            num_added = min(T_new - input_pos - 1, len(next_tokens))
            seq[input_pos + 1 : input_pos + num_added + 1] = next_tokens[:num_added]
            for token in next_tokens[:num_added]:
                callback(token)
            input_pos = input_pos + num_added
            next_token = next_tokens[-1]
    else:
        generated_tokens, _ = decode_n_tokens(model, next_token.view(1, -1), input_pos, max_new_tokens - 1, callback=callback, **sampling_kwargs)
        # With max_new_tokens == 1 nothing is decoded here and torch.cat would
        # raise on the empty list.
        if generated_tokens:
            seq[T + 1:] = torch.cat(generated_tokens)

    generate_stats = {
        'accept_counts': accept_counts
    }
    return seq, generate_stats

def encode_tokens(tokenizer, string, bos=True, device='cuda'):
    """Tokenise ``string`` and return the ids as an int tensor on ``device``.

    Args:
        tokenizer: Object providing ``encode(str)`` and ``bos_id()``.
        string: Text to encode.
        bos: When true, the tokenizer's BOS id is placed in front of the ids.
        device: Device on which the tensor is created.

    Returns:
        1-D ``torch.int`` tensor of token ids.
    """
    ids = tokenizer.encode(string)
    token_ids = [tokenizer.bos_id()] + ids if bos else ids
    return torch.tensor(token_ids, dtype=torch.int, device=device)

def _load_model(checkpoint_path, device, precision, use_tp):
    """Build a ``Transformer`` from a checkpoint and move it to ``device``.

    The model configuration is chosen from the name of the checkpoint's
    parent directory. Quantised checkpoints are recognised by ``"int8"`` or
    ``"int4"`` appearing in the checkpoint path; for int4 the group size is
    parsed from a ``.g<N>.`` component of the file name.

    Args:
        checkpoint_path: ``pathlib.Path`` of the checkpoint file.
        device: Target device for the loaded model.
        precision: dtype the model is cast to. It was previously ignored in
            favour of a hard-coded ``torch.bfloat16``; the caller in this file
            passes ``torch.bfloat16``, so its behaviour is unchanged.
        use_tp: When true, tensor parallelism is applied via ``tp.apply_tp``.

    Returns:
        The model in eval mode.
    """
    # Instantiate on the meta device; the weights come from the checkpoint
    # below (load_state_dict with assign=True).
    with torch.device('meta'):
        model = Transformer.from_name(checkpoint_path.parent.name)

    if "int8" in str(checkpoint_path):
        print("Using int8 weight-only quantization!")
        from quantize import WeightOnlyInt8QuantHandler
        simple_quantizer = WeightOnlyInt8QuantHandler(model)
        model = simple_quantizer.convert_for_runtime()

    if "int4" in str(checkpoint_path):
        print("Using int4 quantization!")
        path_comps = checkpoint_path.name.split(".")
        # The second-to-last name component encodes the group size, e.g. "g32".
        assert path_comps[-2].startswith("g")
        groupsize = int(path_comps[-2][1:])
        from quantize import WeightOnlyInt4QuantHandler
        simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
        model = simple_quantizer.convert_for_runtime()

    checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
    model.load_state_dict(checkpoint, assign=True)

    if use_tp:
        from tp import apply_tp
        print("Applying tensor parallel to model ...")
        apply_tp(model)

    model = model.to(device=device, dtype=precision)
    return model.eval()

# Markers wrapped around the user prompt for chat checkpoints.
B_INST = "[INST]"
E_INST = "[/INST]"

def _parse_loc_tokens(decoded_output):
    """Return the integer values of every ``<locNNNN>`` token in ``decoded_output``, in order."""
    import re

    return [int(value) for value in re.findall(r"<loc(\d+)>", decoded_output)]


def _loc_values_to_pixel_box(locations, frame_width, frame_height, loc_bins=1024):
    """Scale the first four location values to a pixel box ``[x1, y1, x2, y2]``.

    The values are read as ``(y_min, x_min, y_max, x_max)`` on a
    ``loc_bins``-wide grid. For an 854x480 frame this reproduces the numbers
    the previous hard-coded conversion produced.
    """
    y_min, x_min, y_max, x_max = locations[:4]
    return [
        int(x_min * frame_width / loc_bins),
        int(y_min * frame_height / loc_bins),
        int(x_max * frame_width / loc_bins),
        int(y_max * frame_height / loc_bins),
    ]


def main(
    prompt: str = "Hello, my name is",
    vid_path: str = "",
    vid_start: int = 1,
    vid_end: int = 2,
    interactive: bool = False,
    max_new_tokens: int = 100,
    top_k: int = 200,
    temperature: float = 0.0,
    checkpoint_path: Path = Path("checkpoints/meta-Transformer/Transformer-2-7b-chat-hf/model.pth"),
    compile: bool = True,
    compile_prefill: bool = False,
    profile: Optional[Path] = None,
    draft_checkpoint_path: Optional[Path] = None,
    speculate_k: int = 5,
    device='cuda',
) -> None:
    """Run prompt-conditioned detection on a video clip and write an annotated copy.

    Frames between ``vid_start`` and ``vid_end`` seconds are read from
    ``vid_path``. Every second frame is passed, together with ``prompt``,
    through the PaliGemma processor / vision tower and then through
    ``generate``. ``<locNNNN>`` tokens in the decoded text are turned into a
    bounding box that is drawn on the frames until the next detection replaces
    it. The result is written to ``output_video.mp4``.

    Args:
        prompt: Text prompt given to the processor for every analysed frame.
        vid_path: Path of the input video.
        vid_start: Start of the analysed window in seconds (inclusive).
        vid_end: End of the analysed window in seconds (exclusive).
        interactive: Ask for a prompt on stdin for each analysed frame and
            stream the generated text.
        max_new_tokens: Number of tokens generated per analysed frame.
        top_k: Forwarded to ``generate`` as a sampling argument.
        temperature: Forwarded to ``generate`` as a sampling argument.
        checkpoint_path: Checkpoint of the language model. Its parent
            directory must contain ``tokenizer.model`` and is also passed to
            ``PaliGemmaForConditionalGeneration.from_pretrained``.
        compile: Compile ``decode_one_token`` (and ``model_forward`` when
            speculative decoding is active) with ``torch.compile``.
        compile_prefill: Also compile ``prefill``.
        profile: Base path for a profiler trace. NOTE(review): the profiler
            context is always a null context here, so no trace is written.
        draft_checkpoint_path: Checkpoint of a draft model; enables
            speculative decoding.
        speculate_k: Draft tokens per speculative step.
        device: Device used for loading the language model. NOTE(review): it
            is replaced by ``"cuda:0"`` once the models are loaded.

    Raises:
        RuntimeError: If no frame could be read from ``vid_path``.
    """
    import contextlib

    assert checkpoint_path.is_file(), checkpoint_path

    tokenizer_path = checkpoint_path.parent / "tokenizer.model"
    assert tokenizer_path.is_file(), tokenizer_path

    global print
    from tp import maybe_init_dist
    rank = maybe_init_dist()
    use_tp = rank is not None
    if use_tp:
        if rank != 0:
            # only print on rank 0
            print = lambda *args, **kwargs: None

    print(f"Using device={device}")
    precision = torch.bfloat16
    is_speculative = draft_checkpoint_path is not None
    is_chat = "chat" in str(checkpoint_path)

    print("Loading model ...")
    t0 = time.time()
    model = _load_model(checkpoint_path, device, precision, use_tp)

    if is_speculative:
        draft_model = _load_model(draft_checkpoint_path, device, precision, use_tp)
    else:
        draft_model = None

    model_id = "google/paligemma-3b-mix-224"
    # NOTE(review): from here on the `device` argument is overridden.
    device = "cuda:0"
    dtype = torch.bfloat16

    # Only the vision tower and the projector of the Hugging Face model are
    # used below; the language model is the one loaded by _load_model.
    _model = PaliGemmaForConditionalGeneration.from_pretrained(
        checkpoint_path.parent,
        torch_dtype=dtype,
        device_map=device,
        revision="bfloat16",
    ).eval()

    vision_model = _model.vision_tower
    projector = _model.multi_modal_projector
    processor = AutoProcessor.from_pretrained(model_id)

    # The interactive branch below encodes the typed prompt with the
    # SentencePiece tokenizer; it was previously referenced without ever
    # being created, which raised NameError on the first frame.
    tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path)) if interactive else None

    device_sync(device=device) # MKG
    print(f"Time to load model: {time.time() - t0:.02f} seconds")

    cap = cv2.VideoCapture(vid_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frames = []
        frame_size = None  # (width, height) of the last frame that decoded
        for i in range(int(fps * vid_end)):
            ret, frame = cap.read()
            if not ret:
                # End of video (or read error): `frame` is None here, so it
                # must not be used to size the writer.
                break
            frame_size = (frame.shape[1], frame.shape[0])
            # >= so the frame exactly at vid_start (frame 0 for vid_start=0)
            # is part of the clip.
            if i >= fps * vid_start:
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    finally:
        cap.release()

    if frame_size is None:
        raise RuntimeError(f"Could not read any frame from video: {vid_path!r}")

    model_size = sum(p.numel() * p.dtype.itemsize for p in itertools.chain(model.parameters(), model.buffers()))
    if compile:
        if is_speculative and use_tp: # and ("cuda" in device):
            torch._inductor.config.triton.cudagraph_trees = False # Bug with cudagraph trees in this case

        if is_speculative:
            global model_forward
            model_forward = torch.compile(model_forward, mode="reduce-overhead", fullgraph=True)

        global decode_one_token, prefill
        decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)

        # Use the function argument, not the script-level `args` namespace,
        # so main() also works when imported and called directly.
        if compile_prefill:
            prefill = torch.compile(prefill, fullgraph=True, dynamic=True)

    aggregate_metrics = {
        'tokens_per_sec': [],
        'accept_counts': [],
    }

    embed = model.get_tok_embeddings()

    # Most recent box as [x1, y1, x2, y2]; reused for frames without a new detection.
    bounding_boxes = []

    out = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, frame_size)
    try:
        for i, frame in enumerate(frames):

            # Inference runs on every second frame only.
            if i % 2 == 0:
                # No gradients are needed for building the model inputs.
                with torch.no_grad():
                    model_inputs = processor(text=prompt, images=frame, return_tensors="pt").to('cuda:0')
                    encoded = model_inputs['input_ids'][0]
                    prompt_length = encoded.size(0)

                    embedding_values = embed(encoded)

                    img_embed = projector(vision_model(model_inputs.pixel_values.to(dtype=torch.bfloat16)).last_hidden_state)

                    # NOTE(review): 2048 is presumably the language model's
                    # hidden size and 256 the number of image tokens -- confirm
                    # against the model configuration.
                    img_embed = img_embed / (2048 ** 0.5)

                    # Overwrite the first 256 token embeddings with the image embeddings.
                    embedding_values[:256, :] = img_embed[0]

                    embedding_values = embedding_values.unsqueeze(0)

                device_sync(device=device) # MKG
                if interactive:
                    prompt = input("What is your prompt? ")
                    if is_chat:
                        prompt = f"{B_INST} {prompt.strip()} {E_INST}"
                    # NOTE(review): `encoded` is replaced here while
                    # `embedding_values` still belongs to the processor-encoded
                    # prompt; verify that the two stay compatible.
                    encoded = encode_tokens(tokenizer, prompt, bos=True, device=device)

                    buffer = []
                    period_id = tokenizer.encode('.')[0]
                    done_generating = False
                    def callback(x):
                        nonlocal done_generating
                        if done_generating:
                            return
                        buffer.append(tokenizer.decode([period_id] + x.tolist())[1:])
                        if x.item() == tokenizer.eos_id():
                            done_generating = True
                        if len(buffer) == 4 or done_generating:
                            print(''.join(buffer), end='', flush=True)
                            buffer.clear()
                else:
                    callback = lambda x : x
                t0 = time.perf_counter()
                prof = contextlib.nullcontext()

                with prof:
                    y, metrics = generate(
                        model,
                        encoded,
                        embedding_values,
                        max_new_tokens,
                        draft_model=draft_model,
                        speculate_k=speculate_k,
                        interactive=interactive,
                        callback=callback,
                        temperature=temperature,
                        top_k=top_k,
                    )
                    aggregate_metrics['accept_counts'].append(metrics['accept_counts'])
                if hasattr(prof, "export_chrome_trace"):
                    if use_tp:
                        prof.export_chrome_trace(f"{profile}_rank_{rank}.json")
                    else:
                        prof.export_chrome_trace(f"{profile}.json")
                device_sync(device=device) # MKG
                t = time.perf_counter() - t0

                # Decode once and reuse the text for printing and parsing.
                decoded_output = processor.decode(y, skip_special_tokens=True)

                if not interactive:
                    print(decoded_output)
                else:
                    print()

                tokens_generated = y.size(0) - prompt_length
                tokens_sec = tokens_generated / t

                aggregate_metrics['tokens_per_sec'].append(tokens_sec)

                new_model_fps = int(1 / t)
                print(f"Model fps {new_model_fps}")
                print(f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec")
                print(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")

                print(decoded_output)
                # Outputs containing ';' (several detections) are ignored, as before.
                if ';' not in decoded_output:
                    locations = _parse_loc_tokens(decoded_output)
                else:
                    locations = []

                # A box needs four values; with fewer, keep the previous box
                # instead of failing with an IndexError.
                if len(locations) >= 4:
                    bounding_boxes = _loc_values_to_pixel_box(locations, frame.width, frame.height)

            # Draw bounding boxes on the frame if locations are detected
            if bounding_boxes:
                draw = ImageDraw.Draw(frame)

                draw.rectangle(bounding_boxes, outline="lime", width=3)
                text_position = (bounding_boxes[2] - 5, bounding_boxes[3] - 5)
                draw.text(text_position, "car", fill="lime", font_size=30)

            bgr_frame = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
            out.write(bgr_frame)
    finally:
        # Finalise the output file even if inference fails part-way through.
        out.release()

    print("Video saved as output_video.mp4")
    print("==========")

    if is_speculative:
        counts_aggregated = [sum(i) for i in zip(*aggregate_metrics['accept_counts'])]
        acceptance_probs = [i/sum(counts_aggregated) for i in counts_aggregated]
        print(f"Acceptance probs: {acceptance_probs}")
        print(f"Mean Accepted: {sum([idx * i for idx, i in enumerate(counts_aggregated)])/sum(counts_aggregated)}")

    print(f"Average tokens/sec: {torch.mean(torch.tensor(aggregate_metrics['tokens_per_sec'])).item():.2f}")
    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")


if __name__ == '__main__':
    # Command-line entry point: parse the options and forward them to main().
    import argparse

    cli = argparse.ArgumentParser(description='Your CLI description.')

    # Video / prompt options
    cli.add_argument('--prompt', type=str, default="detect car", help='Input prompt.')
    cli.add_argument('--vid_path', type=str, default="", help='path to mp4 video.')
    cli.add_argument('--vid_start', type=int, default=0, help='Where in video to start detecting (seconds).')
    cli.add_argument('--vid_end', type=int, default=10, help='Where in video to end detecting (seconds).')

    # Generation options
    cli.add_argument('--interactive', action='store_true', help='Whether to launch in interactive mode')
    cli.add_argument('--max_new_tokens', type=int, default=10, help='Maximum number of new tokens.')
    cli.add_argument('--top_k', type=int, default=200, help='Top-k for sampling.')
    cli.add_argument('--temperature', type=float, default=0.8, help='Temperature for sampling.')
    cli.add_argument('--checkpoint_path', type=Path, default=Path("checkpoints/meta-Transformer/Transformer-2-7b-chat-hf/model.pth"), help='Model checkpoint path.')
    cli.add_argument('--compile', action='store_true', help='Whether to compile the model.')
    cli.add_argument('--compile_prefill', action='store_true', help='Whether to compile the prefill (improves prefill perf, but higher compile times)')
    cli.add_argument('--profile', type=Path, default=None, help='Profile path.')
    cli.add_argument('--speculate_k', type=int, default=5, help='Speculative execution depth.')
    cli.add_argument('--draft_checkpoint_path', type=Path, default=None, help='Draft checkpoint path.')
    cli.add_argument('--device', type=str, default="cuda", help='device to use')

    # Must stay named `args`: it is a module-level name when run as a script.
    args = cli.parse_args()

    main(
        prompt=args.prompt,
        vid_path=args.vid_path,
        vid_start=args.vid_start,
        vid_end=args.vid_end,
        interactive=args.interactive,
        max_new_tokens=args.max_new_tokens,
        top_k=args.top_k,
        temperature=args.temperature,
        checkpoint_path=args.checkpoint_path,
        compile=args.compile,
        compile_prefill=args.compile_prefill,
        profile=args.profile,
        draft_checkpoint_path=args.draft_checkpoint_path,
        speculate_k=args.speculate_k,
        device=args.device,
    )