# rag_over_documents_using_codellama.py

# -*- coding: utf-8 -*-
"""RAG_over_documents_using_CodeLLama.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/14iZd5e9D1l5-TeBBOqoZ7v7WIfcXVO2p
"""

!pip install llama-index transformers accelerate bitsandbytes pypdf

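# NOTE: Colab comments out IPython `%` magics (the `# %pip ...` lines below) when
# exporting a notebook to a .py file. The `!pip ...` lines are notebook shell
# escapes and are not valid plain Python; run them in a notebook cell or a terminal.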
!pip install llama-index-readers-file llama-index-readers-web
!pip install unstructured
# %pip install llama-index
# %pip install transformers accelerate bitsandbytes
# %pip install llama-index-readers-web
# %pip install llama-index-llms-huggingface
# %pip install llama-index-embeddings-huggingface
# %pip install llama-index-program-openai
# %pip install llama-index-agent-openai
# %pip install -U bitsandbytes
!pip install -U sentence-transformers
!pip install llama-index
!pip install llama-index-embeddings-huggingface
!pip install chromadb llama-index-vector-stores-chroma pinecone-client llama-index-vector-stores-pinecone

"""## Setup

### Data

All source documents are read from the `/content/verilog` folder.
"""

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load the files found in /content/verilog as the documents to be indexed.
documents = SimpleDirectoryReader("/content/verilog").load_data()

"""Since I'm using `meta-llama/Llama-2-7b-chat-hf` as my LLM, it requires my huggingface authentication."""

!huggingface-cli login

"""### LLM

This should run on a T4 instance on the free tier
"""

import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

# Without quantization the model does not fit in the memory of Colab's free tier.
# NOTE(review): this comment originally named meta-llama/Llama-2-7b-chat-hf, but the
# model actually loaded further down is meta-llama/Meta-Llama-3.1-8B-Instruct.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the computations
    bnb_4bit_quant_type="nf4",  # 4-bit NormalFloat quantization data type
    bnb_4bit_use_double_quant=True,  # nested quantization of the quantization constants
)


def messages_to_prompt(messages):
    """Flatten chat messages into a single role-tagged prompt string.

    Every message whose ``role`` equals ``'system'``, ``'user'`` or
    ``'assistant'`` is rendered as a ``<|role|>`` tag line, followed by the
    message content and a closing ``</s>`` line; messages with any other role
    are skipped. If the rendered text does not begin with a system section,
    an empty one is put in front. The result always ends with an open
    ``<|assistant|>`` tag line.

    Args:
        messages: iterable of objects exposing ``role`` and ``content``.

    Returns:
        The assembled prompt string.
    """
    known_roles = ("system", "user", "assistant")
    system_tag = "<|system|>\n"

    segments = []
    for msg in messages:
        # Match with ``==`` rather than a dict lookup so that role objects which
        # merely compare equal to these strings are still recognised.
        tag = next((name for name in known_roles if msg.role == name), None)
        if tag is None:
            continue
        segments.append(f"<|{tag}|>\n{msg.content}</s>\n")

    body = "".join(segments)

    # The prompt must open with a system section; add a blank one if missing.
    if not body.startswith(system_tag):
        body = f"{system_tag}</s>\n{body}"

    # Leave the assistant turn open for the model to complete.
    return f"{body}<|assistant|>\n"


# llm = HuggingFaceLLM(
#     model_name="meta-llama/Llama-2-7b-chat-hf",
#     tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
#     query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
#     context_window=3900,
#     max_new_tokens=256,
#     model_kwargs={"quantization_config": quantization_config},
#     # tokenizer_kwargs={},
#     generate_kwargs={"temperature": 0.3, "top_k": 50, "top_p": 0.95},
#     messages_to_prompt=messages_to_prompt,
#     device_map="auto",

# )

import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

# Same 4-bit quantization settings as the quantization_config defined earlier in
# this file (presumably repeated so that this cell can be run on its own).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the computations
    bnb_4bit_quant_type="nf4",  # 4-bit NormalFloat quantization data type
    bnb_4bit_use_double_quant=True,  # nested quantization of the quantization constants
)


from transformers import AutoTokenizer

# Tokenizer for the same checkpoint as the LLM below; in this file it is only used
# to look up the EOS token id that is passed as pad_token_id.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

# NOTE(review): this template is never passed to HuggingFaceLLM below, which builds
# its own, shorter wrapper prompt without the SystemVerilog instruction — confirm
# which of the two is intended.
query_wrapper_prompt = PromptTemplate(
    template="<s> [INST] You are an expert in SystemVerilog. Your task is to generate a document based on the user's query. \n\n{query_str} [/INST] "
)


# Local, 4-bit quantized LLM used for both the query engine and the chat engine.
# NOTE(review): the "[INST] ... [/INST]" wrapper and the "<|system|>"/"<|user|>" tags
# produced by messages_to_prompt look like prompt formats of other model families
# rather than Llama 3.1's chat template — TODO confirm against the model card.
llm = HuggingFaceLLM(
    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tokenizer_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    query_wrapper_prompt=PromptTemplate("<s> [INST] {query_str} [/INST] "),
    context_window=128000,
    max_new_tokens=1024,
    messages_to_prompt=messages_to_prompt,
    model_kwargs={
        # "token": hf_token,
        "quantization_config": quantization_config,
        "pad_token_id": tokenizer.eos_token_id,  # reuse the EOS token id as the padding token id
    },
    # tokenizer_kwargs={"token": hf_token},
    device_map="auto",
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index import VectorStoreIndex
from llama_index.core import Settings

# Set up the embedding model and register it on llama-index's global Settings object
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext


# Keep the Chroma database on disk under ./content/verilog (relative to the current
# working directory).
# NOTE(review): the documents are read from the absolute path /content/verilog —
# confirm that using a relative path here is intentional.
db = chromadb.PersistentClient(path="./content/verilog")
# Reuse the "quickstart" collection if it already exists, otherwise create it.
chroma_collection = db.get_or_create_collection("quickstart")
# Wrap the collection as a llama-index vector store and expose it through a
# storage context, which is handed to the index when it is built.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# from llama_index import ServiceContext

# service_context = ServiceContext.from_defaults(llm=llm, embed_model="local:BAAI/bge-small-en-v1.5")

"""### Index Setup"""

# Build the vector index from the loaded documents, embedding them with embed_model
# and storing the vectors through the Chroma-backed storage context.
# NOTE(review): the Chroma collection is persistent, so re-running this step
# presumably inserts the same documents again — confirm.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model, storage_context=storage_context)

"""### Helpful Imports / Logging"""

from llama_index.core.response.notebook_utils import display_response

import logging
import sys

# Send INFO-level (and above) log records to stdout through exactly one handler.
# force=True replaces any handlers already attached to the root logger; without it
# basicConfig is a silent no-op whenever the environment has pre-installed a handler,
# and the INFO level is then never applied. The former extra
# addHandler(StreamHandler(sys.stdout)) call is dropped because, combined with the
# handler created by basicConfig, it made every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index.core import get_response_synthesizer

# Response synthesizer that uses the local LLM to turn retrieved text into an answer.
response_synthesizer = get_response_synthesizer(llm=llm)
# Query engine over the index, retrieving the 3 most similar entries per query.
# NOTE(review): a ready-made response_synthesizer is passed together with
# response_mode="refine"; the mode may be ignored in that case — TODO confirm
# against the llama-index as_query_engine documentation.
query_engine = index.as_query_engine(llm=llm, response_synthesizer=response_synthesizer,response_mode="refine",similarity_top_k=3)

# Instruction text handed to the chat engine below as initial_prompt.
message="You are an expert in SystemVerilog and HDL, known for providing accurate and detailed answers. Your Task is to generate Document for the code .Don't include metadata info in document"

# Chat engine over the same index, created in "react" chat mode with verbose output.
# NOTE(review): it is not visible from this file whether as_chat_engine honours an
# `initial_prompt` keyword — verify against the llama-index documentation.
chat_engine = index.as_chat_engine(chat_mode="react",llm=llm,initial_prompt=message, response_synthesizer=response_synthesizer,response_mode="compact",verbose=True)

"""## Basic Query Engine"""

# Example input for the query below: source of a parameterized modulo-N counter
# (counts up while `en` is high, wraps to 0 after N-1, reset when `rstn` is low).
verilog_prompt="""module modN_ctr
  # (parameter N = 10,
     parameter WIDTH = 4)

  ( input   clk,
    input   en,
    input   rstn,
    output  reg[WIDTH-1:0] out);

  always @ (posedge clk) begin
    if (!rstn) begin
      out <= 0;
    end else begin
      if (out == N-1) begin
        out <= 0;
      end
      else if (en == 1) begin
        out <= out + 1;
      end
    end
  end
endmodule


"""

# query_engine = index.as_query_engine(response_mode="compact")

# Ask the query engine to document the module; the Verilog source is placed at the
# start of the query text, followed by the instruction.
# NOTE(review): the instruction contains typos ("documnent", "eloberate"); they are
# left untouched here because this is runtime text sent to the model.
response = query_engine.query(f"{verilog_prompt} create a documnent for above module and eloberate the functionality of code")

# Show the response using llama-index's notebook display helper.
display_response(response)