forked from lancedb/vectordb-recipes
Commit: Merge branch 'lancedb:main' into main
Showing 54 changed files with 13,961 additions and 9,377 deletions.
@@ -0,0 +1 @@
COHERE_API_KEY = pastyourapikeyhere
@@ -0,0 +1,45 @@
# Multilingual-RAG

![Multilingual-RAG](https://github.com/akashAD98/Multilingual-RAG/assets/62583018/a84e1839-a311-496c-b545-3533ef348dea.png)

## Overview
Multilingual-RAG is a question-answering system that understands and generates responses in multiple languages. It is built on Large Language Models (LLMs) with Retrieval-Augmented Generation (RAG) and combines Cohere's multilingual embeddings, the LanceDB vector store, LangChain for question answering, and Argos Translate for translation between languages. The user interface is built with Gradio for a smooth, interactive experience.
## Supported Languages
Multilingual-RAG is designed to support over 100 languages; the exact list depends on the capabilities of the Cohere multilingual model and Argos Translate. By default it is configured for English, Hindi, French, and Turkish, and additional languages can be added to suit your use case, as shown in the sketch below.
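As a minimal sketch (assuming you keep the `LANGUAGE_ISO_CODES` mapping used in `main.py`), adding a language means extending that dictionary with an ISO code for which Argos Translate provides translation packages; Spanish below is a hypothetical example:

```
# main.py -- extend the language mapping (Spanish shown as a hypothetical addition)
LANGUAGE_ISO_CODES = {
    "English": "en",
    "Hindi": "hi",
    "Turkish": "tr",
    "French": "fr",
    "Spanish": "es",  # assumes Argos Translate en<->es packages are available
}
```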
## Getting Started
Follow these instructions to set up Multilingual-RAG in your local environment.

### Prerequisites
Ensure you have the following prerequisites installed:
- Python 3.x
Create a `.env` file and add your Cohere API key: rename `.env-example` to `.env` and paste your API key into it.
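The resulting `.env` should contain a single line in the same format as `.env-example` (the value below is just a placeholder):

```
COHERE_API_KEY = your-cohere-api-key
```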
## Installation
Install the required dependencies with the following command:

```
pip install -r requirements.txt
```
For Argos Translate, you can install it from source as follows:

```
git clone https://github.com/argosopentech/argos-translate.git
cd argos-translate
virtualenv env
source env/bin/activate
pip install -e .
```
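If you would rather not build from source, Argos Translate is also published on PyPI, so a plain pip install should work as well (an alternative sketch, assuming your environment allows it):

```
pip install argostranslate
```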
## Running the App
The app currently supports .txt and .pdf input files; set the input file path inside `main.py`, then run:

```
python3 main.py
```
@@ -0,0 +1,218 @@
import os
import dotenv
import gradio as gr
import lancedb
import logging
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.llms import Cohere
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import LanceDB
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import argostranslate.package
import argostranslate.translate


# Configuration Management
dotenv.load_dotenv(".env")
DB_PATH = "/tmp/lancedb"

COHERE_MODEL_NAME = "multilingual-22-12"
LANGUAGE_ISO_CODES = {
    "English": "en",
    "Hindi": "hi",
    "Turkish": "tr",
    "French": "fr",
}

# Logging Configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def initialize_documents_and_embeddings(input_file_path):
    """
    Initialize documents and their embeddings from a given file.
    Parameters:
    - input_file_path (str): The path to the input file. Supported formats are .txt and .pdf.
    Returns:
    - tuple: A tuple containing a list of texts split from the document and the embeddings object.
    """
    file_extension = os.path.splitext(input_file_path)[1]
    if file_extension == ".txt":
        logger.info("txt file processing")
        # Handle text file
        loader = TextLoader(input_file_path)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
        texts = text_splitter.split_documents(documents)
    elif file_extension == ".pdf":
        logger.info("pdf file processing")
        # Handle PDF file
        loader = PyPDFLoader(input_file_path)
        texts = loader.load_and_split()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
        texts = text_splitter.split_documents(texts)
    else:
        raise ValueError(
            "Unsupported file type. Supported files are .txt and .pdf only."
        )

    embeddings = CohereEmbeddings(model=COHERE_MODEL_NAME)
    return texts, embeddings

# Database Initialization
def initialize_database(texts, embeddings):
    """
    Initialize and populate a LanceDB database with documents and their embeddings.
    Parameters:
    - texts (list): A list of texts to be stored in the database.
    - embeddings (CohereEmbeddings): An embeddings object used to generate vector embeddings for the texts.
    Returns:
    - LanceDB: An instance of LanceDB with the documents and their embeddings stored.
    """
    db = lancedb.connect(DB_PATH)
    table = db.create_table(
        "multiling-rag",
        data=[
            {
                "vector": embeddings.embed_query("Hello World"),
                "text": "Hello World",
                "id": "1",
            }
        ],
        mode="overwrite",
    )
    return LanceDB.from_documents(texts, embeddings, connection=table)


# Translation Function
def translate_text(text, from_code, to_code):
    """
    Translate a given text from one language to another.
    Parameters:
    - text (str): The text to translate.
    - from_code (str): The ISO language code of the source language.
    - to_code (str): The ISO language code of the target language.
    Returns:
    - str: The translated text.
    """
    try:
        argostranslate.package.update_package_index()
        available_packages = argostranslate.package.get_available_packages()
        package_to_install = next(
            filter(
                lambda x: x.from_code == from_code and x.to_code == to_code,
                available_packages,
            )
        )
        argostranslate.package.install_from_path(package_to_install.download())
        return argostranslate.translate.translate(text, from_code, to_code)
    except Exception as e:
        logger.error(f"Error in translate_text: {str(e)}")
        return "Translation error"


prompt_template = """Text: {context}
Question: {question}
Answer the question based on the text provided. If the text doesn't contain the answer, reply that the answer is not available."""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Question Answering Function
def answer_question(question, input_language, output_language, db):
    """
    Answer a given question by retrieving relevant information from a database,
    translating the question and answer if necessary.
    Parameters:
    - question (str): The question to answer.
    - input_language (str): The language of the input question.
    - output_language (str): The desired language of the answer.
    - db (LanceDB): The LanceDB instance to use for information retrieval.
    Returns:
    - str: The answer to the question, in the desired output language.
    """
    try:
        input_lang_code = LANGUAGE_ISO_CODES[input_language]
        output_lang_code = LANGUAGE_ISO_CODES[output_language]

        question_in_english = (
            translate_text(question, from_code=input_lang_code, to_code="en")
            if input_language != "English"
            else question
        )
        prompt = PromptTemplate(
            template=prompt_template, input_variables=["context", "question"]
        )
        qa = RetrievalQA.from_chain_type(
            llm=Cohere(model="command", temperature=0),
            chain_type="stuff",
            retriever=db.as_retriever(),
            chain_type_kwargs={"prompt": prompt},
            return_source_documents=True,
        )

        answer = qa({"query": question_in_english})
        result_in_english = answer["result"].replace("\n", "").replace("Answer:", "")

        return (
            translate_text(result_in_english, from_code="en", to_code=output_lang_code)
            if output_language != "English"
            else result_in_english
        )
    except Exception as e:
        logger.error(f"Error in answer_question: {str(e)}")
        return "An error occurred while processing your question. Please try again."


def setup_gradio_interface(db):
    """
    Setup a Gradio interface for interacting with the multilingual chatbot.
    Parameters:
    - db (LanceDB): The database instance to use for information retrieval.
    Returns:
    - gr.Interface: A Gradio interface object for the chatbot.
    """
    return gr.Interface(
        fn=lambda question, input_language, output_language: answer_question(
            question, input_language, output_language, db
        ),
        inputs=[
            gr.Textbox(lines=2, placeholder="Type your question here..."),
            gr.Dropdown(list(LANGUAGE_ISO_CODES.keys()), label="Input Language"),
            gr.Dropdown(list(LANGUAGE_ISO_CODES.keys()), label="Output Language"),
        ],
        outputs="text",
        title="Multilingual Chatbot",
        description="Ask any question in your chosen language and get an answer in the language of your choice.",
    )


# Main Function
def main():
    INPUT_FILE_PATH = "healthy-diet-fact-sheet-394.pdf"
    texts, embeddings = initialize_documents_and_embeddings(INPUT_FILE_PATH)
    db = initialize_database(texts, embeddings)
    iface = setup_gradio_interface(db)
    iface.launch(share=True, debug=True)


if __name__ == "__main__":
    main()
@@ -0,0 +1,5 @@
cohere
langchain
lancedb
python-dotenv
gradio
@@ -0,0 +1,2 @@
.lancedb
example_data/
@@ -0,0 +1,41 @@
# Talk to Github CodeSpaces using Qwen1.5

Using this application, you can talk to GitHub repositories. It clones a repository and embeds all of its Markdown, Python, and JavaScript files. The application uses the newly launched Qwen1.5 as the LLM.
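As a rough sketch of that flow (the real logic lives in the app's own code; `GitLoader`, the example repository URL, and the model call below are illustrative assumptions, not the app's implementation):

```
# Sketch: clone a repo, keep .md/.py/.js files, and query Qwen1.5 through Ollama.
from langchain_community.document_loaders import GitLoader  # requires GitPython
from langchain_community.llms import Ollama

loader = GitLoader(
    clone_url="https://github.com/lancedb/vectordb-recipes",  # hypothetical example repo
    repo_path="./repo_clone",
    file_filter=lambda path: path.endswith((".md", ".py", ".js")),
)
docs = loader.load()

llm = Ollama(model="qwen")  # Qwen1.5 served locally via Ollama (`ollama pull qwen`)
print(llm.invoke(f"Summarize this file:\n\n{docs[0].page_content[:2000]}"))
```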
---
**NOTE** <br>
An OpenAI API key is not required for this application.

---

1. Install Dependencies
```
pip install -r requirements.txt
```
2. Ollama Installation (Linux)
```
curl https://ollama.ai/install.sh | sh
ollama pull qwen
```
For Mac:
```
brew install ollama
```
Then, in a separate terminal, pull the Qwen model:
```
ollama pull qwen
```
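Once the model is pulled, one quick way to sanity-check the Ollama setup before starting the app (optional):

```
ollama run qwen "Say hello"
```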
### YouTube Demo
![demo_img](../../assets/talk-with-github.jpg)

You are ready to start.

## Run Streamlit App

Run the application:
```
streamlit run app.py
```