from dataclasses import dataclass
from argparse import ArgumentParser

import semantic_kernel as sk
from semantic_kernel.connectors.ai.open_ai import OpenAITextCompletion

from pyserini.demo.aclchatgpt.skill import PyseriniSkill, PyseriniConfig


@dataclass
class OpenAIConfig:
    """Credentials for the OpenAI text-completion service."""

    api_key: str  # OpenAI API key (loaded from OPENAI_API_KEY in .env)
    org_id: str   # OpenAI organization id (loaded from OPENAI_ORG_ID in .env)


class ChatBot:
    """Console chatbot that answers questions about ACL Anthology papers.

    Retrieval is delegated to PyseriniSkill (registered with the kernel under
    the ``pyserini`` skill name) and answer generation to an OpenAI
    text-completion model wired up through Semantic Kernel.
    """

    # Prompt that grounds the answer in Pyserini search results and asks the
    # model to cite the docid of any document it actually used.
    acl_chat_prompt = """Given the query_results below, your task is to formulate an answer. You may choose to use or not to use the
    information in the query_results.
    If you use the query_results, you must reference the docid of the document used by appending to the answer with "(docid: doc-id-here)".
    If you do not use the query_results, do not reference it in your answer.

    ===================
    query_results: {{pyserini.search $input}}
    ===================

    What is your response to {{$input}}?
    """

    # Prompt that rewrites a follow-up question into a self-contained
    # ("absolute") question using the running conversation history.
    absolute_question_prompt = """
    Task: You are an AI language model tasked with transforming given questions into
    absolute questions. An absolute question is a question that can stand on its own and carries all the context needed
    to be answered. Here's an example:

    User: Who is Alan Turing?
    ChatBot: Who is Alan Turing?
    User: How old is he?
    ChatBot: How old is Alan Turing?

    ===================
    History: {{$history}}
    ===================

    Using the history as context, transform the following question into an absolute question: {{$input}}
    """

    def __init__(self, pyserini_config: PyseriniConfig, openai_config: OpenAIConfig):
        """Wire up the kernel: completion service, Pyserini skill, context
        variables, and the two semantic functions used on every turn.

        Args:
            pyserini_config: BM25 parameters and hit count for retrieval.
            openai_config: OpenAI API credentials.
        """
        self.kernel = sk.Kernel()
        self.kernel.add_text_completion_service(
            "dv",
            OpenAITextCompletion("text-davinci-003", openai_config.api_key, openai_config.org_id),
        )
        self.kernel.import_skill(PyseriniSkill(pyserini_config), "pyserini")

        self.context = self.kernel.create_new_context()
        self.context["url"] = "http://127.0.0.1:8080/search"
        self.context["history"] = ""

        self.acl_chat_function = self.kernel.create_semantic_function(
            self.acl_chat_prompt, max_tokens=200, temperature=0, top_p=0.5
        )
        self.absolute_question_function = self.kernel.create_semantic_function(
            self.absolute_question_prompt, max_tokens=200, temperature=0, top_p=0.5
        )

    def _chat(self, input_text: str) -> None:
        """Run one conversational turn and append it to the history."""
        print("---------------------------------------------")

        # First rewrite the (possibly context-dependent) question so it can
        # stand alone; only the rewritten form is used for retrieval.
        absolute_question = self.absolute_question_function(input_text, context=self.context)
        print(f"Absolute Question: {absolute_question}")

        # Answer the rewritten question, grounded in Pyserini search results.
        answer = self.acl_chat_function(str(absolute_question), context=self.context)
        print(f"ChatBot: {answer}")

        self.context["history"] += f"\nUser: {input_text}\nChatBot: {answer}\n"

    def chat(self) -> None:
        """Interactive REPL: read questions from stdin until EOF or Ctrl+C."""
        print("Hi, I'm the ACL ChatBot. Ask me a question about ACL Anthology papers and I'll do my best to answer it.")

        while True:
            print("=============================================")
            try:
                self._chat(input("User: "))
            except (EOFError, KeyboardInterrupt):
                # Exit cleanly instead of dumping a traceback when stdin
                # closes or the user interrupts; the original loop had no
                # exit path at all.
                print("\nGoodbye!")
                break


def main():
    """Parse CLI arguments, load OpenAI credentials from .env, start the bot."""
    parser = ArgumentParser()
    parser.add_argument('--k1', type=float, help='BM25 k1 parameter.')
    parser.add_argument('--b', type=float, help='BM25 b parameter.')
    parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever')
    args = parser.parse_args()

    # Reads OPENAI_API_KEY / OPENAI_ORG_ID from a .env file in the CWD.
    api_key, org_id = sk.openai_settings_from_dot_env()
    open_ai_config = OpenAIConfig(api_key, org_id)
    pyserini_config = PyseriniConfig(args.k1, args.b, args.hits)

    print("Starting ChatBot...")
    chatbot = ChatBot(pyserini_config=pyserini_config, openai_config=open_ai_config)
    chatbot.chat()


if __name__ == '__main__':
    main()
# Copyright (c) Microsoft. All rights reserved.

import json
import logging
from dataclasses import dataclass

import aiohttp
import requests

from semantic_kernel.orchestration.sk_context import SKContext
from typing import Callable, Optional, Tuple, Union
from semantic_kernel.skill_definition import sk_function, sk_function_context_parameter
from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder


@dataclass
class PyseriniConfig:
    """Retrieval settings for the Pyserini skill.

    k1/b are optional BM25 parameters; when either is missing the searcher's
    defaults are kept. hits is the number of documents to retrieve per query.
    """

    k1: Optional[float] = None
    b: Optional[float] = None
    hits: Optional[int] = 1


class PyseriniSkill:
    """
    A skill that uses Pyserini to search a corpus of documents.

    Usage:
        kernel.import_skill(PyseriniSkill(pyserini_config), "pyserini")

    Examples:

        {{pyserini.search $query}}
    """

    def __init__(self, pyserini_config: PyseriniConfig):
        """Open the ACL paragraph index and configure BM25.

        Args:
            pyserini_config: BM25 parameters and number of hits per query.
        """
        self.lang = 'en'
        self.searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph')
        self.searcher.set_language(self.lang)
        # Override BM25 parameters only when both were supplied; otherwise
        # keep the searcher's defaults.
        if pyserini_config.k1 is not None and pyserini_config.b is not None:
            self.searcher.set_bm25(pyserini_config.k1, pyserini_config.b)
            self.retriever_name = f'BM25 (k1={pyserini_config.k1}, b={pyserini_config.b})'
        else:
            self.retriever_name = 'BM25'
        self.hits = pyserini_config.hits

    @sk_function(description="Searches a corpus of documents using Pyserini using the specified query.", name="search")
    @sk_function_context_parameter(name="url", description="The url of the request")
    async def search(self, query: str, context: SKContext) -> str:
        """
        Searches a corpus of documents using Pyserini using the specified query.

        params:
            query: The query to search for.
            context: The SKContext containing the url of the request.
        returns:
            A "docid:...,doc:..." string describing the top-ranked hit, or an
            empty string when the query is empty or nothing was retrieved.
        raises:
            ValueError: If the context's "url" variable is unset or empty.
        """
        _, url = context.variables.get("url")
        if not url:
            raise ValueError("url cannot be `None` or empty")

        # Bug fix: the original indexed search_results[0] unconditionally,
        # raising IndexError for an empty query or when the index returned
        # zero hits. Return an empty result string in those cases instead.
        if not query:
            return ""

        hits = self.searcher.search(query, k=self.hits)
        if not hits:
            return ""

        # Only the top-ranked hit is ever surfaced to the prompt, so fetch
        # just that document instead of materializing the full result list.
        top = hits[0]
        doc = self.searcher.doc(top.docid)
        return "docid:" + top.docid + ",doc:" + doc.contents()