diff --git a/bot/retrievers/retrieve_similar_nodes.py b/bot/retrievers/retrieve_similar_nodes.py
index e7954f9..52800dd 100644
--- a/bot/retrievers/retrieve_similar_nodes.py
+++ b/bot/retrievers/retrieve_similar_nodes.py
@@ -24,7 +24,6 @@ def __init__(
         """Init params."""
         self._vector_store = vector_store
         self._embed_model = embed_model
-        print(f"type(embed_model): {type(embed_model)} | embed_model: {embed_model}")
         self._similarity_top_k = similarity_top_k
 
     def query_db(
diff --git a/subquery.py b/subquery.py
index 58e2eec..a132394 100644
--- a/subquery.py
+++ b/subquery.py
@@ -7,7 +7,10 @@
 from llama_index.llms.openai import OpenAI
 from llama_index.question_gen.guidance import GuidanceQuestionGenerator
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
-from utils.query_engine import prepare_discord_engine_auto_filter
+from utils.query_engine import (
+    prepare_discord_engine_auto_filter,
+    DEFAULT_GUIDANCE_SUB_QUESTION_PROMPT_TMPL,
+)
 
 
 def query_multiple_source(
@@ -106,6 +109,7 @@ def query_multiple_source(
     question_gen = GuidanceQuestionGenerator.from_defaults(
         guidance_llm=OpenAIChat("gpt-4"),
         verbose=False,
+        prompt_template_str=DEFAULT_GUIDANCE_SUB_QUESTION_PROMPT_TMPL,
     )
     s_engine = SubQuestionQueryEngine.from_defaults(
         question_gen=question_gen,
diff --git a/utils/query_engine/__init__.py b/utils/query_engine/__init__.py
index 01a7658..a988e54 100644
--- a/utils/query_engine/__init__.py
+++ b/utils/query_engine/__init__.py
@@ -1,2 +1,3 @@
 # flake8: noqa
 from .prepare_discord_query_engine import prepare_discord_engine_auto_filter
+from .subquery_gen_prompt import DEFAULT_GUIDANCE_SUB_QUESTION_PROMPT_TMPL
diff --git a/utils/query_engine/subquery_gen_prompt.py b/utils/query_engine/subquery_gen_prompt.py
new file mode 100644
index 0000000..0b089c3
--- /dev/null
+++ b/utils/query_engine/subquery_gen_prompt.py
@@ -0,0 +1,91 @@
+import json
+from typing import Sequence
+
+from llama_index.core.prompts.base import PromptTemplate
+from llama_index.core.prompts.guidance_utils import convert_to_handlebars
+from llama_index.core.question_gen.types import SubQuestion
+from llama_index.core.tools.types import ToolMetadata
+
+
+# deprecated, kept for backward compatibility
+SubQuestionPrompt = PromptTemplate
+
+
+def build_tools_text(tools: Sequence[ToolMetadata]) -> str:
+    tools_dict = {}
+    for tool in tools:
+        tools_dict[tool.name] = tool.description
+    return json.dumps(tools_dict, indent=4)
+
+
+PREFIX = """\
+Given a user question, and a list of tools, output a list of relevant sub-questions \
+in json markdown that when composed can help answer the full user question. \
+Define the sub-questions as search queries that can be used for vector similarity search:
+"""
+
+
+example_query_str = (
+    "What was decided about the token allocation budget for the "
+    "next airdrop and what did the community think of this?"
+)
+example_tools = [
+    ToolMetadata(
+        name="Discord",
+        description="Contains messages and summaries of conversations from the Discord platform of the community",
+    ),
+    ToolMetadata(
+        name="Discourse",
+        description="Contains messages and summaries of discussions from the Discourse platform of the community",
+    ),
+]
+example_tools_str = build_tools_text(example_tools)
+example_output = [
+    SubQuestion(
+        sub_question="Decision token allocation budget airdrop", tool_name="Discourse"
+    ),
+    SubQuestion(
+        sub_question="Opinion token allocation budget airdrop", tool_name="Discord"
+    ),
+]
+example_output_str = json.dumps({"items": [x.dict() for x in example_output]}, indent=4)
+
+EXAMPLES = f"""\
+# Example 1
+<Tools>
+```json
+{example_tools_str}
+```
+
+<User Question>
+{example_query_str}
+
+
+<Output>
+```json
+{example_output_str}
+```
+
+""".replace(
+    "{", "{{"
+).replace(
+    "}", "}}"
+)
+
+SUFFIX = """\
+# Example 2
+<Tools>
+```json
+{tools_str}
+```
+
+<User Question>
+{query_str}
+
+<Output>
+"""
+
+DEFAULT_SUB_QUESTION_PROMPT_TMPL = PREFIX + EXAMPLES + SUFFIX
+DEFAULT_GUIDANCE_SUB_QUESTION_PROMPT_TMPL = convert_to_handlebars(
+    DEFAULT_SUB_QUESTION_PROMPT_TMPL
+)
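
A quick smoke check for reviewers (a minimal sketch, not part of the patch; the tool list and query below are illustrative stand-ins). It renders the new plain-text template end to end and confirms the guidance variant carries handlebars placeholders after `convert_to_handlebars`:

```python
# Illustrative sketch, assuming the module layout introduced in this diff.
from llama_index.core.tools.types import ToolMetadata

from utils.query_engine.subquery_gen_prompt import (
    DEFAULT_GUIDANCE_SUB_QUESTION_PROMPT_TMPL,
    DEFAULT_SUB_QUESTION_PROMPT_TMPL,
    build_tools_text,
)

# Hypothetical tool list, shaped like the examples embedded in the template.
tools = [
    ToolMetadata(
        name="Discord",
        description="Messages from the community's Discord platform",
    ),
]

# The plain template keeps str.format-style fields ({tools_str}, {query_str});
# the brace-escaping applied to EXAMPLES protects its embedded JSON from .format().
prompt = DEFAULT_SUB_QUESTION_PROMPT_TMPL.format(
    tools_str=build_tools_text(tools),
    query_str="What did the community decide about the next airdrop?",
)
print(prompt)

# The guidance variant is the same template converted to handlebars syntax,
# so its variables appear as {{tools_str}} / {{query_str}}.
assert "{{tools_str}}" in DEFAULT_GUIDANCE_SUB_QUESTION_PROMPT_TMPL
assert "{{query_str}}" in DEFAULT_GUIDANCE_SUB_QUESTION_PROMPT_TMPL
```

The handlebars variant is the one that matters in production: it is what `subquery.py` now passes to `GuidanceQuestionGenerator.from_defaults` as `prompt_template_str`, steering sub-question generation toward vector-search-friendly queries.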