diff --git a/.gitignore b/.gitignore index 27b6457..8e61123 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +# Ignore docs +docs/_build/ # database files *.db diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..b668cb0 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,13 @@ +version: "2" + +build: + os: "ubuntu-22.04" + tools: + python: "3.10" + +python: + install: + - requirements: docs/requirements.txt + +sphinx: + configuration: docs/conf.py diff --git a/README.md b/README.md index d7413fa..0196d31 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,15 @@ [![PyPI](https://img.shields.io/pypi/v/buster-doctalk?logo=pypi)](https://pypi.org/project/buster-doctalk) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Hugging Face Spaces](https://img.shields.io/badge/🤗%20Hugging%20Face-Buster%20Demo-blue)](https://huggingface.co/spaces/jerpint/buster) +[![Documentation Status](https://readthedocs.org/projects/buster/badge/?version=latest)](https://buster.readthedocs.io/en/latest/?badge=latest) -Buster is a question-answering chatbot that can be tuned to any source of documentations. +Buster is a retrieval-augmented generation (RAG) module that can be tuned to any source of documentation. # Demo -In order to view the full abilities of Buster, you can play with our [live demo here](https://huggingface.co/spaces/jerpint/buster). +In order to demo Buster's abilities, you can play with our [live demo here](https://huggingface.co/spaces/jerpint/buster). We scraped the documentation of [huggingface 🤗 Transformers](https://huggingface.co/docs/transformers/index) and instructed Buster to answer questions related to its usage. # Quickstart @@ -45,7 +46,7 @@ export OPENAI_API_KEY=sk-... # Generating your own embeddings -Once your local version of Buster is up and running, the next step is for you to be able to import your own data. 
+Once your local version of Buster is properly installed, the next step is for you to be able to import your own data. We will be using the `stackoverflow.csv` file in the `buster/examples/` folder for this. This is the same data that was used to generate the demo app's embeddings. You will first ingest the documents to be ready for buster. In this example, we use Deeplake's vector store, but you can always write your own custom `DocumentManager`: diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..920b051 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,79 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- Project information ----------------------------------------------------- + +project = "buster 🤖" +author = "jerpint, hbertrand" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# enable use of markdown files +extensions.append("myst_parser") + +# use the readthedocs theme +extensions.append("sphinx_rtd_theme") +extensions.append("sphinx.ext.napoleon") +extensions.append("sphinxcontrib.katex") + +# autoapi extension for doc strings +extensions.append("autoapi.extension") +autoapi_type = "python" +autoapi_dirs = ["../buster/"] + + +# Skip docstrings for loggers and tests +def check_skip_member(app, what, name, obj, skip, options): + """Skips documentation when the function returns True.""" + SKIP_PATTERNS = ["test_", "logger"] + for pattern in SKIP_PATTERNS: + if pattern in name: + print("Skipping documentation for: ", name) + return True + return False + + +def setup(app): + """Handler to connect to the autoapi app.""" + app.connect("autoapi-skip-member", check_skip_member) + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..2f03f02 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,48 @@ +Buster 🤖 +============================================= + +About the project +----------------- +Buster is a library for Retrieval-Augmented Generation (RAG). +It leverages LLMs and embeddings to provide answers to questions grounded in references. + +Buster is open-source and hackable. +It includes many features out of the box, and is intended for deployment. + + +Demo +---- + +In order to demo Buster's abilities, you can play with our `live demo here <https://huggingface.co/spaces/jerpint/buster>`__. +We scraped the documentation of `huggingface 🤗 Transformers <https://huggingface.co/docs/transformers/index>`__ and instructed Buster to answer questions related to its usage. + + +.. toctree:: + :caption: Getting Started + :maxdepth: 2 + + usage/installation + + +.. toctree:: + :caption: Customization + :maxdepth: 1 + + usage/components + usage/configuration + usage/custom_docs + + +.. toctree:: + :caption: Buster Components + :maxdepth: 1 + + usage/components_overview + + +Useful links +============ + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..2119f51 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..498e45b --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +myst-parser +sphinx +sphinx-autoapi +sphinx-rtd-theme +sphinxcontrib-napoleon +sphinxcontrib-katex diff --git a/docs/usage/components.md b/docs/usage/components.md new file mode 100644 index 0000000..dfcc36b --- /dev/null +++ b/docs/usage/components.md @@ -0,0 +1,19 @@ +# Buster Components + +Buster is built around components that can be customized and extended. + +For example, to do chat completion, we must use a `Completer` component. +While we've implemented some completers like `ChatGPT`, adding more completers is possible by inheriting from the `Completer` base class. + +Currently, buster implements the following components: + +* `Completer`: The language model responsible for generating a response +* `Retriever`: Responsible for fetching the documents associated to a user's input +* `DocumentsFormatter`: Responsible for taking the various documents and formatting them in different ways. We support formatting documents into json-like objects and html-like objects. +* `PromptFormatter`: Responsible for combining the formatted documents with the prompts for the LLM +* `Validator`: Responsible for validating user inputs and/or model outputs. This can be implemented via checks of the questions and answer before and after completions occur. +* `Tokenizer`: Used to monitor the length of prompts and completions. It is generally assumed that the `Tokenizer` is associated to that of the `Completer`. 
+ + +Additional components are also available for managing documents: +* `DocumentManager`: Manager allowing to generate and store embeddings (should be used in conjunction with `Retriever` components) \ No newline at end of file diff --git a/docs/usage/components_overview.md b/docs/usage/components_overview.md new file mode 100644 index 0000000..b0f4d55 --- /dev/null +++ b/docs/usage/components_overview.md @@ -0,0 +1,3 @@ +# Overview + +Completers, Retrievers, etc. diff --git a/docs/usage/configuration.md b/docs/usage/configuration.md new file mode 100644 index 0000000..8537edf --- /dev/null +++ b/docs/usage/configuration.md @@ -0,0 +1,103 @@ +# Configuration of Components + +Buster's internal configuration is controlled via the `BusterConfig` object. +It is meant to set all of the different parameters for the different components in one place. + +Here is a typical setup: + +```python +from buster.busterbot import BusterConfig + +buster_cfg = BusterConfig( + retriever_cfg={ + "path": "deeplake_store", + "top_k": 3, + "thresh": 0.7, + "max_tokens": 2000, + "embedding_model": "text-embedding-ada-002", + }, + validator_cfg={ + "unknown_response_templates": [ + "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?", + ], + "unknown_threshold": 0.85, + "embedding_model": "text-embedding-ada-002", + "use_reranking": True, + "invalid_question_response": "This question does not seem relevant to my current knowledge.", + "check_question_prompt": """You are an chatbot answering questions on artificial intelligence. + +A user will submit a question. 
Respond 'true' if it is valid, respond 'false' if it is invalid.""", + "completion_kwargs": { + "model": "gpt-3.5-turbo", + "stream": False, + "temperature": 0, + }, + }, + documents_answerer_cfg={ + "no_documents_message": "No documents are available for this question.", + }, + completion_cfg={ + "completion_kwargs": { + "model": "gpt-3.5-turbo", + "stream": False, + "temperature": 0, + }, + }, + tokenizer_cfg={ + "model_name": "gpt-3.5-turbo", + }, + documents_formatter_cfg={ + "max_tokens": 3500, + "columns": ["content", "title", "source"], + }, + prompt_formatter_cfg={ + "max_tokens": 3500, + "text_before_docs": ( + "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." + "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. " + "If the answer is in the documentation, summarize it in a helpful way to the user. " + ), + "text_after_docs": ( + "REMEMBER:\n" + "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." + "Here are the rules you must follow:\n" + "1) You must only respond with information contained in the documentation above. 
Say you do not know if the information is not provided.\n" + "2) Make sure to format your answers in Markdown format, including code block and snippets.\n" + "Now answer the following question:\n" + ), + }, +) +``` + +This `BusterConfig` can then be passed to initialize Buster and all of its components: + +```python +from buster.busterbot import Buster, BusterConfig +from buster.completers import ChatGPTCompleter, DocumentAnswerer +from buster.formatters.documents import DocumentsFormatterJSON +from buster.formatters.prompts import PromptFormatter +from buster.retriever import DeepLakeRetriever, Retriever +from buster.tokenizers import GPTTokenizer +from buster.validators import QuestionAnswerValidator, Validator + +def setup_buster(buster_cfg: BusterConfig): + """initialize buster with a buster_cfg class""" + retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg) + tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg) + document_answerer: DocumentAnswerer = DocumentAnswerer( + completer=ChatGPTCompleter(**buster_cfg.completion_cfg), + documents_formatter=DocumentsFormatterJSON(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg), + prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg), + **buster_cfg.documents_answerer_cfg, + ) + validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg) + buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator) + return buster + +buster = setup_buster(buster_cfg) + +completion = buster.process_input("What is backpropagation?") +print(completion) +``` + +Buster uses a config file to set up most of the app. 
\ No newline at end of file diff --git a/docs/usage/custom_docs.md b/docs/usage/custom_docs.md new file mode 100644 index 0000000..52d2ec5 --- /dev/null +++ b/docs/usage/custom_docs.md @@ -0,0 +1,3 @@ +# Adding Documents + +To add your own documents, \ No newline at end of file diff --git a/docs/usage/guide.rst b/docs/usage/guide.rst new file mode 100644 index 0000000..0db3d3d --- /dev/null +++ b/docs/usage/guide.rst @@ -0,0 +1,16 @@ +User Guide +========== + +Quick Start +----------- + +To get started, you have to first begin! + +Everybody loves Schrodinger's equation, why not put it everywhere? + +.. math:: + i \hbar \frac{\partial}{\partial t}\Psi(\mathbf{r},t) = \hat H \Psi(\mathbf{r},t) + +You can also add math or even link directly in your docstrings! For an example, click at the docstrings here: + +:py:meth:`amlrt_project.models.optim.load_loss` diff --git a/docs/usage/installation.md b/docs/usage/installation.md new file mode 100644 index 0000000..d6f86dd --- /dev/null +++ b/docs/usage/installation.md @@ -0,0 +1,46 @@ +# Installation + +## Install with pip + +This section is meant to help you install and run a local version of Buster. +First step, install buster: + +**Note**: Buster requires python>=3.10 + +```bash +pip install buster-doctalk +``` + +We recommend using a virtual environment (e.g. via conda) for the installation process. + +## Testing your installation + +To easily test your setup, we've added an example app in the `buster/examples` directory with a few documents from stackoverflow AI questions. + + +**NOTE**: The demo uses ChatGPT to generate text and compute embeddings; make sure to set a valid OpenAI API key: +```bash +export OPENAI_API_KEY=sk-... +``` + +Simply go to the examples folder and launch the app: + +```bash +cd buster/buster/examples +gradio gradio_app.py +``` + +This will launch the gradio app locally. 
Navigate to your local gradio instance and you should see the chat interface: + +![image](https://github.com/jerpint/buster/assets/18450628/1604a3a9-0bee-4cd2-a6ca-70e88ddc0b81) + + + +## Install from source + +If you want to contribute to buster and improve the library, we recommend installing it locally in editable mode. To do that, clone the repository then install the library: + + git clone https://github.com/jerpint/buster + cd buster/ + pip install -e . +