276 rows × 2 columns
\n", + "273 rows × 2 columns
\n", "" ], "text/plain": [ " _question \\\n", - "0 Do I need to wait before beginning the green c... \n", - "1 Can I start the green card process if my emplo... \n", - "2 If my employer files an H-1B and it is selecte... \n", - "3 What is the process for upgrading from EB-3 to... \n", - "4 What is involved in the EB-3 to EB-2 porting p... \n", + "0 Can I begin the green card process if my H-1B ... \n", + "1 If I have a master's degree in engineering man... \n", + "2 If I have a master's degree in engineering man... \n", + "3 What is the eligibility and process for upgrad... \n", + "4 Can you explain the process for changing from ... \n", ".. ... \n", - "271 Does my wife qualify for EB2 processing as an ... \n", - "272 What is the process for my wife to apply for E... \n", - "273 Can my wife use her occupational therapist deg... \n", - "274 I have 3 years of experience in the IT field a... \n", - "275 I have 3 years of experience in IT and am abou... \n", + "268 What are the chances of my wife obtaining EB2 ... \n", + "269 Is it possible for my wife to get EB2 processi... \n", + "270 Do I have the qualifications to apply for an E... \n", + "271 If I have 3 years of IT experience and am work... \n", + "272 If I have 3 years of experience in IT and am f... \n", "\n", " _answer \n", - "0 No, the employer can initiate the green card p... \n", - "1 Yes, the employer can begin the green card pro... \n", - "2 The employer can initiate the green card proce... \n", - "3 It is possible to upgrade from EB-3 to EB-2 as... \n", - "4 If the PERM was submitted as an EB-2, you can ... \n", + "0 Yes, your employer can initiate the green card... \n", + "1 Yes, the green card process can be initiated b... \n", + "2 Yes, the employer can begin the green card pro... \n", + "3 It is possible to upgrade from EB-3 to EB-2 st... \n", + "4 If the PERM was originally filed as an EB-2, i... \n", ".. ... \n", - "271 OT's do not have a special category like physi... \n", - "272 OT's do not have a special pathway like physic... \n", - "273 OT's do not have a dedicated category like phy... \n", - "274 I understand you are inquiring if you can appl... \n", - "275 I'm assuming you are asking if you can submit ... \n", + "268 Occupational therapists do not have a special ... \n", + "269 Occupational therapists do not have a separate... \n", + "270 Yes, you should have the qualifications to app... \n", + "271 Yes, you should be able to apply for an EB-2 g... \n", + "272 Yes, you should qualify for an EB-2 green card... 
\n", "\n", - "[276 rows x 2 columns]" + "[273 rows x 2 columns]" ] }, - "execution_count": 129, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -4222,7 +4381,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 22, "id": "a81743fb-966a-4c40-8e32-3bd2f7f7ee4d", "metadata": {}, "outputs": [], @@ -4247,7 +4406,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/example/data_generation/immigration_gen_data2.ipynb b/example/data_generation/immigration_gen_data2.ipynb index 40e1d56..f8daaf6 100644 --- a/example/data_generation/immigration_gen_data2.ipynb +++ b/example/data_generation/immigration_gen_data2.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 1, "id": "730c285c-af52-4ba4-8cb2-1a9e468af547", "metadata": {}, "outputs": [ @@ -18,24 +18,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: openai in /opt/conda/envs/pykoi/lib/python3.10/site-packages (0.27.8)\n", - "Requirement already satisfied: requests>=2.20 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from openai) (2.31.0)\n", - "Requirement already satisfied: tqdm in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from openai) (4.65.0)\n", - "Requirement already satisfied: aiohttp in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from openai) (3.8.5)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests>=2.20->openai) (3.2.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests>=2.20->openai) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests>=2.20->openai) (2.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests>=2.20->openai) (2023.7.22)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (23.1.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (6.0.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (4.0.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (1.9.2)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (1.4.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (1.3.1)\n", - "Requirement already satisfied: clean-text in /opt/conda/envs/pykoi/lib/python3.10/site-packages (0.6.0)\n", - "Requirement already satisfied: emoji<2.0.0,>=1.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from clean-text) (1.7.0)\n", - "Requirement already satisfied: ftfy<7.0,>=6.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from clean-text) (6.1.1)\n", - "Requirement already satisfied: wcwidth>=0.2.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ftfy<7.0,>=6.0->clean-text) (0.2.6)\n" + "Requirement already satisfied: openai in 
/Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (1.6.0)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from openai) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from openai) (1.8.0)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from openai) (0.26.0)\n", + "Requirement already satisfied: pydantic<3,>=1.9.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from openai) (1.10.11)\n", + "Requirement already satisfied: sniffio in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from openai) (1.3.0)\n", + "Requirement already satisfied: tqdm>4 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from openai) (4.66.1)\n", + "Requirement already satisfied: typing-extensions<5,>=4.7 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from openai) (4.9.0)\n", + "Requirement already satisfied: idna>=2.8 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai) (3.6)\n", + "Requirement already satisfied: exceptiongroup in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai) (1.2.0)\n", + "Requirement already satisfied: certifi in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from httpx<1,>=0.23.0->openai) (2023.11.17)\n", + "Requirement already satisfied: httpcore==1.* in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from httpx<1,>=0.23.0->openai) (1.0.2)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)\n", + "Requirement already satisfied: clean-text in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (0.6.0)\n", + "Requirement already satisfied: emoji<2.0.0,>=1.0.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from clean-text) (1.7.0)\n", + "Requirement already satisfied: ftfy<7.0,>=6.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from clean-text) (6.1.1)\n", + "Requirement already satisfied: wcwidth>=0.2.5 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from ftfy<7.0,>=6.0->clean-text) (0.2.6)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n" ] } ], @@ -55,7 +61,6 @@ "import json\n", "import tqdm\n", "import copy\n", - "import openai\n", "import pandas as pd\n", "\n", "from typing import Optional, Sequence, Union\n", @@ -63,10 +68,7 @@ "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import SnowballStemmer\n", - "from nltk.tokenize import word_tokenize\n", - "\n", - "# from openai import openai_object\n", - "openai.api_key = \"\"" + "from nltk.tokenize import word_tokenize" ] }, { @@ -87,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 2, "id": "151b55a5-7ca4-404f-94ac-8c39a0bdaa12", "metadata": {}, "outputs": [], @@ -108,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 3, "id": "5d61054c-3a80-4bec-bc79-b6a83a411094", "metadata": {}, "outputs": [], @@ -150,12 +152,12 @@ 
}, { "cell_type": "code", - "execution_count": 41, + "execution_count": 4, "id": "32865211-51a8-42f5-94bc-63dc78bc6a0d", "metadata": {}, "outputs": [], "source": [ - "context = context.lower() # Lowercase \n", + "context = context.lower() # Lowercase\n", "context = context.strip() # Remove leading/trailing whitespace\n", "context = re.sub(r'[ \\t]+', ' ', context) # Remove extra space and tabs while MAINTAINING NEW LINE CHARACTERS\n", "context = re.compile('<.*?>').sub('', context) # Remove HTML tags/markups:\n", @@ -166,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 5, "id": "164e5080-5023-491a-ab53-82269aeaf7ab", "metadata": {}, "outputs": [ @@ -187,7 +189,7 @@ " 'whereas the international center will file both the labor certification and i-140 (unless the petition as a whole is assigned to retained counsel), the adjustment of status application and related applications (advance parole and ead) are filed by retained counsel only.']" ] }, - "execution_count": 42, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -206,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 6, "id": "92f7f2c7-6d2a-43f9-a748-de849019dc05", "metadata": {}, "outputs": [ @@ -214,161 +216,253 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: lmqg in /opt/conda/envs/pykoi/lib/python3.10/site-packages (0.1.1)\n", - "Requirement already satisfied: psutil in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (5.9.5)\n", - "Requirement already satisfied: pytextrank in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.2.5)\n", - "Requirement already satisfied: torch in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.0.1)\n", - "Requirement already satisfied: tqdm in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (4.65.0)\n", - "Requirement already satisfied: requests in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.31.0)\n", - "Requirement already satisfied: pandas in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.0.3)\n", - "Requirement already satisfied: numpy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (1.25.2)\n", - "Requirement already satisfied: transformers>=4.26.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (4.31.0)\n", - "Requirement already satisfied: huggingface-hub>=0.12.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.16.4)\n", - "Requirement already satisfied: sentencepiece in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.1.99)\n", - "Requirement already satisfied: datasets in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.13.1)\n", - "Requirement already satisfied: spacy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.7.0)\n", - "Requirement already satisfied: sudachipy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.6.7)\n", - "Requirement already satisfied: sudachidict-core in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (20230927)\n", - "Requirement already satisfied: bert-score in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.3.13)\n", - "Requirement already satisfied: pyemd in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (1.0.0)\n", - "Requirement already satisfied: evaluate in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.4.0)\n", - "Requirement 
already satisfied: wandb in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.15.11)\n",
    "Requirement already satisfied: lmqg in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (0.1.1)\n",
    "Requirement already satisfied: psutil in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (5.9.7)\n",
    "Requirement already satisfied: pytextrank in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.2.5)\n",
    "Collecting 
torch (from lmqg)\n", + " Obtaining dependency information for torch from https://files.pythonhosted.org/packages/e3/43/ea958505875b22961e1277587f66b79f9e1f9d97d7998850ed089ae0d0bd/torch-2.1.2-cp310-none-macosx_11_0_arm64.whl.metadata\n", + " Downloading torch-2.1.2-cp310-none-macosx_11_0_arm64.whl.metadata (25 kB)\n", + "Requirement already satisfied: tqdm in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (4.66.1)\n", + "Requirement already satisfied: requests in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.31.0)\n", + "Requirement already satisfied: pandas in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.0.3)\n", + "Requirement already satisfied: numpy in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (1.26.2)\n", + "Collecting transformers>=4.26.1 (from lmqg)\n", + " Obtaining dependency information for transformers>=4.26.1 from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.8/126.8 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: huggingface-hub>=0.12.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.17.3)\n", + "Collecting sentencepiece (from lmqg)\n", + " Using cached sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)\n", + "Collecting datasets (from lmqg)\n", + " Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/e2/cf/db41e572d7ed958e8679018f8190438ef700aeb501b62da9e1eed9e4d69a/datasets-2.15.0-py3-none-any.whl.metadata\n", + " Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)\n", + "Requirement already satisfied: spacy in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.7.0)\n", + "Requirement already satisfied: sudachipy in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.6.7)\n", + "Requirement already satisfied: sudachidict-core in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (20230927)\n", + "Requirement already satisfied: bert-score in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.3.13)\n", + "Requirement already satisfied: pyemd in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (1.0.0)\n", + "Collecting evaluate (from lmqg)\n", + " Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata\n", + " Using cached evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)\n", + "Requirement already satisfied: wandb in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.15.12)\n", + "Requirement already satisfied: ray in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.7.1)\n", + "Requirement already satisfied: nltk in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.8.1)\n", + "Collecting accelerate (from lmqg)\n", + " Obtaining dependency information for accelerate from 
https://files.pythonhosted.org/packages/f7/fc/c55e5a2da345c9a24aa2e1e0f60eb2ca290b6a41be82da03a6d4baec4f99/accelerate-0.25.0-py3-none-any.whl.metadata\n",
    " Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)\n",
    "Installing collected packages: sentencepiece, xxhash, scipy, safetensors, pyarrow-hotfix, pyarrow, pillow, networkx, MarkupSafe, dill, responses, multiprocess, jinja2, huggingface-hub, torch, tokenizers, transformers, datasets, accelerate, evaluate\n",
    "Successfully installed MarkupSafe-2.1.3 accelerate-0.25.0 datasets-2.15.0 dill-0.3.7 evaluate-0.4.1 huggingface-hub-0.20.1 jinja2-3.1.2 multiprocess-0.70.15 networkx-3.2.1 pillow-10.1.0 pyarrow-14.0.2 pyarrow-hotfix-0.6 responses-0.18.0 safetensors-0.4.1 scipy-1.11.4 sentencepiece-0.1.99 tokenizers-0.15.0 torch-2.1.2 transformers-4.36.2 xxhash-3.4.1\n"
   ]
  },
  {
   "name": "stderr",
   "output_type": "stream",
   "text": [
    "/Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
    " from .autonotebook import tqdm as notebook_tqdm\n"
   ]
  },
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Collecting en-core-web-sm==3.7.1\n",
    " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)\n",
    "Requirement already satisfied: typer<0.10.0,>=0.3.0 in 
/Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.0)\n", + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (6.4.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.1)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.10.11)\n", + "Requirement already satisfied: jinja2 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.2)\n", + "Requirement already satisfied: setuptools in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (69.0.2)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (23.1)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)\n", + "Requirement already satisfied: numpy>=1.19.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.2)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.9.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2023.11.17)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from thinc<8.3.0,>=8.1.8->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from thinc<8.3.0,>=8.1.8->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.3)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n", + "Requirement already satisfied: 
cloudpathlib<0.17.0,>=0.7.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.3)\n", + "Using cached spacy-3.7.2-cp310-cp310-macosx_11_0_arm64.whl (6.6 MB)\n", + "Installing collected packages: spacy, en-core-web-sm\n", + " Attempting uninstall: spacy\n", + " Found existing installation: spacy 3.7.0\n", + " Uninstalling spacy-3.7.0:\n", + " Successfully uninstalled spacy-3.7.0\n", + "Successfully installed en-core-web-sm-3.7.1 spacy-3.7.2\n", "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", "You can now load the package via spacy.load('en_core_web_sm')\n" ] @@ -377,12 +471,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "/opt/conda/envs/pykoi/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1714: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", + "/Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:690: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n", + " warnings.warn(\n", + "/Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py:1067: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n", " warnings.warn(\n", - "/opt/conda/envs/pykoi/lib/python3.10/site-packages/transformers/modeling_utils.py:2193: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", + "/Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages/transformers/modeling_utils.py:2759: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n", " warnings.warn(\n", - "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 1683.41it/s]\n", - "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 1874.40it/s]\n" + "/Users/joseortiz/anaconda3/envs/pykoi/lib/python3.10/site-packages/spacy/util.py:910: UserWarning: [W095] Model 'en_core_web_sm' (3.7.1) was trained with spaCy v3.7.2 and may not be 100% compatible with the current version (3.7.0). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n", + " warnings.warn(warn_msg)\n", + "100%|██████████| 31/31 [00:00<00:00, 1333.40it/s]\n", + "100%|██████████| 31/31 [00:00<00:00, 661.53it/s]\n" ] }, { @@ -423,8 +521,7 @@ " 'labor certification?',\n", " 'immigrant petition'),\n", " ('When is the filing of the i-140 the first step of the green card process?',\n", - " 'in cases where no labor certification is required (e.g. 
eb-1), the filing '\n", - " 'of the i-140 is the first step of the green card process.')],\n", + " 'in cases where no labor certification is required')],\n", " [('What can a foreign national apply for once the i-140 application has been '\n", " 'approved by uscis?',\n", " 'adjustment of status or obtaining an immigrant visa'),\n", @@ -482,11 +579,11 @@ "from pprint import pprint\n", "from lmqg import TransformersQG\n", "\n", - "# Download the en_core_web_sm model explicitly \n", + "# Download the en_core_web_sm model explicitly\n", "! python -m spacy download en_core_web_sm # spacy is a counterpart of nltk\n", "\n", "# initialize model\n", - "model = TransformersQG(model='lmqg/t5-base-squad-qg-ae', max_length=1024) # max length of a paragraph \n", + "model = TransformersQG(model='lmqg/t5-base-squad-qg-ae', max_length=1024) # max length of a paragraph\n", "# paragraph to generate pairs of question and answer\n", "\n", "context = context\n", @@ -504,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 7, "id": "19e1e276-3def-4980-96fd-91a7ef9dbd4f", "metadata": {}, "outputs": [ @@ -522,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 8, "id": "b1bf02e9-281f-4cbf-bd50-6dfbc06bd2cf", "metadata": {}, "outputs": [ @@ -532,7 +629,7 @@ "31" ] }, - "execution_count": 45, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -552,7 +649,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 9, "id": "9393d02a-baa8-41df-a8ec-5523e0cfc371", "metadata": {}, "outputs": [], @@ -579,7 +676,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/example/mlu/demo_comparator.py b/example/mlu/demo_comparator.py index 228a082..756f425 100644 --- a/example/mlu/demo_comparator.py +++ b/example/mlu/demo_comparator.py @@ -6,7 +6,6 @@ from pykoi.chat.llm.huggingface import HuggingfaceModel from pykoi.component import Compare - ###################################################################################### # Creating a Huggingface model tiiuae/falcon-rw-1b (EC2 g4.2xlarge with 100GB space) # ###################################################################################### @@ -73,9 +72,7 @@ tokenizers = [hf_tokenizer_1, hf_tokenizer_2, hf_tokenizer_3] models_list = [ - HuggingfaceModel.create( - model=model, tokenizer=tokenizer, name=name, max_length=100 - ) + HuggingfaceModel.create(model=model, tokenizer=tokenizer, name=name, max_length=100) for model, tokenizer, name in zip(models, tokenizers, model_name) ] diff --git a/example/retrieval_qa/retrieval_qa_huggingface_demo.py b/example/retrieval_qa/retrieval_qa_huggingface_demo.py index 24771f1..fb42a1c 100644 --- a/example/retrieval_qa/retrieval_qa_huggingface_demo.py +++ b/example/retrieval_qa/retrieval_qa_huggingface_demo.py @@ -3,27 +3,29 @@ python -m example.retrieval_qa.retrieval_qa_huggingface_demo """ -import os import argparse +import os + +from dotenv import load_dotenv + from pykoi import Application from pykoi.chat import RAGDatabase -from pykoi.retrieval import RetrievalFactory -from pykoi.retrieval import VectorDbFactory from pykoi.component import Chatbot, Dashboard, RetrievalQA -from dotenv import load_dotenv +from pykoi.retrieval import RetrievalFactory, VectorDbFactory # NOTE: Configure your retrieval model as RETRIEVAL_MODEL in .env file. 
# Load environment variables from .env file
 load_dotenv()
 
 ## Set the RETRIEVAL_MODEL, pykoi supports most of the open-source LLMs, e.g. 
-    # "HuggingFaceH4/zephyr-7b-beta"
-    # "meta-llama/Llama-2-7b-chat-hf"
-    # "mistralai/Mistral-7B-v0.1"
-    # "databricks/dolly-v2-3b"
+# "HuggingFaceH4/zephyr-7b-beta"
+# "meta-llama/Llama-2-7b-chat-hf"
+# "mistralai/Mistral-7B-v0.1"
+# "databricks/dolly-v2-3b"
 RETRIEVAL_MODEL = os.getenv("RETRIEVAL_MODEL", default="mistralai/Mistral-7B-v0.1")
 
+
 def main(**kwargs):
     os.environ["DOC_PATH"] = os.path.join(os.getcwd(), "temp/docs")
     os.environ["VECTORDB_PATH"] = os.path.join(os.getcwd(), "temp/vectordb")
@@ -48,11 +50,13 @@ def main(**kwargs):
         vector_db=vector_db,
         model_name=RETRIEVAL_MODEL,
         trust_remote_code=True,
-        max_length=1000
+        max_length=1000,
     )
 
     # retrieval, chatbot, and dashboard pykoi components
-    retriever = RetrievalQA(retrieval_model=retrieval_model, vector_db=vector_db, feedback="rag")
+    retriever = RetrievalQA(
+        retrieval_model=retrieval_model, vector_db=vector_db, feedback="rag"
+    )
     chatbot = Chatbot(None, feedback="rag", is_retrieval=True)
     # dashboard = Dashboard(RAGDatabase(), feedback="rag")
 
diff --git a/example/retrieval_qa/retrieval_qa_openai_demo.py b/example/retrieval_qa/retrieval_qa_openai_demo.py
index e02fcf7..1cc96fe 100644
--- a/example/retrieval_qa/retrieval_qa_openai_demo.py
+++ b/example/retrieval_qa/retrieval_qa_openai_demo.py
@@ -1,14 +1,30 @@
-"""Demo for the retrieval_qa application."""
+"""
+Demo for launching a retrieval_qa chatbot UI (with database) from an OpenAI model.
+
+- Prerequisites:
+    To run this demo, you need a `pykoi` environment with the `rag` option.
+    You can follow [the installation guide](https://github.com/CambioML/pykoi/tree/install#option-1-rag-cpu)
+    to set up the environment.
+- Run the demo:
+    1. Enter your OpenAI API key in a .env file in the `~/pykoi` directory under the name OPENAI_API_KEY, e.g.
+    ```
+    OPENAI_API_KEY=your_api_key
+    ```
+    2. On a terminal, from the `~/pykoi` directory, run
+    ```
+    python -m example.retrieval_qa.retrieval_qa_openai_demo
+    ```
+"""
 
-import os
 import argparse
+import os
+
 from dotenv import load_dotenv
+
 from pykoi import Application
-from pykoi.retrieval import RetrievalFactory
-from pykoi.retrieval import VectorDbFactory
-from pykoi.component import Chatbot, Dashboard, RetrievalQA
 from pykoi.chat import RAGDatabase
-
+from pykoi.component import Chatbot, Dashboard, RetrievalQA
+from pykoi.retrieval import RetrievalFactory, VectorDbFactory
 
 load_dotenv()
 
@@ -22,9 +38,11 @@ def main(**kargs):
     # Creating a retrieval QA component  #
     #####################################
     # vector database
+    print("1. Creating a vector database...")
     vector_db = VectorDbFactory.create(
         model_source=MODEL_SOURCE, vector_db_name=kargs.get("vectordb"), **kargs
     )
+    print("2. 
Vector database created.") # retrieval model with vector database retrieval_model = RetrievalFactory.create( @@ -32,7 +50,9 @@ def main(**kargs): ) # retrieval, chatbot, and dashboard pykoi components - retriever = RetrievalQA(retrieval_model=retrieval_model, vector_db=vector_db, feedback="rag") + retriever = RetrievalQA( + retrieval_model=retrieval_model, vector_db=vector_db, feedback="rag" + ) chatbot = Chatbot(None, feedback="rag", is_retrieval=True) dashboard = Dashboard(RAGDatabase(), feedback="rag") diff --git a/example/rlhf/demo_rl.py b/example/rlhf/demo_rl.py index 3e80f18..aa07f32 100644 --- a/example/rlhf/demo_rl.py +++ b/example/rlhf/demo_rl.py @@ -9,17 +9,15 @@ """ # accelerate launch --num_machines 1 --num_processes 1 --mixed_precision fp16 example/rlhf/demo_rl.py -from pykoi.rlhf import RLHFConfig -from pykoi.rlhf import RLFinetuning - +from pykoi.rlhf import RLFinetuning, RLHFConfig # use huggingface sft and reward model config = RLHFConfig( - base_model_path="models/rlhf_step1_sft", #"elinas/llama-7b-hf-transformers-4.29", - dataset_type="huggingface", + base_model_path="models/rlhf_step1_sft", # "elinas/llama-7b-hf-transformers-4.29", + dataset_type="huggingface", dataset_name="cambioml/stack_exchange_rank_10k_dataset", dataset_subset_rl="data", - reward_model_path="models/rlhf_step2_rw/", #"cambioml/rlhf_reward_model", + reward_model_path="models/rlhf_step2_rw/", # "cambioml/rlhf_reward_model", save_freq=1, ppo_batch_size=32, ppo_epochs=4, diff --git a/example/rlhf/demo_rw_finetuning.py b/example/rlhf/demo_rw_finetuning.py index fbec942..aeddd8c 100644 --- a/example/rlhf/demo_rw_finetuning.py +++ b/example/rlhf/demo_rw_finetuning.py @@ -3,23 +3,24 @@ python -m example.rlhf.demo_rw_finetuning """ -from pykoi.rlhf import RLHFConfig -from pykoi.rlhf import RewardFinetuning from pykoi.chat import RankingDatabase -from pykoi.chat.db.constants import ( - RANKING_CSV_HEADER_ID, - RANKING_CSV_HEADER_QUESTION, - RANKING_CSV_HEADER_UP_RANKING_ANSWER, - RANKING_CSV_HEADER_LOW_RANKING_ANSWER) +from pykoi.chat.db.constants import (RANKING_CSV_HEADER_ID, + RANKING_CSV_HEADER_LOW_RANKING_ANSWER, + RANKING_CSV_HEADER_QUESTION, + RANKING_CSV_HEADER_UP_RANKING_ANSWER) +from pykoi.rlhf import RewardFinetuning, RLHFConfig # get data from local database ranking_database = RankingDatabase() my_data_pd = ranking_database.retrieve_all_question_answers_as_pandas() -my_data_pd = my_data_pd[[ - RANKING_CSV_HEADER_ID, - RANKING_CSV_HEADER_QUESTION, - RANKING_CSV_HEADER_UP_RANKING_ANSWER, - RANKING_CSV_HEADER_LOW_RANKING_ANSWER]] +my_data_pd = my_data_pd[ + [ + RANKING_CSV_HEADER_ID, + RANKING_CSV_HEADER_QUESTION, + RANKING_CSV_HEADER_UP_RANKING_ANSWER, + RANKING_CSV_HEADER_LOW_RANKING_ANSWER, + ] +] # analyze the data print(my_data_pd) diff --git a/example/rlhf/demo_supervised_finetuning_nike.py b/example/rlhf/demo_supervised_finetuning_nike.py index 684d1e4..484886b 100644 --- a/example/rlhf/demo_supervised_finetuning_nike.py +++ b/example/rlhf/demo_supervised_finetuning_nike.py @@ -3,10 +3,9 @@ python -m example.rlhf.demo_supervised_finetuning_nike """ -from pykoi.rlhf import RLHFConfig -from pykoi.rlhf import SupervisedFinetuning from peft import LoraConfig, TaskType +from pykoi.rlhf import RLHFConfig, SupervisedFinetuning base_model_path = "meta-llama/Llama-2-7b-chat-hf" dataset_name = "./output_self_instructed_data_nike_10k_2023_FULL.csv" @@ -22,7 +21,7 @@ save_freq = 200 train_test_split_ratio = 0.0001 dataset_subset_sft_train = 999999999 -size_valid_set = 0 +size_valid_set = 0 r = 8 
lora_alpha = 16 @@ -36,13 +35,13 @@ lora_dropout=lora_dropout, bias=bias, task_type=task_type, - ) +) # run supervised finetuning config = RLHFConfig( - base_model_path=base_model_path, - dataset_type=dataset_type, + base_model_path=base_model_path, + dataset_type=dataset_type, dataset_name=dataset_name, learning_rate=learning_rate, weight_decay=weight_decay, @@ -55,7 +54,7 @@ train_test_split_ratio=train_test_split_ratio, dataset_subset_sft_train=dataset_subset_sft_train, size_valid_set=size_valid_set, - lora_config_rl=lora_config - ) + lora_config_rl=lora_config, +) rlhf_step1_sft = SupervisedFinetuning(config) rlhf_step1_sft.train_and_save(peft_model_path) diff --git a/example/rlhf/supervised_finetuning_demo.py b/example/rlhf/supervised_finetuning_demo.py index aba71e5..e1d10f4 100644 --- a/example/rlhf/supervised_finetuning_demo.py +++ b/example/rlhf/supervised_finetuning_demo.py @@ -4,23 +4,22 @@ """ from pykoi.chat import QuestionAnswerDatabase -from pykoi.rlhf import RLHFConfig -from pykoi.rlhf import SupervisedFinetuning - -from pykoi.chat.db.constants import ( - QA_CSV_HEADER_ID, - QA_CSV_HEADER_QUESTION, - QA_CSV_HEADER_ANSWER, - QA_CSV_HEADER_VOTE_STATUS) +from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID, + QA_CSV_HEADER_QUESTION, + QA_CSV_HEADER_VOTE_STATUS) +from pykoi.rlhf import RLHFConfig, SupervisedFinetuning # get data from local database qa_database = QuestionAnswerDatabase() my_data_pd = qa_database.retrieve_all_question_answers_as_pandas() -my_data_pd = my_data_pd[[ - QA_CSV_HEADER_ID, - QA_CSV_HEADER_QUESTION, - QA_CSV_HEADER_ANSWER, - QA_CSV_HEADER_VOTE_STATUS]] +my_data_pd = my_data_pd[ + [ + QA_CSV_HEADER_ID, + QA_CSV_HEADER_QUESTION, + QA_CSV_HEADER_ANSWER, + QA_CSV_HEADER_VOTE_STATUS, + ] +] # analyze the data print(my_data_pd) diff --git a/example/uniflow/uniflow_sft_demo.py b/example/uniflow/uniflow_sft_demo.py deleted file mode 100644 index b329dea..0000000 --- a/example/uniflow/uniflow_sft_demo.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Demo for using uniflow to generate data for supervised fine tuning. 
- -python -m example.uniflow.uniflow_sft_demo -""" -import os -import pandas as pd - -from uniflow.flow.flow import Flow -from pykoi.rlhf import RLHFConfig -from pykoi.rlhf import SupervisedFinetuning -from pykoi.chat.db.constants import ( - QA_CSV_HEADER_ID, - QA_CSV_HEADER_QUESTION, - QA_CSV_HEADER_ANSWER, - QA_CSV_HEADER_VOTE_STATUS) - -CSV_FILENAME = "qd_immigration" -CSV_OUTPUT_SUFFIX = "-flow-output" - -# Load data -current_directory = os.getcwd() -qaa = pd.read_csv(f"{current_directory}/{CSV_FILENAME}.csv", encoding="utf8") - -# run flow -flow = Flow() -output_dict = flow(qaa) - -# save new data to csv -df = pd.DataFrame(output_dict["output"][0], columns=[ - QA_CSV_HEADER_ID, - QA_CSV_HEADER_QUESTION, - QA_CSV_HEADER_ANSWER, - QA_CSV_HEADER_VOTE_STATUS]) -df.to_csv(f"{current_directory}/{CSV_FILENAME}{CSV_OUTPUT_SUFFIX}.csv", index=False) - -# analyze the data -print("Flow save successful!") -print(df) -print(f"The output csv file {CSV_FILENAME}{CSV_OUTPUT_SUFFIX}.csv has {df.shape[0]} rows in total") - -# run supervised finetuning -config = RLHFConfig(base_model_path="databricks/dolly-v2-3b", dataset_type="local_csv", dataset_name=f"{CSV_FILENAME}{CSV_OUTPUT_SUFFIX}.csv") -rlhf_step1_sft = SupervisedFinetuning(config) -rlhf_step1_sft.train_and_save("./models/rlhf_step1_sft") diff --git a/pykoi/application.py b/pykoi/application.py index 8da4b03..36b661f 100644 --- a/pykoi/application.py +++ b/pykoi/application.py @@ -4,20 +4,20 @@ import re import subprocess import time - from datetime import datetime -from typing import List, Optional, Any, Dict, Union -from fastapi import FastAPI, Depends, HTTPException, UploadFile, status -from fastapi.security import HTTPBasic, HTTPBasicCredentials -from passlib.context import CryptContext +from typing import Any, Dict, List, Optional, Union + +from fastapi import Depends, FastAPI, HTTPException, UploadFile, status from fastapi.responses import JSONResponse +from fastapi.security import HTTPBasic, HTTPBasicCredentials from fastapi.staticfiles import StaticFiles +from passlib.context import CryptContext from pydantic import BaseModel from starlette.middleware.cors import CORSMiddleware -from pykoi.telemetry.telemetry import Telemetry -from pykoi.telemetry.events import AppStartEvent, AppStopEvent -from pykoi.chat.db.constants import RAG_LIST_SEPARATOR +from pykoi.chat.db.constants import RAG_LIST_SEPARATOR +from pykoi.telemetry.events import AppStartEvent, AppStopEvent +from pykoi.telemetry.telemetry import Telemetry oauth_scheme = HTTPBasic() @@ -644,10 +644,14 @@ async def inference( try: print("[/retrieval]: model inference.....", request_body.prompt) component["component"].retrieval_model.re_init(request_body.file_names) - output = component["component"].retrieval_model.run_with_return_source_documents({"query": request_body.prompt}) - print('output', output, output["result"]) + output = component[ + "component" + ].retrieval_model.run_with_return_source_documents( + {"query": request_body.prompt} + ) + print("output", output, output["result"]) if "source_documents" not in output: - print('no source documents', output) + print("no source documents", output) source = ["N/A"] source_content = ["N/A"] elif output["source_documents"] == []: @@ -791,9 +795,16 @@ async def check_file_exists( try: file_path = f"{os.getcwd()}/{file_name}" file_exists = os.path.exists(file_path) - return {"log": f"Check if {file_name} exists succeeded.", "file_exists": file_exists, "status": "200"} + return { + "log": f"Check if {file_name} exists succeeded.", + 
"file_exists": file_exists, + "status": "200", + } except Exception as ex: - return {"log": f"Check if {file_name} exists failed: {ex}", "status": "500"} + return { + "log": f"Check if {file_name} exists failed: {ex}", + "status": "500", + } def create_data_route(id: str, data_source: Any): """ diff --git a/pykoi/chat/__init__.py b/pykoi/chat/__init__.py index efbeb29..d737e4b 100644 --- a/pykoi/chat/__init__.py +++ b/pykoi/chat/__init__.py @@ -1,6 +1,5 @@ import pykoi.chat.llm as llm - -from pykoi.chat.llm.model_factory import ModelFactory from pykoi.chat.db.qa_database import QuestionAnswerDatabase +from pykoi.chat.db.rag_database import RAGDatabase from pykoi.chat.db.ranking_database import RankingDatabase -from pykoi.chat.db.rag_database import RAGDatabase \ No newline at end of file +from pykoi.chat.llm.model_factory import ModelFactory diff --git a/pykoi/chat/db/abs_database.py b/pykoi/chat/db/abs_database.py index 77f2d6c..24d1ed5 100644 --- a/pykoi/chat/db/abs_database.py +++ b/pykoi/chat/db/abs_database.py @@ -2,7 +2,6 @@ import abc import sqlite3 import threading - from typing import List, Tuple @@ -71,9 +70,7 @@ def insert(self, **kwargs) -> None: Args: kwargs (dict): The key-value pairs to insert into the database. """ - raise NotImplementedError( - "Insert method must be implemented by subclasses." - ) + raise NotImplementedError("Insert method must be implemented by subclasses.") @abc.abstractmethod def update(self, **kwargs) -> None: @@ -83,17 +80,13 @@ def update(self, **kwargs) -> None: Args: kwargs (dict): The key-value pairs to update in the database. """ - raise NotImplementedError( - "Update method must be implemented by subclasses." - ) + raise NotImplementedError("Update method must be implemented by subclasses.") def retrieve_all(self) -> List[Tuple]: """ Retrieves all pairs from the database. """ - raise NotImplementedError( - "Retrieve method must be implemented by subclasses." - ) + raise NotImplementedError("Retrieve method must be implemented by subclasses.") @abc.abstractmethod def print_table(self, rows: str) -> None: @@ -103,6 +96,4 @@ def print_table(self, rows: str) -> None: Args: rows (str): The rows to print. """ - raise NotImplementedError( - "Print method must be implemented by subclasses." - ) + raise NotImplementedError("Print method must be implemented by subclasses.") diff --git a/pykoi/chat/db/comparator_database.py b/pykoi/chat/db/comparator_database.py index 0eb9578..761b8a2 100644 --- a/pykoi/chat/db/comparator_database.py +++ b/pykoi/chat/db/comparator_database.py @@ -2,12 +2,10 @@ import csv import datetime import os - from typing import List, Tuple import pandas as pd - from pykoi.chat.db.abs_database import AbsDatabase from pykoi.chat.db.constants import COMPARATOR_CSV_HEADER @@ -238,7 +236,6 @@ def print_table(self, rows: List[Tuple]) -> None: f"Timestamp: {row[5]}" ) - def save_to_csv(self, csv_file_name="comparator_table"): """ This method saves the contents of the RAG table into a CSV file. 
@@ -292,4 +289,3 @@ def retrieve_all_question_answers_as_pandas(self) -> pd.DataFrame: columns=["ID", "Model", "QID", "Question", "Rank", "Answer", "Timestamp"], ) return df - diff --git a/pykoi/chat/db/rag_database.py b/pykoi/chat/db/rag_database.py index 13dc966..309cd18 100644 --- a/pykoi/chat/db/rag_database.py +++ b/pykoi/chat/db/rag_database.py @@ -72,7 +72,14 @@ def create_table(self): print("Table contents after creating table:") self.print_table(rows) - def insert_question_answer(self, question: str, answer: str, rag_sources: list, source: list, source_content: list): + def insert_question_answer( + self, + question: str, + answer: str, + rag_sources: list, + source: list, + source_content: list, + ): """ Inserts a new question-answer pair into the database with the given question and answer. The vote_status field is set to 'n/a' by default. @@ -100,7 +107,10 @@ def insert_question_answer(self, question: str, answer: str, rag_sources: list, with self._lock: cursor = self.get_cursor() - cursor.execute(query, (question, answer, rag_sources, source, source_content, timestamp)) + cursor.execute( + query, + (question, answer, rag_sources, source, source_content, timestamp), + ) self.get_connection().commit() if self._debug: @@ -155,7 +165,7 @@ def update_answer(self, id, new_answer): SET edited_answer = ? WHERE id = ?; """ - print('update_answer',new_answer) + print("update_answer", new_answer) with self._lock: cursor = self.get_cursor() cursor.execute(query, (new_answer, id)) diff --git a/pykoi/chat/db/ranking_database.py b/pykoi/chat/db/ranking_database.py index c4bf75e..4facee2 100644 --- a/pykoi/chat/db/ranking_database.py +++ b/pykoi/chat/db/ranking_database.py @@ -86,9 +86,7 @@ def insert_ranking( """ with self._lock: cursor = self.get_cursor() - cursor.execute( - query, (question, up_ranking_answer, low_ranking_answer) - ) + cursor.execute(query, (question, up_ranking_answer, low_ranking_answer)) self.get_connection().commit() if self._debug: diff --git a/pykoi/chat/llm/abs_llm.py b/pykoi/chat/llm/abs_llm.py index afb5333..db3fa53 100644 --- a/pykoi/chat/llm/abs_llm.py +++ b/pykoi/chat/llm/abs_llm.py @@ -25,9 +25,7 @@ def predict(self, message: str, num_of_response: int): Raises: NotImplementedError: This method must be implemented by subclasses. """ - raise NotImplementedError( - "This method must be implemented by subclasses." - ) + raise NotImplementedError("This method must be implemented by subclasses.") @property def name(self): @@ -38,6 +36,4 @@ def name(self): Raises: NotImplementedError: This method must be implemented by subclasses. """ - raise NotImplementedError( - "This method must be implemented by subclasses." 
- ) + raise NotImplementedError("This method must be implemented by subclasses.") diff --git a/pykoi/chat/llm/huggingface.py b/pykoi/chat/llm/huggingface.py index 71b7874..90237a5 100644 --- a/pykoi/chat/llm/huggingface.py +++ b/pykoi/chat/llm/huggingface.py @@ -78,9 +78,8 @@ def predict(self, message: str, num_of_response: int = 1): """ # TODO: need to refractor and include all the derivatives of dolly family if "dolly" in self._pretrained_model_name_or_path: - from pykoi.chat.llm.instruct_pipeline import ( - InstructionTextGenerationPipeline, - ) + from pykoi.chat.llm.instruct_pipeline import \ + InstructionTextGenerationPipeline generate_text = InstructionTextGenerationPipeline( model=self._model, tokenizer=self._tokenizer diff --git a/pykoi/chat/llm/instruct_pipeline.py b/pykoi/chat/llm/instruct_pipeline.py index 677b943..e83f399 100644 --- a/pykoi/chat/llm/instruct_pipeline.py +++ b/pykoi/chat/llm/instruct_pipeline.py @@ -6,7 +6,6 @@ import numpy as np from transformers import Pipeline, PreTrainedTokenizer - from transformers.utils import is_tf_available if is_tf_available(): @@ -91,9 +90,7 @@ def __init__( **kwargs, ) - def _sanitize_parameters( - self, return_full_text: bool = None, **generate_kwargs - ): + def _sanitize_parameters(self, return_full_text: bool = None, **generate_kwargs): preprocess_params = {} # newer versions of the tokenizer configure the response key as a special token. newer versions still may @@ -133,9 +130,7 @@ def _sanitize_parameters( return preprocess_params, forward_params, postprocess_params def preprocess(self, instruction_text, **generate_kwargs): - prompt_text = PROMPT_FOR_GENERATION_FORMAT.format( - instruction=instruction_text - ) + prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text) inputs = self.tokenizer( prompt_text, return_tensors="pt", @@ -192,9 +187,7 @@ def postprocess( generated_sequence = model_outputs["generated_sequence"][0] instruction_text = model_outputs["instruction_text"] - generated_sequence: List[ - List[int] - ] = generated_sequence.numpy().tolist() + generated_sequence: List[List[int]] = generated_sequence.numpy().tolist() records = [] for sequence in generated_sequence: # The response will be set to this variable if we can identify it. @@ -251,9 +244,7 @@ def postprocess( if m: decoded = m.group(1).strip() else: - logger.warn( - f"Failed to find response in:\n{fully_decoded}" - ) + logger.warn(f"Failed to find response in:\n{fully_decoded}") # If the full text is requested, then append the decoded text to the original instruction. 
# This technically isn't the full text, as we format the instruction in the prompt the model has been diff --git a/pykoi/chat/llm/mlu.py b/pykoi/chat/llm/mlu.py index 5c73530..a513ff2 100644 --- a/pykoi/chat/llm/mlu.py +++ b/pykoi/chat/llm/mlu.py @@ -1,8 +1,7 @@ """MLU HF model.""" from transformers import GenerationConfig -from pykoi.chat.llm.abs_llm import AbsLlm -from transformers import GenerationConfig +from pykoi.chat.llm.abs_llm import AbsLlm class MLUWrapper(AbsLlm): diff --git a/pykoi/chat/llm/model_factory.py b/pykoi/chat/llm/model_factory.py index b167dee..47d8385 100644 --- a/pykoi/chat/llm/model_factory.py +++ b/pykoi/chat/llm/model_factory.py @@ -45,7 +45,8 @@ def create_model(model_source: Union[str, ModelSource], **kwargs) -> AbsLlm: return HuggingfaceModel(**kwargs) elif model_source == ModelSource.PEFT_HUGGINGFACE: - from pykoi.chat.llm.peft_huggingface import PeftHuggingfacemodel + from pykoi.chat.llm.peft_huggingface import \ + PeftHuggingfacemodel return PeftHuggingfacemodel(**kwargs) elif model_source == ModelSource.MLU: diff --git a/pykoi/chat/llm/openai.py b/pykoi/chat/llm/openai.py index 6ea1ee5..b8d4a3d 100644 --- a/pykoi/chat/llm/openai.py +++ b/pykoi/chat/llm/openai.py @@ -1,5 +1,5 @@ """This module provides a wrapper for the OpenAI model.""" -import openai +from openai import OpenAI from pykoi.chat.llm.abs_llm import AbsLlm @@ -9,12 +9,12 @@ class OpenAIModel(AbsLlm): A class that wraps the OpenAI model for use in the LLMChain. Attributes: - _engine (str): The engine to use for the OpenAI model. + _model (str): The model to use for the OpenAI model. _max_tokens (int): The maximum number of tokens to generate. _temperature (float): The temperature to use for the OpenAI model. Methods: - __init__(self, api_key: str, engine: str, max_tokens: int, temperature: float): Initializes the OpenAI model. + __init__(self, model: str, max_tokens: int, temperature: float): Initializes the OpenAI model. predict(self, message: str): Predicts the next word based on the given message. """ @@ -22,9 +22,8 @@ class OpenAIModel(AbsLlm): def __init__( self, - api_key: str, name: str = None, - engine: str = "davinci", + model: str = "davinci", max_tokens: int = 100, temperature: float = 0.5, ): @@ -32,17 +31,16 @@ def __init__( Initializes the OpenAI model with the given parameters. Args: - api_key (str): The API key for the OpenAI model. name (str): The name of the model. Defaults to None. - engine (str, optional): The engine to use for the OpenAI model. Defaults to "davinci". + model (str, optional): The model to use for the OpenAI model. Defaults to "davinci". max_tokens (int, optional): The maximum number of tokens to generate. Defaults to 100. temperature (float, optional): The temperature to use for the OpenAI model. Defaults to 0.5. """ - openai.api_key = api_key - self._engine = engine + self._model = model self._max_tokens = max_tokens self._temperature = temperature self._name = name + self._client = OpenAI() super().__init__() @property @@ -52,7 +50,7 @@ def name(self): return "_".join( [ str(OpenAIModel.model_source), - str(self._engine), + str(self._model), str(self._max_tokens), str(self._temperature), ] @@ -70,8 +68,8 @@ def predict(self, message: str, num_of_response: int = 1): List[str]: List of response. 
""" prompt = f"Question: {message}\nAnswer:" - response = openai.Completion.create( - engine=self._engine, + response = self._client.completions.create( + model=self._model, prompt=prompt, max_tokens=self._max_tokens, n=num_of_response, diff --git a/pykoi/chat/llm/peft_huggingface.py b/pykoi/chat/llm/peft_huggingface.py index 1d2db89..47edce0 100644 --- a/pykoi/chat/llm/peft_huggingface.py +++ b/pykoi/chat/llm/peft_huggingface.py @@ -1,8 +1,7 @@ """Huggingface PEFT model for Language Model (LLM).""" import torch - -from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel +from transformers import AutoModelForCausalLM, AutoTokenizer from pykoi.chat.llm.abs_llm import AbsLlm @@ -108,8 +107,7 @@ def predict(self, message: str, num_of_response: int = 1): ) print("[HuggingfaceModel] decode...") response = [ - self._tokenizer.decode(ids, skip_special_tokens=True) - for ids in output_ids + self._tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids ] print("response: ", response) diff --git a/pykoi/component/__init__.py b/pykoi/component/__init__.py index 5a66798..a22b1e2 100644 --- a/pykoi/component/__init__.py +++ b/pykoi/component/__init__.py @@ -1,4 +1,4 @@ from pykoi.component.base import Chatbot, Dashboard, Dropdown from pykoi.component.chatbot_comparator import Compare -from pykoi.component.retrieval_qa import RetrievalQA from pykoi.component.nvml import Nvml +from pykoi.component.retrieval_qa import RetrievalQA diff --git a/pykoi/component/base.py b/pykoi/component/base.py index d413eb2..e755cf6 100644 --- a/pykoi/component/base.py +++ b/pykoi/component/base.py @@ -2,13 +2,13 @@ import uuid from typing import Callable, List, Optional, Union -from pykoi.component.chatbot_database_factory import ChatbotDatabaseFactory -from pykoi.component.constants import FeedbackType from pykoi.chat.db.comparator_database import ComparatorDatabase from pykoi.chat.db.qa_database import QuestionAnswerDatabase from pykoi.chat.db.rag_database import RAGDatabase from pykoi.chat.db.ranking_database import RankingDatabase from pykoi.chat.llm.abs_llm import AbsLlm +from pykoi.component.chatbot_database_factory import ChatbotDatabaseFactory +from pykoi.component.constants import FeedbackType class DataSource: diff --git a/pykoi/component/chatbot_comparator.py b/pykoi/component/chatbot_comparator.py index 09ffc64..11fb4da 100644 --- a/pykoi/component/chatbot_comparator.py +++ b/pykoi/component/chatbot_comparator.py @@ -1,15 +1,13 @@ """Chatbot comparator component.""" import time -import pandas as pd - from typing import List -from pykoi.component.base import Component -from pykoi.chat.db.comparator_database import ( - ComparatorDatabase, - ComparatorQuestionDatabase, -) +import pandas as pd + +from pykoi.chat.db.comparator_database import (ComparatorDatabase, + ComparatorQuestionDatabase) from pykoi.chat.llm.abs_llm import AbsLlm +from pykoi.component.base import Component from pykoi.interactives.barchart import Barchart diff --git a/pykoi/component/chatbot_database_factory.py b/pykoi/component/chatbot_database_factory.py index d774ba8..825fc8b 100644 --- a/pykoi/component/chatbot_database_factory.py +++ b/pykoi/component/chatbot_database_factory.py @@ -1,10 +1,10 @@ """Chatbot Database Factory class.""" from typing import Union -from pykoi.component.constants import FeedbackType from pykoi.chat.db.qa_database import QuestionAnswerDatabase -from pykoi.chat.db.ranking_database import RankingDatabase from pykoi.chat.db.rag_database import RAGDatabase +from 
pykoi.chat.db.ranking_database import RankingDatabase +from pykoi.component.constants import FeedbackType class ChatbotDatabaseFactory: diff --git a/pykoi/component/nvml.py b/pykoi/component/nvml.py index d215e15..05d5675 100644 --- a/pykoi/component/nvml.py +++ b/pykoi/component/nvml.py @@ -1,8 +1,8 @@ """nvml component""" -from pykoi.ops.nvml import Nvml as Nv from pykoi.component.base import Component +from pykoi.ops.nvml import Nvml as Nv class Nvml(Component): diff --git a/pykoi/component/retrieval_qa.py b/pykoi/component/retrieval_qa.py index c15de2d..7260bb2 100644 --- a/pykoi/component/retrieval_qa.py +++ b/pykoi/component/retrieval_qa.py @@ -1,10 +1,9 @@ """Retrieval QA component.""" +from pykoi.component.base import Component +from pykoi.component.chatbot_database_factory import ChatbotDatabaseFactory from pykoi.retrieval.llm.abs_llm import AbsLlm from pykoi.retrieval.vectordb.abs_vectordb import AbsVectorDb -from pykoi.component.chatbot_database_factory import ChatbotDatabaseFactory - -from pykoi.component.base import Component class RetrievalQA(Component): diff --git a/pykoi/interactives/barchart.py b/pykoi/interactives/barchart.py index e164ff1..507fc84 100644 --- a/pykoi/interactives/barchart.py +++ b/pykoi/interactives/barchart.py @@ -1,9 +1,7 @@ """Module for the compiled Svelte Barchart interactive.""" import json - from random import randint - -from typing import Dict, Any +from typing import Any, Dict class Barchart: diff --git a/pykoi/interactives/chatbot.py b/pykoi/interactives/chatbot.py index 4e28e32..c18614f 100644 --- a/pykoi/interactives/chatbot.py +++ b/pykoi/interactives/chatbot.py @@ -1,8 +1,6 @@ import json - from random import randint - -from typing import Dict, Any +from typing import Any, Dict class Chatbot: diff --git a/pykoi/ops/__init__.py b/pykoi/ops/__init__.py index 69f541b..7663999 100644 --- a/pykoi/ops/__init__.py +++ b/pykoi/ops/__init__.py @@ -1 +1 @@ -from pykoi.ops.nvml import Nvml \ No newline at end of file +from pykoi.ops.nvml import Nvml diff --git a/pykoi/ops/nvml.py b/pykoi/ops/nvml.py index 0ea0497..12a6d9f 100644 --- a/pykoi/ops/nvml.py +++ b/pykoi/ops/nvml.py @@ -1,7 +1,6 @@ """ NVML (NVIDIA Management Library). 
""" import time - from datetime import datetime from typing import Any, Dict, List diff --git a/pykoi/retrieval/llm/embedding_factory.py b/pykoi/retrieval/llm/embedding_factory.py index 990384e..e7860f3 100644 --- a/pykoi/retrieval/llm/embedding_factory.py +++ b/pykoi/retrieval/llm/embedding_factory.py @@ -1,8 +1,8 @@ """Embedding factory for LLM""" from typing import Union +from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings from langchain.embeddings.base import Embeddings -from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings from pykoi.retrieval.llm.constants import ModelSource @@ -28,9 +28,11 @@ def create_embedding(model_source: Union[str, ModelSource], **kwargs) -> Embeddi model_source = ModelSource(model_source) if model_source == ModelSource.OPENAI: from langchain.embeddings import OpenAIEmbeddings + return OpenAIEmbeddings() elif model_source == ModelSource.HUGGINGFACE: from langchain.embeddings import HuggingFaceEmbeddings + return HuggingFaceEmbeddings( model_name=kwargs.get("model_name"), ) diff --git a/pykoi/retrieval/llm/huggingface.py b/pykoi/retrieval/llm/huggingface.py index d4df1ae..283a6e8 100644 --- a/pykoi/retrieval/llm/huggingface.py +++ b/pykoi/retrieval/llm/huggingface.py @@ -1,13 +1,13 @@ """OpenAI language model for retrieval""" import os -import torch +import torch +from dotenv import load_dotenv from langchain.chains import RetrievalQA from langchain.llms import HuggingFacePipeline from pykoi.retrieval.llm.abs_llm import AbsLlm from pykoi.retrieval.vectordb.abs_vectordb import AbsVectorDb -from dotenv import load_dotenv # NOTE: Configure your MIN_DOCS as RAG_NUM_SOURCES in .env file. # Load environment variables from .env file @@ -15,6 +15,7 @@ MIN_DOCS = int(os.getenv("RAG_NUM_SOURCES", default=2)) + class HuggingFaceModel(AbsLlm): """ A class representing a language model that uses Huggingface's model to generate text. 
@@ -34,7 +35,7 @@ def __init__(self, vector_db: AbsVectorDb, **kwargs): "temperature": 0, "max_length": kwargs.get("max_length", 500), "load_in_8bit": True, - "trust_remote_code": kwargs.get("trust_remote_code", True) + "trust_remote_code": kwargs.get("trust_remote_code", True), }, ) @@ -43,7 +44,9 @@ def __init__(self, vector_db: AbsVectorDb, **kwargs): self._retrieve_qa = RetrievalQA.from_chain_type( llm=self._llm, chain_type="stuff", - retriever=self._vector_db.as_retriever(search_kwargs={"k": MIN_DOCS, "filter": {}}), + retriever=self._vector_db.as_retriever( + search_kwargs={"k": MIN_DOCS, "filter": {}} + ), verbose=True, return_source_documents=True, ) @@ -69,12 +72,17 @@ def re_init(self, file_names: list[str]): self._retrieve_qa = RetrievalQA.from_chain_type( llm=self._llm, chain_type="stuff", - retriever=self._vector_db.as_retriever(search_kwargs={"k": MIN_DOCS, "filter": metadata_filename_filter}), + retriever=self._vector_db.as_retriever( + search_kwargs={"k": MIN_DOCS, "filter": metadata_filename_filter} + ), verbose=True, return_source_documents=True, ) - print("Re-initialized HuggingFaceModel successfully with filter: ", metadata_filename_filter) + print( + "Re-initialized HuggingFaceModel successfully with filter: ", + metadata_filename_filter, + ) super().__init__(self._retrieve_qa) except Exception as ex: diff --git a/pykoi/retrieval/llm/openai.py b/pykoi/retrieval/llm/openai.py index e0d9e7a..d5085c9 100644 --- a/pykoi/retrieval/llm/openai.py +++ b/pykoi/retrieval/llm/openai.py @@ -1,14 +1,13 @@ """OpenAI language model for retrieval""" import os - from typing import List +from dotenv import load_dotenv from langchain.chains import RetrievalQA -from langchain.llms import OpenAI +from langchain.chat_models import ChatOpenAI from pykoi.retrieval.llm.abs_llm import AbsLlm from pykoi.retrieval.vectordb.abs_vectordb import AbsVectorDb -from dotenv import load_dotenv # NOTE: Configure your MIN_DOCS as RAG_NUM_SOURCES in .env file. # Load environment variables from .env file @@ -28,10 +27,7 @@ def __init__(self, vector_db: AbsVectorDb): Initializes the OpenAIModel class. 
""" try: - self._llm = OpenAI( - model_name="gpt-4", - temperature=0, - max_tokens=500) + self._llm = ChatOpenAI(model_name="gpt-4", temperature=0, max_tokens=500) self._vector_db = vector_db.vector_db diff --git a/pykoi/retrieval/llm/retrieval_factory.py b/pykoi/retrieval/llm/retrieval_factory.py index dfca7db..caa5574 100644 --- a/pykoi/retrieval/llm/retrieval_factory.py +++ b/pykoi/retrieval/llm/retrieval_factory.py @@ -28,9 +28,11 @@ def create( model_source = ModelSource(model_source) if model_source == ModelSource.OPENAI: from pykoi.retrieval.llm.openai import OpenAIModel + return OpenAIModel(vector_db) if model_source == ModelSource.HUGGINGFACE: from pykoi.retrieval.llm.huggingface import HuggingFaceModel + return HuggingFaceModel(vector_db, **kwargs) except Exception as ex: raise Exception(f"Unknown model: {model_source}") from ex diff --git a/pykoi/retrieval/vectordb/abs_vectordb.py b/pykoi/retrieval/vectordb/abs_vectordb.py index 2244643..597248c 100644 --- a/pykoi/retrieval/vectordb/abs_vectordb.py +++ b/pykoi/retrieval/vectordb/abs_vectordb.py @@ -1,9 +1,9 @@ import os -import docx2txt - from abc import ABC, abstractmethod -from langchain.text_splitter import RecursiveCharacterTextSplitter from pathlib import Path + +import docx2txt +from langchain.text_splitter import RecursiveCharacterTextSplitter from pdfminer.high_level import extract_text diff --git a/pykoi/retrieval/vectordb/chroma.py b/pykoi/retrieval/vectordb/chroma.py index 34d5a82..ef93f2a 100644 --- a/pykoi/retrieval/vectordb/chroma.py +++ b/pykoi/retrieval/vectordb/chroma.py @@ -1,6 +1,6 @@ import os -import numpy as np +import numpy as np from langchain.embeddings.base import Embeddings from langchain.vectorstores import Chroma from sklearn.decomposition import PCA diff --git a/pykoi/retrieval/vectordb/epsilla.py b/pykoi/retrieval/vectordb/epsilla.py index 0c48517..59bc7d3 100644 --- a/pykoi/retrieval/vectordb/epsilla.py +++ b/pykoi/retrieval/vectordb/epsilla.py @@ -1,14 +1,14 @@ """Vector store Epsilla module""" import os -import numpy as np import types - from typing import List + +import numpy as np +from langchain.embeddings import OpenAIEmbeddings from langchain.embeddings.base import Embeddings from langchain.schema import BaseRetriever, Document -from langchain.embeddings import OpenAIEmbeddings -from sklearn.decomposition import PCA from pyepsilla import vectordb +from sklearn.decomposition import PCA from pykoi.retrieval.vectordb.abs_vectordb import AbsVectorDb @@ -127,9 +127,7 @@ def as_retriever_wrapper(self, search_kwargs): ], ) if status_code == 409: - print( - f"{result['message']}. Continuing with the existing table." - ) + print(f"{result['message']}. 
Continuing with the existing table.") super().__init__() diff --git a/pykoi/rlhf/__init__.py b/pykoi/rlhf/__init__.py index affa11f..626ae61 100644 --- a/pykoi/rlhf/__init__.py +++ b/pykoi/rlhf/__init__.py @@ -1,4 +1,4 @@ -from pykoi.rlhf.supervised_finetuning import SupervisedFinetuning -from pykoi.rlhf.rw_finetuning import RewardFinetuning +from pykoi.rlhf.config import RLHFConfig from pykoi.rlhf.rl_finetuning import RLFinetuning -from pykoi.rlhf.config import RLHFConfig \ No newline at end of file +from pykoi.rlhf.rw_finetuning import RewardFinetuning +from pykoi.rlhf.supervised_finetuning import SupervisedFinetuning diff --git a/pykoi/rlhf/config.py b/pykoi/rlhf/config.py index d413fe7..e7721f1 100644 --- a/pykoi/rlhf/config.py +++ b/pykoi/rlhf/config.py @@ -19,9 +19,7 @@ class RLHFConfig: base_model_path: str = field( default="elinas/llama-7b-hf-transformers-4.29", - metadata={ - "help": "Huggingface model name or a local path to the base model." - }, + metadata={"help": "Huggingface model name or a local path to the base model."}, ) dataset_type: Optional[str] = field( default="local_db", @@ -67,8 +65,9 @@ class RLHFConfig: # batch_size: int = field( # default=8, # metadata={"help": "Batch size."}) + # TODO: for trl 0.7.4 there is a OOM issue with batch size > 1, need to revisit per_device_train_batch_size: Optional[int] = field( - default=2, metadata={"help": "Batch size per device for training."} + default=1, metadata={"help": "Batch size per device for training."} ) per_device_eval_batch_size: Optional[int] = field( default=8, metadata={"help": "Batch size per device for evaluation."} @@ -89,12 +88,8 @@ class RLHFConfig: local_rank: Optional[int] = field( default=-1, metadata={"help": "Used for multi-gpu."} ) - fp16: Optional[bool] = field( - default=True, metadata={"help": "Enable FP16."} - ) - bf16: Optional[bool] = field( - default=False, metadata={"help": "Enable BF16."} - ) + fp16: Optional[bool] = field(default=True, metadata={"help": "Enable FP16."}) + bf16: Optional[bool] = field(default=False, metadata={"help": "Enable BF16."}) load_in_8bit: Optional[bool] = field( default=True, metadata={"help": "Whether load the model weights in 8-bit or not."}, @@ -113,6 +108,9 @@ class RLHFConfig: gradient_checkpointing: Optional[bool] = field( default=False, metadata={"help": "Enable gradient checkpointing."} ) + gradient_checkpointing_use_reentrant: Optional[bool] = field( + default=True, metadata={"help": "Enable reentrant for gradient checkpointing."} + ) seed: Optional[int] = field(default=0, metadata={"help": "Random seed."}) num_workers: Optional[int] = field( default=None, metadata={"help": "Number of workers."} @@ -121,9 +119,7 @@ class RLHFConfig: default="./rlhf_checkpoints", metadata={"help": "Output directory for all model weights."}, ) - log_freq: Optional[int] = field( - default=1, metadata={"help": "Logging frequency."} - ) + log_freq: Optional[int] = field(default=1, metadata={"help": "Logging frequency."}) eval_freq: Optional[int] = field( default=1000, metadata={"help": "Evaluation frequency."} ) @@ -135,7 +131,7 @@ class RLHFConfig: metadata={"help": "Whether push to Huggingface Hub or not."}, ) - ## Step 1 SFT parameters + # Step 1 SFT parameters max_steps: Optional[int] = field( default=5, metadata={"help": "Maximum number of training steps."} ) @@ -145,9 +141,7 @@ class RLHFConfig: ) dataset_subset_sft_train: Optional[int] = field( default=10000, - metadata={ - "help": "The size of the subset of the training data to use." 
- }, + metadata={"help": "The size of the subset of the training data to use."}, ) split: Optional[str] = field( default="train", metadata={"help": "Dataset split to use."} @@ -167,8 +161,7 @@ class RLHFConfig: default="step1_supervised_finetuning_lora_final/", metadata={ "help": ( - "Output directory for step 1 supervised finetuning's Lora" - " weights." + "Output directory for step 1 supervised finetuning's Lora" " weights." ) }, ) @@ -194,17 +187,14 @@ class RLHFConfig: reward_model_path: Optional[str] = field( default="databricks/dolly-v2-3b", metadata={ - "help": ( - "Huggingface model name or a local path to the reward model." - ) + "help": ("Huggingface model name or a local path to the reward model.") }, ) reward_lora_path: Optional[str] = field( default="step2_reward_finetuning_lora_final/", metadata={ "help": ( - "Output directory for step 1 supervised finetuning's Lora" - " weights." + "Output directory for step 1 supervised finetuning's Lora" " weights." ) }, ) @@ -222,9 +212,7 @@ class RLHFConfig: ) reward_num_of_data: Optional[int] = field( default=1000, - metadata={ - "help": "The size of the subset of the training data to use." - }, + metadata={"help": "The size of the subset of the training data to use."}, ) max_seq_length_reward: Optional[int] = field( default=512, metadata={"help": "Maximum sequence length."} @@ -246,9 +234,7 @@ class RLHFConfig: ) label_names: Optional[List[str]] = field( default_factory=list, - metadata={ - "help": "List of column names in the dataset to use as labels." - }, + metadata={"help": "List of column names in the dataset to use as labels."}, ) logging_strategy: Optional[str] = field( default="steps", @@ -284,20 +270,14 @@ class RLHFConfig: ) dataset_subset_rl_train: Optional[int] = field( default=10000, - metadata={ - "help": "The size of the subset of the training data to use." - }, + metadata={"help": "The size of the subset of the training data to use."}, ) adafactor: Optional[bool] = field( default=False, metadata={"help": "whether to use the adafactor optimizer"}, ) - top_k: Optional[float] = field( - default=0.0, metadata={"help": "Value for top_k"} - ) - top_p: Optional[float] = field( - default=1.0, metadata={"help": "Value for top_p"} - ) + top_k: Optional[float] = field(default=0.0, metadata={"help": "Value for top_k"}) + top_p: Optional[float] = field(default=1.0, metadata={"help": "Value for top_p"}) do_sample: Optional[bool] = field( default=True, metadata={"help": "Flag for sampling"} ) @@ -318,9 +298,7 @@ class RLHFConfig: ) ppo_epochs: Optional[int] = field( default=10, - metadata={ - "help": "the number of optimisation epochs per batch of samples" - }, + metadata={"help": "the number of optimisation epochs per batch of samples"}, ) total_epochs: Optional[int] = field( default=100, metadata={"help": "number of total epochs"} @@ -333,9 +311,7 @@ class RLHFConfig: ) reward_baseline: Optional[float] = field( default=0.0, - metadata={ - "help": "a baseline value that is subtracted from the reward" - }, + metadata={"help": "a baseline value that is subtracted from the reward"}, ) init_kl_coef: Optional[float] = field( default=0.2, @@ -354,8 +330,7 @@ class RLHFConfig: default="step3_reinforcement_learning_final_lora_weights/", metadata={ "help": ( - "Output directory for step 3 reinforcement learning's Lora" - " weights." + "Output directory for step 3 reinforcement learning's Lora" " weights." 
) }, ) diff --git a/pykoi/rlhf/rl_finetuning.py b/pykoi/rlhf/rl_finetuning.py index bc6bbe3..ef0c1b7 100644 --- a/pykoi/rlhf/rl_finetuning.py +++ b/pykoi/rlhf/rl_finetuning.py @@ -1,46 +1,33 @@ """rl finetuning.""" +import json +import os import time from datetime import datetime -from pykoi.rlhf.config import RLHFConfig -from pykoi.chat.db.constants import ( - QA_CSV_HEADER_ID, - QA_CSV_HEADER_QUESTION, - QA_CSV_HEADER_ANSWER, - QA_CSV_HEADER_VOTE_STATUS, -) -import os -import json import numpy as np import torch -from pykoi.chat.db.qa_database import QuestionAnswerDatabase from accelerate import Accelerator from datasets import Dataset, load_dataset - +from huggingface_hub import hf_hub_download +from peft import AutoPeftModelForCausalLM, PeftConfig, PeftModel from tqdm import tqdm -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - Trainer, - pipeline, - set_seed, -) +from transformers import (AutoModelForCausalLM, + AutoModelForSequenceClassification, AutoTokenizer, + Trainer, pipeline, set_seed) from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer from trl.core import LengthSampler -from huggingface_hub import hf_hub_download -from transformers import AutoModelForCausalLM -from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM + +from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID, + QA_CSV_HEADER_QUESTION, + QA_CSV_HEADER_VOTE_STATUS) +from pykoi.chat.db.qa_database import QuestionAnswerDatabase +from pykoi.rlhf.config import RLHFConfig +from pykoi.telemetry.events import RLStartEvent, RLStopEvent from pykoi.telemetry.telemetry import Telemetry -from pykoi.telemetry.events import ( - RLStartEvent, - RLStopEvent, -) class RLFinetuning(Trainer): - def __init__(self, - rlhf_config: RLHFConfig, - enable_telemetry: bool = True) -> None: + def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None: """ RLFinetuning class for finetuning a language model using reinforcement learning. @@ -51,9 +38,7 @@ def __init__(self, self._rlhf_config = rlhf_config self.accelerator = Accelerator() self.num_proc = ( - self._rlhf_config.num_workers - if not self._rlhf_config.streaming - else None + self._rlhf_config.num_workers if not self._rlhf_config.streaming else None ) set_seed(rlhf_config.seed) @@ -75,53 +60,55 @@ def __init__(self, ) ## Load the reward model and tokenizer and define the reward pipeline - self.reward_tokenizer = self.create_tokenizer( - rlhf_config.reward_model_path - ) + self.reward_tokenizer = self.create_tokenizer(rlhf_config.reward_model_path) self.reward_dataset = self.create_dataset(self.reward_tokenizer) reward_model_path = rlhf_config.reward_model_path try: # If there is a trained peft adapter in the hub, load its config. 
diff --git a/pykoi/rlhf/rl_finetuning.py b/pykoi/rlhf/rl_finetuning.py
index bc6bbe3..ef0c1b7 100644
--- a/pykoi/rlhf/rl_finetuning.py
+++ b/pykoi/rlhf/rl_finetuning.py
@@ -1,46 +1,33 @@
 """rl finetuning."""
+import json
+import os
 import time
 from datetime import datetime

-from pykoi.rlhf.config import RLHFConfig
-from pykoi.chat.db.constants import (
-    QA_CSV_HEADER_ID,
-    QA_CSV_HEADER_QUESTION,
-    QA_CSV_HEADER_ANSWER,
-    QA_CSV_HEADER_VOTE_STATUS,
-)
-import os
-import json
 import numpy as np
 import torch
-from pykoi.chat.db.qa_database import QuestionAnswerDatabase
 from accelerate import Accelerator
 from datasets import Dataset, load_dataset
-
+from huggingface_hub import hf_hub_download
+from peft import AutoPeftModelForCausalLM, PeftConfig, PeftModel
 from tqdm import tqdm
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    Trainer,
-    pipeline,
-    set_seed,
-)
+from transformers import (AutoModelForCausalLM,
+                          AutoModelForSequenceClassification, AutoTokenizer,
+                          Trainer, pipeline, set_seed)
 from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
 from trl.core import LengthSampler
-from huggingface_hub import hf_hub_download
-from transformers import AutoModelForCausalLM
-from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM
+
+from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID,
+                                     QA_CSV_HEADER_QUESTION,
+                                     QA_CSV_HEADER_VOTE_STATUS)
+from pykoi.chat.db.qa_database import QuestionAnswerDatabase
+from pykoi.rlhf.config import RLHFConfig
+from pykoi.telemetry.events import RLStartEvent, RLStopEvent
 from pykoi.telemetry.telemetry import Telemetry
-from pykoi.telemetry.events import (
-    RLStartEvent,
-    RLStopEvent,
-)


 class RLFinetuning(Trainer):
-    def __init__(self,
-                 rlhf_config: RLHFConfig,
-                 enable_telemetry: bool = True) -> None:
+    def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None:
         """
         RLFinetuning class for finetuning a language model using reinforcement learning.

@@ -51,9 +38,7 @@ def __init__(self,
         self._rlhf_config = rlhf_config
         self.accelerator = Accelerator()
         self.num_proc = (
-            self._rlhf_config.num_workers
-            if not self._rlhf_config.streaming
-            else None
+            self._rlhf_config.num_workers if not self._rlhf_config.streaming else None
         )
         set_seed(rlhf_config.seed)

@@ -75,53 +60,55 @@ def __init__(self,
         )

         ## Load the reward model and tokenizer and define the reward pipeline
-        self.reward_tokenizer = self.create_tokenizer(
-            rlhf_config.reward_model_path
-        )
+        self.reward_tokenizer = self.create_tokenizer(rlhf_config.reward_model_path)
         self.reward_dataset = self.create_dataset(self.reward_tokenizer)
         reward_model_path = rlhf_config.reward_model_path
         try:
             # If there is a trained peft adapter in the hub, load its config.
-            remote_adapter_config_reward = hf_hub_download(reward_model_path, "adapter_config.json")
+            remote_adapter_config_reward = hf_hub_download(
+                reward_model_path, "adapter_config.json"
+            )
         except:
             remote_adapter_config_reward = None

-        local_adapter_present_reward = os.path.exists(
+        local_adapter_present_reward = os.path.exists(
             os.path.join(reward_model_path, "adapter_config.json")
         )

         # Load the trained peft adapter config
         if local_adapter_present_reward:
-            trained_adapter_config_reward = PeftConfig.from_pretrained(reward_model_path)
+            trained_adapter_config_reward = PeftConfig.from_pretrained(
+                reward_model_path
+            )
         else:
-            trained_adapter_config = PeftConfig.from_pretrained(remote_adapter_config_reward)
+            trained_adapter_config_reward = PeftConfig.from_pretrained(
+                remote_adapter_config_reward
+            )

         ## Load the pretrained base model
         pretrained_kwargs_reward = {
             "num_labels": 1,
-            "load_in_8bit": False, #True,
+            "load_in_8bit": False,  # True,
             "device_map": {"": Accelerator().local_process_index},
-        } # TODO: ADD
+        }  # TODO: ADD
         pretrained_model_reward = AutoModelForSequenceClassification.from_pretrained(
             trained_adapter_config_reward.base_model_name_or_path,
-            **pretrained_kwargs_reward
+            **pretrained_kwargs_reward,
         )

         ## TODO: LOAD MERGED BASE MODEL FROM STEP 2
         # Load the Peft model by combining the base model with the trained adapter
-        reward_model = PeftModel.from_pretrained(pretrained_model_reward, reward_model_path, is_trainable=False) # TODO: fix this. This should not be trainable.
+        reward_model = PeftModel.from_pretrained(
+            pretrained_model_reward, reward_model_path, is_trainable=False
+        )  # NOTE: keep the reward model frozen; it must not be trainable during PPO.
         self.reward_model = reward_model.merge_and_unload()
-        #pretrained_model.print_trainable_parameters()
+        # pretrained_model.print_trainable_parameters()
         print("\nTrained peft adapter loaded for reward model\n")

         # Have to specify the pad_token_id, or batching fails with: "Cannot handle batch sizes > 1 if no padding token is defined".
         # See https://stackoverflow.com/questions/68084302/assertionerror-cannot-handle-batch-sizes-1-if-no-padding-token-is-defined
         self.reward_model.config.pad_token_id = self.reward_tokenizer.pad_token_id

         self.reward_kwargs = {
             "top_k": None,
@@ -145,74 +132,78 @@ def __init__(self,
         self.base_dataset = self.create_dataset(self.base_tokenizer)
         pretrained_model_name_or_path = rlhf_config.base_model_path

-        # #NOTE: TODO: peft config will be directly inferred from the pre-trained model. rlhf_config.lora_config_rl will be ignored in previous implementation. Do we want to use it, in the flow of using merged model as base model and then add peft adapter again??
+        # NOTE/TODO: the peft config is inferred directly from the pre-trained model, so rlhf_config.lora_config_rl was ignored in the previous implementation. Do we want to use it in the flow of merging the model as the base model and then adding a peft adapter again?
         pretrained_kwargs = {
             "load_in_8bit": rlhf_config.load_in_8bit,
             "device_map": {"": Accelerator().local_process_index},
         }
-        assert isinstance(pretrained_model_name_or_path, str), "The `pretrained_model_path` should be a string."
+        assert isinstance(
+            pretrained_model_name_or_path, str
+        ), "The `pretrained_model_path` should be a string."
         try:
             # If there is a trained peft adapter in the hub, load its config.
-            remote_adapter_config = hf_hub_download(pretrained_model_name_or_path, "adapter_config.json")
+            remote_adapter_config = hf_hub_download(
+                pretrained_model_name_or_path, "adapter_config.json"
+            )
         except:
             remote_adapter_config = None

-        local_adapter_present = os.path.exists(
+        local_adapter_present = os.path.exists(
             os.path.join(pretrained_model_name_or_path, "adapter_config.json")
         )

         # Load the trained peft adapter config
         if local_adapter_present:
-            trained_adapter_config = PeftConfig.from_pretrained(pretrained_model_name_or_path)
+            trained_adapter_config = PeftConfig.from_pretrained(
+                pretrained_model_name_or_path
+            )
         else:
             trained_adapter_config = PeftConfig.from_pretrained(remote_adapter_config)

         # Load the pretrained base model
         pretrained_model = AutoModelForCausalLM.from_pretrained(
-            trained_adapter_config.base_model_name_or_path,
-            **pretrained_kwargs
+            trained_adapter_config.base_model_name_or_path, **pretrained_kwargs
         )

         # Load the Peft model by combining the base model with the trained adapter
-        is_trainable = True # TODO: If following merge+train new adapter flow. Below should not be trainable!
-        pretrained_model = PeftModel.from_pretrained(pretrained_model, pretrained_model_name_or_path, is_trainable=is_trainable)
+        is_trainable = True  # TODO: if following the merge-then-train-a-new-adapter flow, this should not be trainable!
+        pretrained_model = PeftModel.from_pretrained(
+            pretrained_model, pretrained_model_name_or_path, is_trainable=is_trainable
+        )

-        #pretrained_model.print_trainable_parameters()
+        # pretrained_model.print_trainable_parameters()
         print("\nTrained peft adapter loaded for policy model\n")

         # Alternatively, load a peft model from a local path. See https://huggingface.co/docs/peft/quicktour.
         # TODO: DELETE. This doesn't work.
         # peft_model = AutoPeftModelForCausalLM.from_pretrained(pretrained_model_name_or_path)

-
         # Add value head to the pretrained peft model to create a policy network.
         if isinstance(pretrained_model, PeftModel):
             is_peft_model = True
-        trl_model_args = {} # args for the value head
+        trl_model_args = {}  # args for the value head
         # TODO: weights of v_head are initialized using v_head_init_strategy="random" by default. trl also supports initialization using "norm".
         model = AutoModelForCausalLMWithValueHead(pretrained_model, **trl_model_args)

         # TODO: 1. VALUE HEAD REQUIRES GRAD = FALSE AND IS NOT ON CUDA; CHECK IF THE CODE BELOW FIXES THIS. 2. PEFTMODEL PRINT TRAINABLE PARAMETERS RETURNS ... AND NONE
-
         # For back compatibility with class AutoModelForCausalLMWithValueHead: is_peft_model needs to be specified or calling model.state_dict() will fail.
         model.is_peft_model = is_peft_model
         # For back compatibility
         model.is_sequential_parallel = True
         model.current_device = Accelerator().local_process_index

-        reward_adapter = None # TODO: Consider adding reward adapter here?
+        reward_adapter = None  # TODO: Consider adding a reward adapter here?
         if is_peft_model and reward_adapter is not None:
             model.add_and_load_reward_modeling_adapter(reward_adapter)
             model.supports_rm_adapter = True
         else:
             model.supports_rm_adapter = False

-        # Adding v_head to device and register hook. See AutoModelForCausalLMWithValueHead.post_init(). 
+        # Adding v_head to device and register hook. See AutoModelForCausalLMWithValueHead.post_init().
         # TODO: is register_forward_hook necessary? The outputs should already be on CUDA.
         first_device = list(set(model.pretrained_model.hf_device_map.values()))[0]
         model.v_head = model.v_head.to(first_device)
+
         def set_device_hook(module, input, outputs):
             new_output = ()
             for output in outputs:
@@ -221,9 +212,10 @@ def set_device_hook(module, input, outputs):
                 else:
                     new_output += (output,)
             return new_output
+
         model.register_forward_hook(set_device_hook)
         self.base_model = model
-        #breakpoint()
+        # breakpoint()
         # self.base_model = AutoModelForCausalLMWithValueHead.from_pretrained(
         #     rlhf_config.base_model_path,
         #     load_in_8bit=rlhf_config.load_in_8bit,
@@ -288,19 +280,13 @@ def create_dataset(self, tokenizer):
         """
         args = self._rlhf_config
         if args.dataset_type == "local_db":
-            qa_database = QuestionAnswerDatabase(
-                db_file=self._rlhf_config.dataset_name
-            )
+            qa_database = QuestionAnswerDatabase(db_file=self._rlhf_config.dataset_name)
             my_data_pd = qa_database.retrieve_all_question_answers_as_pandas()
-            my_data_pd = my_data_pd[
-                my_data_pd[QA_CSV_HEADER_VOTE_STATUS] == "up"
-            ]
+            my_data_pd = my_data_pd[my_data_pd[QA_CSV_HEADER_VOTE_STATUS] == "up"]
             my_data_pd = my_data_pd[
                 [QA_CSV_HEADER_ID, QA_CSV_HEADER_QUESTION, QA_CSV_HEADER_ANSWER]
             ]
-            print(
-                "My local database has {} samples".format(my_data_pd.shape[0])
-            )
+            print("My local database has {} samples".format(my_data_pd.shape[0]))
             dataset = Dataset.from_dict(my_data_pd)
         elif args.dataset_type == "local_csv":  ## TODO: test
             dataset = load_dataset("csv", data_files=args.dataset_name)
@@ -338,9 +324,7 @@ def preprocess_function(examples):
                 query = "Question: " + question + "\n\nAnswer: "
                 tokenized_question = tokenizer(query, truncation=True)
                 new_examples["query"].append(query)
-                new_examples["input_ids"].append(
-                    tokenized_question["input_ids"]
-                )
+                new_examples["input_ids"].append(tokenized_question["input_ids"])
             return new_examples

         dataset = dataset.map(
@@ -388,20 +372,13 @@ def train(self, save_checkpoints_path=None):
                 response_tensors, skip_special_tokens=True
             )
             # compute rewards and run PPO
-            texts = [
-                q + r
-                for q, r in zip(batch["query"], batch[QA_CSV_HEADER_ANSWER])
-            ]
+            texts = [q + r for q, r in zip(batch["query"], batch[QA_CSV_HEADER_ANSWER])]
             pipe_outputs = self.reward_pipe(texts, **self.reward_kwargs)
             rewards = [
-                torch.tensor(
-                    output[0]["score"] - self._rlhf_config.reward_baseline
-                )
+                torch.tensor(output[0]["score"] - self._rlhf_config.reward_baseline)
                 for output in pipe_outputs
             ]
-            stats = self.ppo_trainer.step(
-                question_tensors, response_tensors, rewards
-            )
+            stats = self.ppo_trainer.step(question_tensors, response_tensors, rewards)

             ## log stats
             self.log_stats_to_json(epoch=epoch, stats=stats, reward=rewards[0])
@@ -422,9 +399,7 @@ def train(self, save_checkpoints_path=None):
             )
             self.ppo_trainer.save_pretrained(save_checkpoints_path)

-    def log_stats_to_json(
-        self, epoch, stats, reward, filename="ppo_log_stats.json"
-    ):
+    def log_stats_to_json(self, epoch, stats, reward, filename="ppo_log_stats.json"):
         """
         Log the PPO stats to a json file.

         Args:
@@ -480,7 +455,7 @@ def train_and_save(self, output_path=None):
         """
         start_event = RLStartEvent(
             start_time=time.time(), date_time=datetime.utcfromtimestamp(time.time())
-            )
+        )
         self._telemetry.capture(start_event)
         self.train(save_checkpoints_path=output_path)
         self.save(output_path)
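The reward plumbing in train() above is compact, so here is a toy illustration (not part of the diff) of the shapes it passes around: the reward pipeline emits a list of {"label": ..., "score": ...} dicts per text, and PPO consumes one scalar reward tensor per (query, response) pair. The pipe_outputs below are hard-coded stand-ins for self.reward_pipe(texts, **self.reward_kwargs).

# Illustrative sketch of the reward computation inside train().
import torch

reward_baseline = 0.0  # mirrors RLHFConfig.reward_baseline
pipe_outputs = [
    [{"label": "LABEL_0", "score": 0.83}],  # stand-in pipeline output for text 1
    [{"label": "LABEL_0", "score": 0.41}],  # stand-in pipeline output for text 2
]
rewards = [torch.tensor(out[0]["score"] - reward_baseline) for out in pipe_outputs]
# These scalars are what ppo_trainer.step(question_tensors, response_tensors, rewards) consumes.
print(rewards)  # [tensor(0.8300), tensor(0.4100)]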
diff --git a/pykoi/rlhf/rw_finetuning.py b/pykoi/rlhf/rw_finetuning.py
index ecaba1e..92ba4c6 100644
--- a/pykoi/rlhf/rw_finetuning.py
+++ b/pykoi/rlhf/rw_finetuning.py
@@ -1,37 +1,26 @@
 """reward model finetuning."""
 import os
 import time
-
-from datetime import datetime
 from dataclasses import dataclass
+from datetime import datetime
 from typing import Any, Dict, List, Optional

 import evaluate
 import numpy as np
 import torch
-
 from datasets import Dataset, load_dataset
 from peft import get_peft_model
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    Trainer,
-    TrainingArguments,
-)
+from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
+                          Trainer, TrainingArguments)

-from pykoi.rlhf.config import RLHFConfig
-from pykoi.chat.db.constants import (
-    RANKING_CSV_HEADER_ID,
-    RANKING_CSV_HEADER_QUESTION,
-    RANKING_CSV_HEADER_LOW_RANKING_ANSWER,
-    RANKING_CSV_HEADER_UP_RANKING_ANSWER,
-)
+from pykoi.chat.db.constants import (RANKING_CSV_HEADER_ID,
+                                     RANKING_CSV_HEADER_LOW_RANKING_ANSWER,
+                                     RANKING_CSV_HEADER_QUESTION,
+                                     RANKING_CSV_HEADER_UP_RANKING_ANSWER)
 from pykoi.chat.db.ranking_database import RankingDatabase
+from pykoi.rlhf.config import RLHFConfig
+from pykoi.telemetry.events import RWStartEvent, RWStopEvent
 from pykoi.telemetry.telemetry import Telemetry
-from pykoi.telemetry.events import (
-    RWStartEvent,
-    RWStopEvent,
-)


 @dataclass
@@ -70,9 +59,7 @@ def extract_and_pad(key_ids, key_mask):
 class RewardFinetuning(Trainer):
-    def __init__(self,
-                 rlhf_config: RLHFConfig,
-                 enable_telemetry: bool = True) -> None:
+    def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None:
         self._telemetry = Telemetry(enable_telemetry)
         self._rlhf_config = rlhf_config
         self.args = TrainingArguments(
@@ -95,16 +82,14 @@ def __init__(self,
             logging_steps=rlhf_config.logging_steps,
             # optim=rlhf_config.optim,
             # lr_scheduler_type=rlhf_config.lr_scheduler_type_rw,
-            adam_epsilon = 1e-7 # Language model is loaded in torch.float16. Adam optimizer adds epsilon to avoid zero denominator.
-            # NOTE: torch.float 16 will round any number smaller than 6e-8 to 0. Do not change episolon to smaller than 6e-8.
+            adam_epsilon=1e-7  # The language model is loaded in torch.float16; the Adam optimizer adds epsilon to avoid a zero denominator.
+            # NOTE: torch.float16 rounds any number smaller than 6e-8 to 0. Do not set epsilon below 6e-8.
         )
         self.torch_dtype = torch.bfloat16 if rlhf_config.bf16 else torch.float16
         # self.torch_dtype = torch.bfloat16 if bf16 else (torch.float16 if fp16 else torch.float32)

         # Load the tokenizer and the model
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            rlhf_config.reward_model_path
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained(rlhf_config.reward_model_path)
         self.tokenizer.pad_token = self.tokenizer.eos_token

         self.base_model = AutoModelForSequenceClassification.from_pretrained(
@@ -114,16 +99,12 @@ def __init__(self,
             load_in_8bit=rlhf_config.load_in_8bit,
             device_map=rlhf_config.device_map,
         )
-        self.model = get_peft_model(
-            self.base_model, rlhf_config.lora_config_reward
-        )
+        self.model = get_peft_model(self.base_model, rlhf_config.lora_config_reward)
         self.model.print_trainable_parameters()
         self.model.config.pad_token_id = self.tokenizer.eos_token_id
         self.model.config.use_cache = not rlhf_config.gradient_checkpointing
         self.num_proc = (
-            self._rlhf_config.num_workers
-            if not self._rlhf_config.streaming
-            else None
+            self._rlhf_config.num_workers if not self._rlhf_config.streaming else None
         )

         self.dataset = self.create_datasets()
@@ -191,9 +172,7 @@ def create_datasets(self):
         # based on dataset_type (e.g. "huggingface", "csv", etc.), load the data
         if self._rlhf_config.dataset_type == "local_db":
             ranking_database = RankingDatabase()
-            my_data_pd = (
-                ranking_database.retrieve_all_question_answers_as_pandas()
-            )
+            my_data_pd = ranking_database.retrieve_all_question_answers_as_pandas()
             my_data_pd = my_data_pd[
                 [
                     RANKING_CSV_HEADER_ID,
@@ -219,9 +198,7 @@ def create_datasets(self):
             # streaming=self._rlhf_config.streaming,
             # )
         elif self._rlhf_config.dataset_type == "csv":
-            dataset = load_dataset(
-                "csv", data_files=self._rlhf_config.dataset_name
-            )
+            dataset = load_dataset("csv", data_files=self._rlhf_config.dataset_name)
         else:
             raise FileNotFoundError(
                 "No (supported) data files or dataset script found"
@@ -235,8 +212,7 @@ def create_datasets(self):
             num_proc=self.num_proc,
         )
         dataset = dataset.filter(
-            lambda x: len(x["input_ids_x"])
-            <= self._rlhf_config.max_seq_length_reward
+            lambda x: len(x["input_ids_x"]) <= self._rlhf_config.max_seq_length_reward
             and len(x["input_ids_y"]) <= self._rlhf_config.max_seq_length_reward
         )

@@ -318,7 +294,7 @@ def train_and_save(self, output_path=None):
         """
         start_event = RWStartEvent(
             start_time=time.time(), date_time=datetime.utcfromtimestamp(time.time())
-            )
+        )
         self._telemetry.capture(start_event)
         self.train()
         self.save(output_path)
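The collator in this file pads the up-ranked and low-ranked answer pairs that create_datasets() pulls from the ranking database. The compute_loss for those pairs is outside the hunks shown here, but the pairwise ranking objective a reward model of this shape typically optimizes looks like the following sketch (illustrative only; the tensors are toy values, not the project's code):

# Pairwise ranking loss commonly used for reward models trained on
# (up-ranked, low-ranked) answer pairs: push the chosen score above the rejected one.
import torch
import torch.nn.functional as F

rewards_chosen = torch.tensor([1.3, 0.2])    # scores for up-ranked answers
rewards_rejected = torch.tensor([0.4, 0.9])  # scores for low-ranked answers
loss = -F.logsigmoid(rewards_chosen - rewards_rejected).mean()
print(loss)  # larger when the model prefers the low-ranked answer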
diff --git a/pykoi/rlhf/supervised_finetuning.py b/pykoi/rlhf/supervised_finetuning.py
index ba6016c..7a58a9f 100644
--- a/pykoi/rlhf/supervised_finetuning.py
+++ b/pykoi/rlhf/supervised_finetuning.py
@@ -1,34 +1,25 @@
 """supervised_finetuning."""
 import os
-from typing import Optional
-import torch
 import time
-
 from datetime import datetime
+from typing import Optional
+
+import torch
 from datasets import Dataset, load_dataset
 from peft import PeftConfig, PeftModel
-from transformers import (
-    AutoModelForCausalLM,
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    TrainingArguments,
-)
-
+from transformers import (AutoModelForCausalLM,
+                          AutoModelForSequenceClassification, AutoTokenizer,
+                          TrainingArguments)
 from trl import SFTTrainer
 from trl.trainer.utils import ConstantLengthDataset
-from pykoi.chat.db.constants import (
-    QA_CSV_HEADER_ID,
-    QA_CSV_HEADER_QUESTION,
-    QA_CSV_HEADER_ANSWER,
-    QA_CSV_HEADER_VOTE_STATUS,
-)
+
+from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID,
+                                     QA_CSV_HEADER_QUESTION,
+                                     QA_CSV_HEADER_VOTE_STATUS)
 from pykoi.chat.db.qa_database import QuestionAnswerDatabase
 from pykoi.rlhf.config import RLHFConfig
+from pykoi.telemetry.events import SFTStartEvent, SFTStopEvent
 from pykoi.telemetry.telemetry import Telemetry
-from pykoi.telemetry.events import (
-    SFTStartEvent,
-    SFTStopEvent,
-)


 class SupervisedFinetuning:
@@ -46,9 +37,7 @@ class SupervisedFinetuning:
         trainer (SFTTrainer): The trainer object used for training the model.
     """

-    def __init__(self,
-                 rlhf_config: RLHFConfig,
-                 enable_telemetry: bool = True) -> None:
+    def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None:
         """
         Initializes the SFTTrainer object.

@@ -58,18 +47,12 @@ def __init__(self,
         """
         self._telemetry = Telemetry(enable_telemetry)
         self._rlhf_config = rlhf_config
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            rlhf_config.base_model_path
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained(rlhf_config.base_model_path)
         self.num_proc = (
-            self._rlhf_config.num_workers
-            if not self._rlhf_config.streaming
-            else None
+            self._rlhf_config.num_workers if not self._rlhf_config.streaming else None
         )
         self.dataset = self.create_datasets(self.tokenizer, self._rlhf_config)
-        self.torch_dtype = (
-            torch.bfloat16 if self._rlhf_config.bf16 else torch.float16
-        )
+        self.torch_dtype = torch.bfloat16 if self._rlhf_config.bf16 else torch.float16
         # self.torch_dtype = torch.bfloat16 if bf16 else (torch.float16 if fp16 else torch.float32)
         self.training_args = TrainingArguments(
             output_dir=self._rlhf_config.output_dir,
@@ -86,6 +69,9 @@ def __init__(self,
             warmup_steps=self._rlhf_config.num_warmup_steps,
             gradient_accumulation_steps=self._rlhf_config.gradient_accumulation_steps,
             gradient_checkpointing=self._rlhf_config.gradient_checkpointing,
+            gradient_checkpointing_kwargs={
+                "use_reentrant": self._rlhf_config.gradient_checkpointing_use_reentrant
+            },
             fp16=self._rlhf_config.fp16,
             bf16=self._rlhf_config.bf16,
             weight_decay=self._rlhf_config.weight_decay,
@@ -158,7 +144,7 @@ def save(self, output_path=None):

     def train_and_save(self, output_path=None):
         start_event = SFTStartEvent(
             start_time=time.time(), date_time=datetime.utcfromtimestamp(time.time())
-            )
+        )
         self._telemetry.capture(start_event)
         self.trainer.train()
         self.save(output_path)
@@ -182,9 +168,7 @@ def create_datasets(self, tokenizer, args):
         if args.dataset_type == "local_db":
             qa_database = QuestionAnswerDatabase()
             my_data_pd = qa_database.retrieve_all_question_answers_as_pandas()
-            my_data_pd = my_data_pd[
-                my_data_pd[QA_CSV_HEADER_VOTE_STATUS] == "up"
-            ]
+            my_data_pd = my_data_pd[my_data_pd[QA_CSV_HEADER_VOTE_STATUS] == "up"]
             my_data_pd = my_data_pd[
                 [QA_CSV_HEADER_ID, QA_CSV_HEADER_QUESTION, QA_CSV_HEADER_ANSWER]
             ]
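One note on the gradient_checkpointing_kwargs hunk above: it reads self._rlhf_config.gradient_checkpointing_use_reentrant, so this diff presumes a matching field on RLHFConfig that is not visible in the config.py hunks shown here. For orientation, a minimal, hypothetical way to drive this class end to end (class and method names come from the diff; the dataset choice is illustrative):

# Hypothetical sketch: step 1 supervised finetuning on upvoted local Q&A pairs.
from pykoi.rlhf.config import RLHFConfig
from pykoi.rlhf.supervised_finetuning import SupervisedFinetuning

config = RLHFConfig(dataset_type="local_db")  # train on "up"-voted pairs from the local DB
sft = SupervisedFinetuning(config, enable_telemetry=False)
sft.train_and_save("step1_supervised_finetuning_lora_final/")  # default Lora output dir from config.py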
diff --git a/pykoi/telemetry/events.py b/pykoi/telemetry/events.py
index 53b9bde..f381814 100644
--- a/pykoi/telemetry/events.py
+++ b/pykoi/telemetry/events.py
@@ -1,12 +1,12 @@
 """This module contains telemetry events for PyKoi."""

 import os
-from dataclasses import asdict, dataclass
-from typing import ClassVar, Dict, Any
 import platform
-
-import requests
-import pynvml
+from dataclasses import asdict, dataclass
+from typing import Any, ClassVar, Dict

+import pynvml
+import requests

 try:
     pynvml.nvmlInit()
@@ -96,6 +96,7 @@ class AppStartEvent(TelemetryEvent):
         system (str): The name of the operating system.
         release (str): The release version of the operating system.
""" + name: ClassVar[str] = "app_start" start_time: float date_time: str @@ -116,6 +117,7 @@ class AppStopEvent(TelemetryEvent): date_time (str): The date and time when the application stopped. duration (str): The duration of the application. """ + name: ClassVar[str] = "app_end" end_time: float date_time: str @@ -136,6 +138,7 @@ class SFTStartEvent(TelemetryEvent): system (str): The name of the operating system. release (str): The release version of the operating system. """ + name: ClassVar[str] = "sft_start" start_time: float date_time: str @@ -156,6 +159,7 @@ class SFTStopEvent(TelemetryEvent): date_time (str): The date and time when the supervised finetuning stopped. duration (str): The duration of the supervised finetuning. """ + name: ClassVar[str] = "sft_end" end_time: float date_time: str @@ -176,6 +180,7 @@ class RWStartEvent(TelemetryEvent): system (str): The name of the operating system. release (str): The release version of the operating system. """ + name: ClassVar[str] = "rw_start" start_time: float date_time: str @@ -196,6 +201,7 @@ class RWStopEvent(TelemetryEvent): date_time (str): The date and time when the reward model finetuning stopped. duration (str): The duration of the reward model finetuning. """ + name: ClassVar[str] = "rw_end" end_time: float date_time: str @@ -216,6 +222,7 @@ class RLStartEvent(TelemetryEvent): system (str): The name of the operating system. release (str): The release version of the operating system. """ + name: ClassVar[str] = "rl_start" start_time: float date_time: str @@ -236,6 +243,7 @@ class RLStopEvent(TelemetryEvent): date_time (str): The date and time when the reinforcement learning finetuning stopped. duration (str): The duration of the reinforcement learning finetuning. """ + name: ClassVar[str] = "rl_end" end_time: float date_time: str diff --git a/pykoi/telemetry/telemetry.py b/pykoi/telemetry/telemetry.py index a0fe39a..6cc0b72 100644 --- a/pykoi/telemetry/telemetry.py +++ b/pykoi/telemetry/telemetry.py @@ -1,17 +1,16 @@ """This module contains telemetry for PyKoi.""" +import logging import os import sys import uuid -import logging - -from typing import Dict, Any from pathlib import Path +from typing import Any, Dict + from posthog import Posthog import pykoi from pykoi.telemetry.events import TelemetryEvent - logger = logging.getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index c8df6cf..b2054e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,26 +10,26 @@ readme = "README.md" python = ">=3.9,<3.13" packaging = "23.1" fastapi = "0.100.0" -pydantic = "1.10.11" +pydantic = "2.5.2" starlette = "0.27.0" uvicorn = "0.23.1" scipy = "1.11.1" -openai = "0.27.8" +openai = "1.6.1" passlib = "1.7.4" bcrypt = "4.0.1" posthog = "3.0.1" pynvml = "11.5.0" pandas = "2.0.3" -python-dotenv = "1.0.0" +python-dotenv = "^1.0.0" -transformers = { version = "4.35.0", optional = true } +transformers = { version = "4.36.2", optional = true } einops = { version = "0.6.1", optional = true } accelerate = { version = "0.21.0", optional = true } bitsandbytes = { version = "0.40.2", optional = true } langchain = { version = "0.0.338", optional = true } scikit-learn = { version = "1.3.0", optional = true } -chromadb = { version = "0.3.26", optional = true } +chromadb = { version = "0.4.20", optional = true } pyepsilla = { version = ">=0.1.1", optional = true } pdfminer-six = { version = "20221105", optional = true } docx2txt = { version = "0.8", optional = true } @@ -37,10 +37,10 @@ python-multipart = { version = "0.0.6", optional = true 
} tiktoken = { version = "0.4.0", optional = true } sentence-transformers = { version = "2.2.2", optional = true } -datasets = { version = "2.14.5", optional = true } +datasets = { version = "2.15.0", optional = true } evaluate = { version = "0.4.0", optional = true } -peft = { version = "0.5.0", optional = true } -trl = { version = "0.4.7", optional = true } +peft = { version = "0.7.1", optional = true } +trl = { version = "0.7.4", optional = true } [tool.poetry.extras] huggingface = [
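The pins above are not all drop-in upgrades: pydantic 1.10.11 to 2.5.2 and openai 0.27.8 to 1.6.1 are both major-version jumps with breaking API changes, alongside the peft (0.5.0 to 0.7.1), trl (0.4.7 to 0.7.4), transformers, chromadb, and datasets bumps. A small, hypothetical post-upgrade smoke check (not part of the diff) to confirm the resolved versions:

# Hypothetical smoke check after `poetry install`: verify the bumped pins resolved.
import importlib.metadata as md

for pkg in ("pydantic", "openai", "peft", "trl", "transformers"):
    print(pkg, md.version(pkg))
# Expected under these pins: pydantic 2.5.2, openai 1.6.1, peft 0.7.1,
# trl 0.7.4, transformers 4.36.2.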