From 9345545f1d28962602f3e4e568779b7011c0ee23 Mon Sep 17 00:00:00 2001 From: llauraa23 Date: Mon, 9 Oct 2023 01:19:28 -0700 Subject: [PATCH] generate question and answer pairs from understanding of text --- .../immigration_gen_data2.ipynb | 587 ++++++++++++++++++ example/data_generation/umich.txt | 27 + 2 files changed, 614 insertions(+) create mode 100644 example/data_generation/immigration_gen_data2.ipynb create mode 100644 example/data_generation/umich.txt diff --git a/example/data_generation/immigration_gen_data2.ipynb b/example/data_generation/immigration_gen_data2.ipynb new file mode 100644 index 0000000..1f66fdb --- /dev/null +++ b/example/data_generation/immigration_gen_data2.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "74b21eea-679e-4bfe-9b4f-919597ca7413", + "metadata": {}, + "source": [ + "# Generate question and answer pairs from text file" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "730c285c-af52-4ba4-8cb2-1a9e468af547", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: openai in /opt/conda/envs/pykoi/lib/python3.10/site-packages (0.27.8)\n", + "Requirement already satisfied: requests>=2.20 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from openai) (2.31.0)\n", + "Requirement already satisfied: tqdm in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from openai) (4.65.0)\n", + "Requirement already satisfied: aiohttp in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from openai) (3.8.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests>=2.20->openai) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests>=2.20->openai) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from 
requests>=2.20->openai) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests>=2.20->openai) (2023.7.22)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (23.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (4.0.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->openai) (1.3.1)\n", + "Requirement already satisfied: clean-text in /opt/conda/envs/pykoi/lib/python3.10/site-packages (0.6.0)\n", + "Requirement already satisfied: emoji<2.0.0,>=1.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from clean-text) (1.7.0)\n", + "Requirement already satisfied: ftfy<7.0,>=6.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from clean-text) (6.1.1)\n", + "Requirement already satisfied: wcwidth>=0.2.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ftfy<7.0,>=6.0->clean-text) (0.2.6)\n" + ] + } + ], + "source": [ + "! pip install openai\n", + "! 
pip install clean-text\n", + "\n", + "import dataclasses\n", + "import logging\n", + "import math\n", + "import ast\n", + "import re\n", + "import os\n", + "import io\n", + "import sys\n", + "import time\n", + "import json\n", + "import tqdm\n", + "import copy\n", + "import openai\n", + "import pandas as pd\n", + "\n", + "from typing import Optional, Sequence, Union\n", + "from cleantext import clean\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import SnowballStemmer\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "# from openai import openai_object\n", + "openai.api_key = os.environ[\"OPENAI_API_KEY\"]  # read the key from the environment; never commit secrets to the notebook" + ] + }, + { + "cell_type": "markdown", + "id": "086d6968-4601-4c0c-8a1b-11436bc66c3e", + "metadata": {}, + "source": [ + "## Load and preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a531961f-2e59-4437-b5b8-5a0798cd60bd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "151b55a5-7ca4-404f-94ac-8c39a0bdaa12", + "metadata": {}, + "outputs": [], + "source": [ + "dir_cur = os.getcwd()\n", + "fname = \"umich.txt\"\n", + "with open(os.path.join(dir_cur, fname), \"r\", encoding=\"utf-8\") as file:  # explicit encoding: the text contains curly quotes\n", + " context = file.read()\n" + ] + }, + { + "cell_type": "markdown", + "id": "d28204cb-6112-4118-8564-5f3db55364af", + "metadata": {}, + "source": [ + "#### Personally Identifiable Information (PII) removal and other preprocessing using cleantext" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "5d61054c-3a80-4bec-bc79-b6a83a411094", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "f_clean = lambda context_raw : clean(context_raw,\n", + " fix_unicode=True, # fix various unicode errors\n", + " to_ascii=True, # transliterate to closest ASCII representation\n", + " lower=True, # lowercase text\n", + " no_line_breaks=False, # fully strip line breaks as opposed to only normalizing them\n", + "
no_urls=False, # replace all URLs with a special token\n", + " no_emails=False, # replace all email addresses with a special token\n", + " no_phone_numbers=False, # replace all phone numbers with a special token\n", + " no_numbers=False, # replace all numbers with a special token\n", + " no_digits=False, # replace all digits with a special token\n", + " no_currency_symbols=False, # replace all currency symbols with a special token\n", + " no_punct=False, # remove punctuations\n", + " replace_with_punct=\"\", # instead of removing punctuations you may replace them\n", + " replace_with_url=\"\",\n", + " replace_with_email=\"\",\n", + " replace_with_phone_number=\"\",\n", + " replace_with_number=\"\",\n", + " replace_with_digit=\"0\",\n", + " replace_with_currency_symbol=\"\",\n", + " lang=\"en\" # set to 'de' for German special handling\n", + ")\n", + "pii_remove = False\n", + "if pii_remove:\n", + " context = f_clean(context)\n" + ] + }, + { + "cell_type": "markdown", + "id": "58796bea-3004-4ef8-9ea1-daf2fd5f3fdc", + "metadata": {}, + "source": [ + "#### Dataset-specific cleanup and splitting into a list of paragraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "32865211-51a8-42f5-94bc-63dc78bc6a0d", + "metadata": {}, + "outputs": [], + "source": [ + "context = context.lower() # Lowercase \n", + "context = context.strip() # Remove leading/trailing whitespace\n", + "context = re.sub(r'[ \\t]+', ' ', context) # Remove extra space and tabs while MAINTAINING NEW LINE CHARACTERS\n", + "context = re.compile('<.*?>').sub('', context) # Remove HTML tags/markups:\n", + "\n", + "paragraphs = re.split(r'\\n{2,}', context) # split it into paragraphs where there are 2+ consecutive newline characters\n", + "para_len_l = len(paragraphs)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "164e5080-5023-491a-ab53-82269aeaf7ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['green card application process\\nthere are
essentially three steps in the employment-based green card application process:',\n", + " '1. labor certification (perm)\\nwith limited exceptions, all eb-2 and eb-3 green card applications require that the employer obtain a labor certification from the u.s. department of labor. for petitions requiring this step, the labor certification process is often the hardest and most arduous step. prior to being able to file the labor certification application, the employer must obtain a prevailing wage from the department of labor and prove that there are no minimally qualified u.s. workers available for the positions through the completion of a competitive recruitment process.',\n", + " 'in the case of positions that contain teaching duties, the employer must document that the selected applicant is the “best qualified” for the position. this process is commonly called “special handling.”',\n", + " 'in both the “basic” and the “special handling” process, the employer must complete a formal recruitment process to document that there are no minimally qualified u.s. workers available or that, in the case of positions that have a teaching component, that the selected candidate is the best qualified. it is common that this recruitment process must be completed well after the foreign national employee started his or her position at the university.',\n", + " 'as soon as the labor certification has been filed with the department of labor, the “priority date” for the applicant is established. this date is important to determine when someone can complete step #3, i.e. the adjustment of status. (if no labor certification is required, the priority date is established with the filing of the immigrant petition/ form i-140.',\n", + " '2. immigrant petition\\nonce the department of labor approves the labor certification, the immigrant petition (form i-140) can be filed with uscis. in cases where no labor certification is required (e.g. 
eb-1), the filing of the i-140 is the first step of the green card process.',\n", + " '3. adjustment of status or obtaining an immigrant visa\\nonce the i-140 application has been approved by uscis, the foreign national can apply for the adjustment of his or her non-immigrant status (form i-485) to that of a legal permanent resident. instead of applying for the adjustment of status, a foreign national may also apply for an immigrant visa at a u.s. consulate or embassy abroad.',\n", + " 'the i-485 adjustment of status application cannot be filed until and unless the “priority date” is current. in practice this means that, depending on one’s country of birth and eb-category, there may be a backlog. the backlog exists because more people apply for green cards in a given category than there are available green card visa numbers. the total number of green cards is further restricted by the fact that, with some exceptions, no more than seven percent of all green cards in a given preference category can go to individuals born in a given country. the backlog is updated each month by the u.s. department of state and is published in the visa bulletin.',\n", + " 'once someone’s priority date date has been reached, as indicated in the visa bulletin, the i-485 can be filed. the priority date is the date on which the labor certification was filed with the department of labor, or, if no labor certification was required, uscis received the i-140 petition.',\n", + " 'note that the visa bulletin contains two separate tables with priority cut-off dates. the actual cut-off dates are indicated in table a “application final action dates for employment-based preference cases.” however, in some instances, uscis may accept the i-485 application if the priority date is current based on table b “dates for filing of employment-based visa applications.” note that uscis will make a determination whether table b may be used several days after the official visa bulletin is published. 
uscis publishes this information on its website dedicated to the visa bulletin.',\n", + " 'in some cases, it may be possible to file the i-140 and i-485 at the same time. this is not always recommended, even if it is possible. if the i-140 is denied, the i-485 will also be denied if filed concurrently.',\n", + " 'whereas the international center will file both the labor certification and i-140 (unless the petition as a whole is assigned to retained counsel), the adjustment of status application and related applications (advance parole and ead) are filed by retained counsel only.']" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paragraphs" + ] + }, + { + "cell_type": "markdown", + "id": "966e408e-ce5a-486d-a47b-da2b4e94ab3c", + "metadata": {}, + "source": [ + "## Question answer generation with lmqg" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "92f7f2c7-6d2a-43f9-a748-de849019dc05", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: lmqg in /opt/conda/envs/pykoi/lib/python3.10/site-packages (0.1.1)\n", + "Requirement already satisfied: psutil in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (5.9.5)\n", + "Requirement already satisfied: pytextrank in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.2.5)\n", + "Requirement already satisfied: torch in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.0.1)\n", + "Requirement already satisfied: tqdm in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (4.65.0)\n", + "Requirement already satisfied: requests in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.31.0)\n", + "Requirement already satisfied: pandas in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.0.3)\n", + "Requirement already satisfied: numpy in /opt/conda/envs/pykoi/lib/python3.10/site-packages 
(from lmqg) (1.25.2)\n", + "Requirement already satisfied: transformers>=4.26.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (4.31.0)\n", + "Requirement already satisfied: huggingface-hub>=0.12.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.16.4)\n", + "Requirement already satisfied: sentencepiece in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.1.99)\n", + "Requirement already satisfied: datasets in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.13.1)\n", + "Requirement already satisfied: spacy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.7.0)\n", + "Requirement already satisfied: sudachipy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.6.7)\n", + "Requirement already satisfied: sudachidict-core in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (20230927)\n", + "Requirement already satisfied: bert-score in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.3.13)\n", + "Requirement already satisfied: pyemd in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (1.0.0)\n", + "Requirement already satisfied: evaluate in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.4.0)\n", + "Requirement already satisfied: wandb in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.15.11)\n", + "Requirement already satisfied: ray in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (2.7.0)\n", + "Requirement already satisfied: nltk in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (3.8.1)\n", + "Requirement already satisfied: accelerate in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from lmqg) (0.21.0)\n", + "Requirement already satisfied: filelock in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.12.0->lmqg) (3.12.2)\n", + "Requirement already satisfied: fsspec in /opt/conda/envs/pykoi/lib/python3.10/site-packages 
(from huggingface-hub>=0.12.0->lmqg) (2023.6.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.12.0->lmqg) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.12.0->lmqg) (4.7.1)\n", + "Requirement already satisfied: packaging>=20.9 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.12.0->lmqg) (23.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from transformers>=4.26.1->lmqg) (2023.6.3)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from transformers>=4.26.1->lmqg) (0.13.3)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from transformers>=4.26.1->lmqg) (0.3.1)\n", + "Requirement already satisfied: sympy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (1.12)\n", + "Requirement already satisfied: networkx in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (3.1)\n", + "Requirement already satisfied: jinja2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (3.1.2)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (11.7.99)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (11.7.99)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (11.7.101)\n", + "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (8.5.0.96)\n", + "Requirement 
already satisfied: nvidia-cublas-cu11==11.10.3.66 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (11.10.3.66)\n", + "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (10.9.0.58)\n", + "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (10.2.10.91)\n", + "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (11.4.0.1)\n", + "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (11.7.4.91)\n", + "Requirement already satisfied: nvidia-nccl-cu11==2.14.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (2.14.3)\n", + "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (11.7.91)\n", + "Requirement already satisfied: triton==2.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch->lmqg) (2.0.0)\n", + "Requirement already satisfied: setuptools in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch->lmqg) (68.0.0)\n", + "Requirement already satisfied: wheel in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch->lmqg) (0.41.1)\n", + "Requirement already satisfied: cmake in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from triton==2.0.0->torch->lmqg) (3.27.1)\n", + "Requirement already satisfied: lit in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from triton==2.0.0->torch->lmqg) (16.0.6)\n", + "Requirement already satisfied: matplotlib in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from bert-score->lmqg) (3.8.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in 
/opt/conda/envs/pykoi/lib/python3.10/site-packages (from pandas->lmqg) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from pandas->lmqg) (2023.3)\n", + "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from pandas->lmqg) (2023.3)\n", + "Requirement already satisfied: pyarrow>=8.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from datasets->lmqg) (12.0.1)\n", + "Requirement already satisfied: dill<0.3.7,>=0.3.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from datasets->lmqg) (0.3.6)\n", + "Requirement already satisfied: xxhash in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from datasets->lmqg) (3.3.0)\n", + "Requirement already satisfied: multiprocess in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from datasets->lmqg) (0.70.14)\n", + "Requirement already satisfied: aiohttp in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from datasets->lmqg) (3.8.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests->lmqg) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests->lmqg) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests->lmqg) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests->lmqg) (2023.7.22)\n", + "Requirement already satisfied: responses<0.19 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from evaluate->lmqg) (0.18.0)\n", + "Requirement already satisfied: click in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->lmqg) (8.1.6)\n", + "Requirement already satisfied: joblib in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->lmqg) (1.3.1)\n", + "Requirement 
already satisfied: graphviz>=0.13 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from pytextrank->lmqg) (0.20.1)\n", + "Requirement already satisfied: icecream>=2.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from pytextrank->lmqg) (2.1.3)\n", + "Requirement already satisfied: pygments>=2.7.4 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from pytextrank->lmqg) (2.16.1)\n", + "Requirement already satisfied: scipy>=1.7 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from pytextrank->lmqg) (1.11.1)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (1.0.10)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (2.0.8)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (3.0.9)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.1.8 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (8.2.1)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (1.1.2)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (2.4.8)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (2.0.10)\n", + "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (0.3.1)\n", + "Requirement already satisfied: 
typer<0.10.0,>=0.3.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (0.7.0)\n", + "Requirement already satisfied: pathy>=0.10.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (0.10.2)\n", + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (6.4.0)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (1.10.11)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy->lmqg) (3.3.0)\n", + "Requirement already satisfied: jsonschema in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ray->lmqg) (4.19.0)\n", + "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ray->lmqg) (1.0.5)\n", + "Requirement already satisfied: protobuf!=3.19.5,>=3.15.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ray->lmqg) (4.23.4)\n", + "Requirement already satisfied: aiosignal in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ray->lmqg) (1.3.1)\n", + "Requirement already satisfied: frozenlist in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ray->lmqg) (1.4.0)\n", + "Requirement already satisfied: tensorboardX>=1.9 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from ray->lmqg) (2.6.2.2)\n", + "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from wandb->lmqg) (3.1.37)\n", + "Requirement already satisfied: sentry-sdk>=1.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from wandb->lmqg) (1.31.0)\n", + "Requirement already satisfied: docker-pycreds>=0.4.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from wandb->lmqg) (0.4.0)\n", + "Requirement already satisfied: pathtools in 
/opt/conda/envs/pykoi/lib/python3.10/site-packages (from wandb->lmqg) (0.1.2)\n", + "Requirement already satisfied: setproctitle in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from wandb->lmqg) (1.3.2)\n", + "Requirement already satisfied: appdirs>=1.4.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from wandb->lmqg) (1.4.4)\n", + "Requirement already satisfied: six>=1.4.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from docker-pycreds>=0.4.0->wandb->lmqg) (1.16.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->datasets->lmqg) (23.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->datasets->lmqg) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->datasets->lmqg) (4.0.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from aiohttp->datasets->lmqg) (1.9.2)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb->lmqg) (4.0.10)\n", + "Requirement already satisfied: colorama>=0.3.9 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from icecream>=2.1->pytextrank->lmqg) (0.4.6)\n", + "Requirement already satisfied: executing>=0.3.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from icecream>=2.1->pytextrank->lmqg) (1.2.0)\n", + "Requirement already satisfied: asttokens>=2.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from icecream>=2.1->pytextrank->lmqg) (2.2.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from matplotlib->bert-score->lmqg) (1.1.1)\n", + "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from 
matplotlib->bert-score->lmqg) (0.12.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from matplotlib->bert-score->lmqg) (4.43.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from matplotlib->bert-score->lmqg) (1.4.5)\n", + "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from matplotlib->bert-score->lmqg) (10.0.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from matplotlib->bert-score->lmqg) (3.1.1)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from thinc<8.3.0,>=8.1.8->spacy->lmqg) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from thinc<8.3.0,>=8.1.8->spacy->lmqg) (0.1.3)\n", + "Requirement already satisfied: cloudpathlib<0.16.0,>=0.7.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from weasel<0.4.0,>=0.1.0->spacy->lmqg) (0.15.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from jinja2->torch->lmqg) (2.1.3)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from jsonschema->ray->lmqg) (2023.7.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from jsonschema->ray->lmqg) (0.30.2)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from jsonschema->ray->lmqg) (0.9.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sympy->torch->lmqg) (1.3.0)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages 
(from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb->lmqg) (5.0.1)\n", + "Collecting en-core-web-sm==3.7.0\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m97.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from en-core-web-sm==3.7.0) (3.7.0)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (1.0.10)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (2.0.8)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (3.0.9)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.1.8 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (8.2.1)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (1.1.2)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from 
spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (2.4.8)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (2.0.10)\n", + "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (0.3.1)\n", + "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (0.7.0)\n", + "Requirement already satisfied: pathy>=0.10.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (0.10.2)\n", + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (6.4.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (4.65.0)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (2.31.0)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (1.10.11)\n", + "Requirement already satisfied: jinja2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (3.1.2)\n", + "Requirement already satisfied: setuptools in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (68.0.0)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (23.1)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in 
/opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (3.3.0)\n", + "Requirement already satisfied: numpy>=1.19.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (1.25.2)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (4.7.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (2023.7.22)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from thinc<8.3.0,>=8.1.8->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from thinc<8.3.0,>=8.1.8->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (0.1.3)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (8.1.6)\n", + "Requirement already satisfied: cloudpathlib<0.16.0,>=0.7.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from 
weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (0.15.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.0->en-core-web-sm==3.7.0) (2.1.3)\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_sm')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/pykoi/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1714: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/opt/conda/envs/pykoi/lib/python3.10/site-packages/transformers/modeling_utils.py:2193: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 1683.41it/s]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 1874.40it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[('How many steps are there in the employment-based green card application '\n", + " 'process?',\n", + " 'three')],\n", + " [('What is the hardest and most arduous step for petitions requiring this '\n", + " 'step?',\n", + " 'labor certification'),\n", + " ('What is often the hardest and most arduous step for petitions requiring '\n", + " 'this step?',\n", + " 'labor certification process'),\n", + " ('What must an employer obtain before being able to file a labor '\n", + " 'certification application?',\n", + " 'prevailing wage')],\n", + " [('What is special handling?',\n", + " 'the employer must document that the selected applicant is the “best '\n", + " 'qualified”'),\n", + " ('What is the process 
called when an employer must document that an '\n", + " 'applicant is the best qualified for a position?',\n", + " 'special handling.')],\n", + " [('What does the \"basic\" and \"special handling\" process require?',\n", + " 'the employer must complete a formal recruitment process to document that '\n", + " 'there are no minimally qualified u.s. workers available'),\n", + " ('When must the formal recruitment process be completed?',\n", + " 'well after the foreign national employee started his or her position at '\n", + " 'the university.')],\n", + " [('What is filed with the department of labor?', 'labor certification'),\n", + " ('Why is the priority date important?',\n", + " 'to determine when someone can complete step #3, i.e. the adjustment of '\n", + " 'status.'),\n", + " ('When is the priority date established?',\n", + " 'the filing of the immigrant petition/ form i-140.')],\n", + " [('What can be filed with uscis once the department of labor approves the '\n", + " 'labor certification?',\n", + " 'immigrant petition'),\n", + " ('When is the filing of the i-140 the first step of the green card process?',\n", + " 'in cases where no labor certification is required (e.g. eb-1), the filing '\n", + " 'of the i-140 is the first step of the green card process.')],\n", + " [('What can a foreign national apply for once the i-140 application has been '\n", + " 'approved by uscis?',\n", + " 'adjustment of status or obtaining an immigrant visa'),\n", + " ('How can a foreign national apply for an immigrant visa instead of applying '\n", + " 'for the adjustment of status?',\n", + " 'a foreign national may also apply for an immigrant visa at a u.s. 
'\n", + " 'consulate or embassy abroad.')],\n", + " [('What cannot be filed until and unless the \"priority date\" is current?',\n", + " 'the i-485 adjustment of status application'),\n", + " ('What can cause a backlog in the i-485 adjustment of status application?',\n", + " 'depending on one’s country of birth and eb-category, there may be a '\n", + " 'backlog.'),\n", + " ('Why is there a backlog in the i-485 adjustment of status application?',\n", + " 'more people apply for green cards in a given category than there are '\n", + " 'available green card visa numbers.'),\n", + " ('What percentage of all green cards in a given preference category can go '\n", + " 'to individuals born in a given country?',\n", + " 'no more than seven percent'),\n", + " ('Who updates the backlog of green cards each month?',\n", + " 'the u.s. department of state')],\n", + " [(\"What can be filed once a person's priority date date has been reached?\",\n", + " 'i-485'),\n", + " ('What is the priority date?',\n", + " 'the date on which the labor certification was filed with the department of '\n", + " 'labor, or, if no labor certification was required, uscis received the '\n", + " 'i-140 petition.')],\n", + " [('How many tables does the visa bulletin contain?',\n", + " 'two separate tables with priority cut-off dates.'),\n", + " ('What are the actual cut-off dates in the visa bulletin?',\n", + " 'the actual cut-off dates are indicated in table a “application final '\n", + " 'action dates for employment-based preference cases.”'),\n", + " ('What is the name of the table that may be used by uscis to accept the '\n", + " 'i-485 application?',\n", + " 'in some instances, uscis may accept the i-485 application if the priority '\n", + " 'date is current based on table b'),\n", + " ('How long after the official visa bulletin is the information published?',\n", + " 'uscis will make a determination whether table b may be used several days '\n", + " 'after the official visa bulletin is 
published.'),\n", + " ('Where does uscis publish this information?',\n", + " 'on its website dedicated to the visa bulletin.')],\n", + " [('In some cases, it may be possible to file the i-140 and i-485 at the same '\n", + " 'time?',\n", + " 'i-140 and i-485'),\n", + " ('Is it always recommended to file the i-140 and i-485 at the same time?',\n", + " 'not always'),\n", + " ('If the i-140 is denied, the i-485 will also be denied?',\n", + " 'if filed concurrently.')],\n", + " [('What will the international center file?',\n", + " 'labor certification and i-140')]]\n" + ] + } + ], + "source": [ + "! pip install lmqg\n", + "from pprint import pprint\n", + "from lmqg import TransformersQG\n", + "\n", + "# Download the en_core_web_sm model explicitly \n", + "! python -m spacy download en_core_web_sm # spacy is a counterpart of nltk\n", + "\n", + "# initialize model\n", + "model = TransformersQG(model='lmqg/t5-base-squad-qg-ae', max_length=1024) # max length of a paragraph \n", + "# paragraph to generate pairs of question and answer\n", + "\n", + "context = context\n", + "\n", + "question_answer = model.generate_qa(paragraphs)\n", + "# the output is a list of tuple (question, answer)\n", + "pprint(question_answer)\n", + "# [\n", + "# ('Who was an English painter who specialised in watercolour landscapes?', 'William Turner'),\n", + "# ('What is William Turner often known as?', 'William Turner of Oxford or just Turner of Oxford'),\n", + "# (\"What did many of Turner's paintings depict?\", 'the countryside around Oxford'),\n", + "# (\"What is one of Turner's best known pictures?\", 'a view of the city of Oxford from Hinksey Hill')\n", + "# ]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "19e1e276-3def-4980-96fd-91a7ef9dbd4f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For 12 paragraphs, [1, 3, 2, 2, 3, 2, 2, 5, 2, 5, 3, 1] questions are generated respectively\n" + ] + } + ], + "source": [ + " print 
(f\"For {len(paragraphs)} paragraphs, {[len(qas) for qas in question_answer]} questions are generated respectively\")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "b1bf02e9-281f-4cbf-bd50-6dfbc06bd2cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "31" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "QApair_flat = [qa for qas in question_answer for qa in qas]\n", + "len(QApair_flat)" + ] + }, + { + "cell_type": "markdown", + "id": "06d8d238-8681-4951-87b0-c35375005d6e", + "metadata": {}, + "source": [ + "## Write generated question answer pairs to csv file" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "9393d02a-baa8-41df-a8ec-5523e0cfc371", + "metadata": {}, + "outputs": [], + "source": [ + "QApair_df = pd.DataFrame(QApair_flat, columns=[\"Qustion\", \"Answer\"])\n", + "dir_cur = os.getcwd()\n", + "QApair_df.to_csv(os.path.join(dir_cur, \"output_qa_augment.csv\"), index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/example/data_generation/umich.txt b/example/data_generation/umich.txt new file mode 100644 index 0000000..be9abd7 --- /dev/null +++ b/example/data_generation/umich.txt @@ -0,0 +1,27 @@ +Green Card Application Process +There are essentially three steps in the employment-based green card application process: + +1. Labor Certification (PERM) +With limited exceptions, all EB-2 and EB-3 green card applications require that the employer obtain a Labor Certification from the U.S. Department of Labor. 
For petitions requiring this step, the Labor Certification process is often the hardest and most arduous step. Prior to being able to file the Labor Certification application, the employer must obtain a prevailing wage from the Department of Labor and prove that there are no minimally qualified U.S. workers available for the positions through the completion of a competitive recruitment process. + +In the case of positions that contain teaching duties, the employer must document that the selected applicant is the “best qualified” for the position. This process is commonly called “Special Handling.” + +In both the “basic” and the “special handling” process, the employer must complete a formal recruitment process to document that there are no minimally qualified U.S. workers available or, in the case of positions that have a teaching component, that the selected candidate is the best qualified. It is common that this recruitment process must be completed well after the foreign national employee started his or her position at the University. + +As soon as the Labor Certification has been filed with the Department of Labor, the “priority date” for the applicant is established. This date is important to determine when someone can complete step #3, i.e. the Adjustment of Status. (If no Labor Certification is required, the priority date is established with the filing of the Immigrant Petition/ Form I-140.) + +2. Immigrant Petition +Once the Department of Labor approves the Labor Certification, the Immigrant Petition (Form I-140) can be filed with USCIS. In cases where no Labor Certification is required (e.g. EB-1), the filing of the I-140 is the first step of the green card process. + +3. Adjustment of Status or Obtaining an Immigrant Visa +Once the I-140 application has been approved by USCIS, the foreign national can apply for the adjustment of his or her non-immigrant status (Form I-485) to that of a legal permanent resident. 
Instead of applying for the Adjustment of Status, a foreign national may also apply for an immigrant visa at a U.S. consulate or embassy abroad. + +The I-485 Adjustment of Status application cannot be filed until and unless the “priority date” is current. In practice this means that, depending on one’s country of birth and EB-category, there may be a backlog. The backlog exists because more people apply for green cards in a given category than there are available green card visa numbers. The total number of green cards is further restricted by the fact that, with some exceptions, no more than seven percent of all green cards in a given preference category can go to individuals born in a given country. The backlog is updated each month by the U.S. Department of State and is published in the Visa Bulletin. + +Once someone’s priority date has been reached, as indicated in the Visa Bulletin, the I-485 can be filed. The priority date is the date on which the Labor Certification was filed with the Department of Labor, or, if no Labor Certification was required, USCIS received the I-140 petition. + +Note that the Visa Bulletin contains two separate tables with priority cut-off dates. The actual cut-off dates are indicated in table A “Application Final Action Dates for Employment-based Preference Cases.” However, in some instances, USCIS may accept the I-485 application if the priority date is current based on table B “Dates for Filing of Employment-based Visa Applications.” Note that USCIS will make a determination whether Table B may be used several days after the official Visa Bulletin is published. USCIS publishes this information on its website dedicated to the Visa Bulletin. + +In some cases, it may be possible to file the I-140 and I-485 at the same time. This is not always recommended, even if it is possible. If the I-140 is denied, the I-485 will also be denied if filed concurrently. 
+ +Whereas the International Center will file both the Labor Certification and I-140 (unless the petition as a whole is assigned to retained counsel), the Adjustment of Status application and related applications (Advance Parole and EAD) are filed by retained counsel only.