273 rows × 2 columns
\n", + "276 rows × 2 columns
\n", "" ], "text/plain": [ " _question \\\n", - "0 Can I begin the green card process if my H-1B ... \n", - "1 If I have a master's degree in engineering man... \n", - "2 If I have a master's degree in engineering man... \n", - "3 What is the eligibility and process for upgrad... \n", - "4 Can you explain the process for changing from ... \n", + "0 Do I need to wait before beginning the green c... \n", + "1 Can I start the green card process if my emplo... \n", + "2 If my employer files an H-1B and it is selecte... \n", + "3 What is the process for upgrading from EB-3 to... \n", + "4 What is involved in the EB-3 to EB-2 porting p... \n", ".. ... \n", - "268 What are the chances of my wife obtaining EB2 ... \n", - "269 Is it possible for my wife to get EB2 processi... \n", - "270 Do I have the qualifications to apply for an E... \n", - "271 If I have 3 years of IT experience and am work... \n", - "272 If I have 3 years of experience in IT and am f... \n", + "271 Does my wife qualify for EB2 processing as an ... \n", + "272 What is the process for my wife to apply for E... \n", + "273 Can my wife use her occupational therapist deg... \n", + "274 I have 3 years of experience in the IT field a... \n", + "275 I have 3 years of experience in IT and am abou... \n", "\n", " _answer \n", - "0 Yes, your employer can initiate the green card... \n", - "1 Yes, the green card process can be initiated b... \n", - "2 Yes, the employer can begin the green card pro... \n", - "3 It is possible to upgrade from EB-3 to EB-2 st... \n", - "4 If the PERM was originally filed as an EB-2, i... \n", + "0 No, the employer can initiate the green card p... \n", + "1 Yes, the employer can begin the green card pro... \n", + "2 The employer can initiate the green card proce... \n", + "3 It is possible to upgrade from EB-3 to EB-2 as... \n", + "4 If the PERM was submitted as an EB-2, you can ... \n", ".. ... \n", - "268 Occupational therapists do not have a special ... 
\n", - "269 Occupational therapists do not have a separate... \n", - "270 Yes, you should have the qualifications to app... \n", - "271 Yes, you should be able to apply for an EB-2 g... \n", - "272 Yes, you should qualify for an EB-2 green card... \n", + "271 OT's do not have a special category like physi... \n", + "272 OT's do not have a special pathway like physic... \n", + "273 OT's do not have a dedicated category like phy... \n", + "274 I understand you are inquiring if you can appl... \n", + "275 I'm assuming you are asking if you can submit ... \n", "\n", - "[273 rows x 2 columns]" + "[276 rows x 2 columns]" ] }, - "execution_count": 20, + "execution_count": 129, "metadata": {}, "output_type": "execute_result" } @@ -4381,7 +4222,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 130, "id": "a81743fb-966a-4c40-8e32-3bd2f7f7ee4d", "metadata": {}, "outputs": [], @@ -4406,7 +4247,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/example/rlhf/demo_supervised_finetuning.ipynb b/example/rlhf/demo_pre_training.ipynb similarity index 99% rename from example/rlhf/demo_supervised_finetuning.ipynb rename to example/rlhf/demo_pre_training.ipynb index 476aa34..6082ffe 100644 --- a/example/rlhf/demo_supervised_finetuning.ipynb +++ b/example/rlhf/demo_pre_training.ipynb @@ -41,7 +41,7 @@ "source": [ "from pykoi.chat import QuestionAnswerDatabase\n", "from pykoi.rlhf import RLHFConfig\n", - "from pykoi.rlhf import SupervisedFinetuning" + "from pykoi.rlhf import PreTraining" ] }, { @@ -762,9 +762,9 @@ } ], "source": [ - "# run supervised finetuning\n", + "# run pre-training\n", "config = RLHFConfig(base_model_path=\"elinas/llama-7b-hf-transformers-4.29\", dataset_type=\"local_db\")\n", - "rlhf_step1_sft = SupervisedFinetuning(config)\n", + "rlhf_step1_sft = PreTraining(config)\n", 
"rlhf_step1_sft.train_and_save(\"./models/rlhf_step1_sft\")\n" ] }, diff --git a/example/rlhf/demo_supervised_finetuning_nike.py b/example/rlhf/demo_pre_training_nike.py similarity index 85% rename from example/rlhf/demo_supervised_finetuning_nike.py rename to example/rlhf/demo_pre_training_nike.py index 484886b..3b67405 100644 --- a/example/rlhf/demo_supervised_finetuning_nike.py +++ b/example/rlhf/demo_pre_training_nike.py @@ -1,11 +1,11 @@ -"""Demo for the supervised fine tuning. +"""Demo for the pre-training. -python -m example.rlhf.demo_supervised_finetuning_nike +python -m example.rlhf.demo_pre_training_nike """ from peft import LoraConfig, TaskType -from pykoi.rlhf import RLHFConfig, SupervisedFinetuning +from pykoi.rlhf import RLHFConfig, PreTraining base_model_path = "meta-llama/Llama-2-7b-chat-hf" dataset_name = "./output_self_instructed_data_nike_10k_2023_FULL.csv" @@ -38,7 +38,7 @@ ) -# run supervised finetuning +# run pre-training config = RLHFConfig( base_model_path=base_model_path, dataset_type=dataset_type, @@ -56,5 +56,5 @@ size_valid_set=size_valid_set, lora_config_rl=lora_config, ) -rlhf_step1_sft = SupervisedFinetuning(config) +rlhf_step1_sft = PreTraining(config) rlhf_step1_sft.train_and_save(peft_model_path) diff --git a/example/rlhf/demo_supervised_finetuning_d2l_eval.ipynb b/example/rlhf/demo_supervised_finetuning_d2l_eval.ipynb new file mode 100644 index 0000000..c7bd175 --- /dev/null +++ b/example/rlhf/demo_supervised_finetuning_d2l_eval.ipynb @@ -0,0 +1,627 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c9e5fc9d-a419-4f72-95e5-aa7c47554b01", + "metadata": {}, + "source": [ + "## Load fine-tuned model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d78cf816-c6a4-4a9d-90d5-79762107d394", + "metadata": {}, + "outputs": [], + "source": [ + "import sys, os\n", + "BASE_MODEL = \"mistralai/Mistral-7B-Instruct-v0.1\"\n", + "OUTPUT_DIR = os.getcwd()+\"/../../models/rlhf_step1_sft/\"\n", + "device_map=\"auto\"\n" + 
] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "18b63528-6b3a-42d8-afac-d6273d59546c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/pykoi/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from peft import PeftModel, PeftConfig\n", + "config = PeftConfig.from_pretrained(OUTPUT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "00a810f9-9329-4efd-9156-fe36ddf8a432", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import AutoTokenizer\n", + "import sys, os\n", + "sys.path.append(os.getcwd()+\"/../../\")\n", + "from pykoi.chat.llm.instruct_pipeline import END_KEY, INSTRUCTION_KEY, RESPONSE_KEY, INTRO_BLURB\n", + "tokenizer_ft = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n", + "tokenizer_ft.pad_token = tokenizer_ft.eos_token\n", + "tokenizer_ft.add_special_tokens({\"additional_special_tokens\": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY+\"_NL\"]})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5b0d1142-1d5b-40f6-b9b1-676f2a4a2317", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.76s/it]\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "model_ft = AutoModelForCausalLM.from_pretrained( \n", + " config.base_model_name_or_path, \n", + " #load_in_8bit=True,\n", + " 
device_map=device_map,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "94a10174-8790-4f18-9503-2799f5c31dac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Embedding(32003, 4096)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_ft = PeftModel.from_pretrained(\n", + " model_ft, \n", + " OUTPUT_DIR, \n", + " device_map=device_map,\n", + ")\n", + "\n", + "model_ft.resize_token_embeddings(len(tokenizer_ft))" + ] + }, + { + "cell_type": "markdown", + "id": "db2f448d", + "metadata": {}, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "10133dc2", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_FILE = os.getcwd()+'/../../data/chapter22_valfromseed_data_processed.csv'\n", + "OUTPUT_FILE = os.getcwd()+'/../../data/d2lai_selfinstruct_mistral7b-loraft_1123.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "167f752b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import time\n", + "import csv\n", + "\n", + "df_results = pd.read_csv (INPUT_FILE)\n", + "prompt_list = df_results.instruction.values.tolist()\n", + "reference_list = df_results.response.values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "da2ddd19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "660" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "len(prompt_list)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c1a15f2", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "d92b3c10-6d01-44c6-887f-1400edff2305", + "metadata": {}, + "source": [ + "## Create instruct pipeline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "08e7f40d", + "metadata": {}, + "outputs": [ 
+ { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 - What does linear algebra study?1 - Can you explain the concepts of linear algebra?2 - What are the two ways to visualize vectors in geometry?3 - How can vectors be represented using lines and arrows in geometry?4 - How do column vectors differ from row vectors?5 - Can you explain the distinction between column and row vectors?6 - What is the process of representing vectors as points in a three-dimensional space?7 - Can you explain how to graph vectors using coordinate systems and point representations?8 - Can you represent vectors using arrows on a flat surface?9 - How can vectors be depicted as arrows within a two-dimensional space?10 - What is the process of representing vector addition graphically?" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/pykoi/lib/python3.10/site-packages/transformers/pipelines/base.py:1101: UserWarning: You seem to be using the pipelines sequentially on GPU. 
In order to maximize efficiency please use a dataset\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11 - How can vectors be depicted when added together using diagrams?12 - What is the process of representing vector subtraction visually?13 - How can vectors be depicted in a way that shows their difference when they are subtracted?14 - How do you calculate the inner product of two row matrices?15 - What is the formula for finding the product of two vectors with their components aligned vertically?16 - How does the dot product of two vectors relate to their magnitudes and directions in geometry?17 - Can you explain how the dot product of two vectors can be represented graphically using points and lines?18 - How do you calculate the dot product of two vectors using a formula?19 - Can you explain the formula used to determine the dot product between two vectors on page 3?20 - What is the formula to determine the angle between two vectors using their dot product and magnitudes?21 - Can you explain how to compute the angle between two vectors based on their dot product and lengths?22 - What is the significance of determining the angle between two vectors in machine learning?23 - How does calculating the angle between two vectors aid in machine learning processes?24 - What does cosine similarity measure on a scale of -1 to 1?25 - How is cosine similarity calculated between two non-zero vectors?26 - What does \"hyperplane\" mean on page 4?27 - Can you explain what is described as a \"hyperplane\" on page 4?28 - What is the connection between cosine similarity and hyperplanes as described on page 4?29 - On page 4, how is it explained how cosine similarity relates to hyperplanes?30 - What are some widely used deep learning libraries for model development?31 - Which deep learning platforms are commonly employed in creating models?32 - What is the objective of transforming X\\_test?33 - Why do we need to reformat 
X\\_test?34 - What is the number of samples in X\\_test?35 - How many data points are there in X\\_test?36 - What is the predicted value's data type?37 - Can you specify the format of the results produced by predictions?38 - What data type are the predicted values?39 - Can you specify the data type of the output predictions?40 - On page 6, what function is employed to determine precision?41 - According to page 6, which method is utilized to compute correctness?42 - How do you display the test set accuracy using a print statement and an approximate value? (Page 6)43 - What is the syntax for printing out the test set accuracy along with a rough estimate on Page 6?44 - How does the transpose function work in TensorFlow?45 - Can you explain what the transpose operation accomplishes in TensorFlow?46 - What does the \"reduce_sum\" function do in TensorFlow?47 - Can you explain how the \"reduce_sum\" function works in TensorFlow?48 - What does the \"cast\" function do in TensorFlow?49 - Can you explain how to use the \"cast\" function in TensorFlow?50 - What does the \"shape\" attribute represent in a TensorFlow tensor?51 - Can you explain what the \"shape\" attribute signifies when it comes to a TensorFlow tensor?52 - What data type does a TensorFlow tensor have?53 - Can you tell me what the dtype property represents in TensorFlow?54 - What is the relationship between NumPy arrays and TensorFlow tensors?55 - How do NumPy arrays compare to TensorFlow tensors in terms of functionality and usage?56 - What does the \"mean\" function do in calculating accuracy in TensorFlow?57 - How is accuracy calculated using the \"mean\" function in TensorFlow?58 - How does the cast function work in TensorFlow to match the data types of predicted and target labels?59 - Can you explain how the cast function is utilized in TensorFlow to ensure that the predicted values have the same data type as the target labels? 
(Page 6)60 - What is the value of the \"shape\" attribute in the flattened form of X\\_test?61 - Can you tell me what the \"shape\" attribute represents when applied to the flattened version of X\\_test?62 - What is the value of the \"axis\" parameter in the dot product calculation involving the flattened versions of X\\_test and predictions?63 - Can you provide the value of the \"axis\" attribute used in the dot product operation between the flattened forms of X\\_test and predictions?64 - What is the result of multiplying the corresponding elements in the flattened versions of X\\_test and predictions, then adding up all those products?65 - How do you calculate the total value obtained by taking the inner product of the flattened versions of X\\_test and predictions, and then summing up all the resulting values?66 - What does the \"mean\" function do in NumPy when calculating accuracy?67 - How is accuracy calculated using the \"mean\" function in NumPy?68 - How does the `astype` function work in converting predicted values to match the data type of target labels in NumPy?69 - In what way does the `astype` function aid in ensuring that the predicted values and target labels have the same data type when working with NumPy arrays?70 - What is the relationship between NumPy arrays and TensorFlow tensors?71 - How do NumPy arrays compare to TensorFlow tensors in terms of functionality and usage?72 - What does the \"mean\" function do in calculating accuracy in TensorFlow?73 - How is accuracy calculated using the \"mean\" function in TensorFlow?74 - How does the cast function work in TensorFlow to match the data types of predicted and target labels?75 - Can you explain how the cast function is utilized in TensorFlow to ensure that the predicted values have the same data type as the target labels? 
(Page 6)76 - What is the value of the \"shape\" attribute in the flattened form of X\\_test?77 - Can you tell me what the \"shape\" attribute represents when applied to the flattened version of X\\_test?78 - What is the value of the \"axis\" parameter in the dot product calculation involving the flattened versions of X\\_test and predictions?79 - Can you provide the value of the \"axis\" attribute used in the dot product operation between the flattened forms of X\\_test and predictions?80 - What is the position of matrix B in terms of its ranking?81 - Can you determine the order of matrix B based on its rank?82 - What does it mean for a matrix to have full rank?83 - Can you explain what is meant by a full-rank matrix on page 8?84 - Can you provide a definition of determinant on page 9?85 - What does the term \"determinant\" mean according to the information found on page 9?86 - * What does \"tensor contraction\" mean on page 10?87 - * Can you explain what is meant by \"tensor contraction\" as described on page 10?88 - * How do arrays and TensorFlow (tf) notation differ in their representation of tensors?89 - * Can you explain the distinction between using arrays and TensorFlow (tf) notation to represent tensors?90 - What is the equivalent expression of tr(A^4) in Einstein notation for any given matrix A?91 - Can you express tr(A^4) using Einstein notation when dealing with an arbitrary matrix A?92 - What does linear algebra study?93 - Can you explain the concepts of linear algebra?94 - Can you explain what eigendecomposition means?95 - Could you provide a definition for eigendecomposition?96 - What does it mean to determine the eigenvalues of a matrix or linear transformation?97 - Can you explain what identifying the eigenvalues of a system entails?98 - Can you provide the explanation of the matrix $\\mathbf{A}$ on page 4?99 - What does the matrix $\\mathbf{A}$ represent on page 4?100 - What does this code snippet achieve?101 - What is the function of this 
particular section of code?102 - What do eigenvalues represent in computing?103 - Why are eigenvalues important to compute?104 - What is the objective of scaling the matrix A?105 - Why do we need to rescale the matrix A?106 - What is the formula to determine the ratio between successive norms of a matrix?107 - How do you find the relationship between adjacent norms of a matrix?108 - How do eigenvalues relate to singular values in random matrix theory?109 - Can you explain the connection between eigenvalues and singular values when dealing with random matrices?110 - What does the Gershgorin Circle Theorem state on page 7?111 - Can you explain the meaning of the Gershgorin Circle Theorem as presented on page 7?112 - How does the repeated multiplication of matrices behave?113 - What are the properties of raising a matrix to successive powers?114 - What topic is covered in page 8 of section 22.2.7?115 - On which page and in what section is the discussion found about topic 22.2.7?116 - What information is provided in Section 22.2.8 of the text?117 - Can you tell me what is discussed in page 8 of the document under Section 22.2.8?118 - Can you provide me with the exercises for Section 22.2.9 on Page 8?119 - What are the activities assigned for Section 22.2.9 as found on Page 8?120 - What does linear algebra study?121 - Can you explain the concepts of linear algebra?122 - What is the objective of using pre-trained word embeddings?123 - What do pre-trained word embeddings serve as a function to accomplish?124 - What is the process of using approximations during training in word embeddings?125 - Can you explain how word embeddings are trained using approximation techniques?126 - * What sets global vectors apart from subword embeddings?127 - * How do global vectors differ from subword embeddings?128 - What are the differences between Bidirectional Encoder Representations from Transformers (BERT) and Traditional Recurrent Neural Networks (RNNs)?129 - Can you explain how 
Bidirectional Encoder Representations from Transformers (BERT) vary in comparison to Traditional Recurrent Neural Networks (RNNs)?130 - How can BERT be customized for tasks that require both sequence-level and token-level analysis?131 - Can you explain how to adapt BERT for use in sequence-level and token-level applications?132 - What is the difference in notation between L(x + epsilon) and L(x)?133 - How does the addition of epsilon affect the function L(x)?134 - Can you explain what the term \"derivative\" means in mathematics?135 - On page 4, there's a definition of what a derivative is - can you recall it?136 - What is the process for calculating the derivative of an explicitly defined function?137 - How can you determine the rate of change of a given explicit function at any point?138 - What does the sum rule state in calculus?139 - Can you explain the concept of the sum rule in calculus on page 4?140 - What is the formula used to find the derivative of a function in calculus?141 - How do you calculate the slope of a tangent line using the power rule in calculus?142 - What is the formula used to find the derivative of a composite function in calculus?143 - Can you explain how to apply the rule of differentiation when dealing with functions composed of multiple functions in calculus?144 - Can you provide the explanation of the epsilon on page 5?145 - What does the term \"epsilon\" mean according to the information found on page 5?146 - What is the objective of discovering a polynomial function of degree n?147 - What is the aim in determining a polynomial equation of order n?148 - * What is the slope of the tangent line to the function f(x) = x^3 - 4x + 1 at any point x?149 - * How does the rate of change of the function f(x) = x^3 - 4x + 1 change as x changes?150 - * What is the slope of the natural logarithm function when its argument is x?151 - * How does the rate of change of the logarithmic function to base e vary as x changes?152 - Does f(x) have a local 
extrema (maximum or minimum) when its derivative is zero at x?153 - Is it true that if the derivative of f(x) is equal to zero at x, then f(x) will either have a maximum or minimum value at that point?154 - What is the value of x that minimizes f(x) = x*log(x)?155 - At what point does the function f(x) = x*log(x) achieve its lowest possible value?156 - What does linear algebra study?157 - Can you explain the concepts of linear algebra?158 - * What does \"partial derivative\" mean in mathematics?159 - * Can you explain what a \"partial derivative\" is on page 1?160 - How can we calculate an approximation of the loss function when adjustments are made to its parameters?161 - What formula do we use to estimate the loss function when changes are made to its parameters?162 - What does the fundamental algorithm discussed on page 4 serve as?163 - On page 4, what is the intended function of the outlined algorithm?164 - What is the process of updating weights in the gradient descent algorithm?165 - Can you explain how the gradient descent algorithm minimizes the cost function?166 - How does batch normalization differ from layer normalization?167 - Can you explain the distinction between batch normalization and layer normalization?168 - What is the purpose of using dropout regularization in machine learning models?169 - How does dropout regularization help prevent overfitting in neural networks?170 - What does \"early stopping\" refer to in the context of machine learning?171 - Can you explain how early stopping works in the field of artificial intelligence?172 - What does it mean by transfer learning in machine learning and artificial intelligence?173 - Can you explain how transfer learning works in the field of computer science?174 - What does data augmentation involve?175 - Can you explain what data augmentation entails?176 - What does adversarial training involve?177 - How does adversarial training work?178 - What does \"attention mechanism\" refer to on page 4?179 - On 
page 4, what is described as an \"attention mechanism\"?180 - What does a Recurrent Neural Network (RNN) do?181 - Can you explain what a Recurrent Neural Network (RNN) is?182 - What is the objective of calculating partial derivatives?183 - What do we aim to achieve by determining partial derivatives?184 - What is the definition of the chain rule in calculus and how does it relate to partial derivatives? (Page 5)185 - Can you explain how the concept of the chain rule applies when finding partial derivatives in calculus, as presented on page 5?186 - How does the forward pass work in a backpropagation algorithm?187 - What happens during the backward pass of a backpropagation algorithm?188 - What does regularization play a part in when it comes to deep learning models?189 - How does regularization contribute to the functioning of deep learning models?190 - How does batch normalization differ from layer normalization?191 - Can you explain the distinction between batch normalization and layer normalization on page 5?192 - What do CNNs serve as?193 - What are CNNs used for?194 - How do fully connected and convolutional layers differ in their functionality within a neural network?195 - Can you explain the key distinctions between fully connected and convolutional layers in terms of how they process data in a neural network?196 - What do activation functions accomplish in a neural network?197 - How do activation functions contribute to the functioning of neural networks?198 - How does supervised learning differ from unsupervised learning?199 - Can you explain the distinction between supervised and unsupervised learning algorithms?200 - What does transfer learning achieve in deep learning models?201 - How does transfer learning contribute to the effectiveness of deep learning algorithms?202 - How do shallow and deep neural networks differ in terms of their architecture and functionality?203 - Can you explain the key differences between shallow and deep neural networks, 
particularly regarding their ability to learn complex patterns?204 - How does regularization help to prevent overfitting in deep learning models?205 - In what way does regularization play a part in controlling overfitting in deep learning algorithms?206 - How does online learning differ from traditional classroom education?207 - In what ways are online courses different from in-person classes?208 - What is the goal of incorporating reinforcement learning into AI systems?209 - Can you explain how reinforcement learning contributes to the development of intelligent machines?210 - How do you calculate the partial derivative of f in relation to w? (Page 6)211 - Can you provide the equation for determining the partial derivative of f with regard to w on page 6?212 - What is required to calculate the first-order derivative of f with respect to w?213 - How many partial derivatives are necessary to determine the gradient of f with respect to w?214 - What is the objective of using the chain rule in calculus?215 - Why do we use the chain rule when finding derivatives?216 - What equation can be used to determine the optimal quadratic function that closely fits a given data point? (Page 9)217 - How do you calculate the most suitable quadratic function that accurately represents a specific data point using a particular formula? (Page 9)218 - - What does this code do on page 10?219 - - Can you explain the function of the code found on page 10?220 - What is the number of dimensions in a meshgrid? 
(Page 10)221 - What does the `linspace` function do in this specific code block on page 10?222 - Can you explain what purpose the `linspace` function serves within the given code snippet found on page 10?223 - What does \"exp\" represent in this specific code snippet?224 - Can you explain what function \"exp\" serves within the given code block on page 10?225 - - What does the \"constant\" keyword represent on page 10 of the code block?226 - - On page 10 of the code block, what is the function of the variable labeled as a \"constant\"?227 - What does the \"d2l\" variable represent in this code snippet?228 - Can you explain what the role of \"d2l\" is within the given code block?229 - - What does the `plt` module do in this code snippet?230 - - Can you explain what the role of `plt` is within the given code?231 - - What does the image serve as within the given program? (Page 10)232 - - What role does the picture play within the provided code snippet? (Page 10)233 - * What does the `add_subplot` function do in this code snippet?234 - * Can you explain what the role of `add_subplot` is within the provided code block?235 - - What type of projection is being utilized on page 10 of the code?236 - - On page 10 of the code, what kind of projection is employed?237 - What does the `plot_wireframe` function do in this code snippet?238 - Can you explain what the role of `plot_wireframe` is within the given code block?239 - - What does the 'rstride' parameter do on page 10 of the code block?240 - - On page 10 of the code block, what is the purpose of the 'rstride' parameter?241 - - What does \"cstride\" represent in the given code snippet on page 10?242 - - On page 10 of the provided code, what value is assigned to the variable \"cstride\"?243 - - What does the 'color' variable represent on page 10 of the code?244 - - On page 10 of the code, what value is assigned to the 'color' parameter?245 - - What does the `xlabel` function do in this code snippet?246 - - Can you explain 
what the role of `xlabel` is within the provided code?247 - - What does the `ylabel` function do in this specific code block on page 10?248 - - Can you explain what the role of `ylabel` is within the given code snippet found on page 10?249 - What does `set_figsize` do in the given code snippet?250 - Can you explain what the function `set_figsize` accomplishes within the provided code block?251 - * What does the `set_xlim` function do in this code snippet?252 - * Can you explain what the `set_xlim()` method accomplishes within the provided code block?253 - * What does the `set_ylim` function do in this code snippet?254 - * Can you explain what the `set_ylim()` method accomplishes within the provided code block?255 - * What is the derivative of $\\mathbf{x}^\\top A \\mathbf{x}$ with respect to $x\\_k$?256 - * How do you find the partial derivative of $\\mathbf{x}^\\top A \\mathbf{x}$ with respect to $x\\_k$?257 - - What does \"n\" represent on page 12?258 - - On page 12, what numerical value is assigned to the variable \"n\"?259 - What is the objective of determining slopes or gradients in multi-dimensional spaces?260 - Why do we need to establish the gradient concept in higher dimensions?261 - How is the backpropagation algorithm structured?262 - What is the organization of the backpropagation algorithm?263 - What does matrix calculus study?264 - What are the applications of matrix calculus?265 - What is the derivative of the function f(x) = beta^T x?266 - What is the derivative of the function g(x) = x^T beta?267 - * What is the slope of the function that represents the L2 norm of a vector as a function of its elements? (Page 13)268 - * How does the magnitude of a vector change when one of its elements changes at constant rate? (Page 13)269 - * What is the slope of the natural logarithm function? (Page 13)270 - * Can you determine the rate of change of the logarithmic function using the base e? 
(Page 13)271 - What are the partial derivatives of the function f(x, y) = x^2y + xy^2 with respect to x and y?272 - How do the partial derivatives of the function f(x, y) = x^2y + xy^2 change as x and y vary?273 - What is the geometrical interpretation of the condition \\(\\nabla f = 0\\)? 274 - Can you explain how to geometrically interpret the condition \\(\\nabla f = 0\\) using g and h?275 - What does linear algebra study?276 - Can you explain the concepts of linear algebra?277 - What are the objectives of using natural language processing (NLP) pre-training methods?278 - Can you explain the aims of employing NLP pre-processing techniques in natural language understanding (NLU)?279 - Which machine learning libraries are widely used?280 - What are some commonly utilized machine learning frameworks?281 - What is the theorem that relates differentiation and integration in calculus?282 - Can you explain the relationship between derivatives and integrals as stated by the Fundamental Theorem of Calculus?283 - What is the role of the Fundamental Theorem of Calculus in resolving integrals?284 - In what way does the Fundamental Theorem of Calculus aid in the process of integration?285 - What is the numerical worth of the indefinite integral of a function evaluated between its limits?286 - How do you calculate the exact sum of an antiderivative when evaluating it at specific bounds?287 - What is the process of determining the total accumulation of change in a function over an interval by integrating it?288 - Can you explain how to find the area between a curve and the x-axis over a given interval through integration?289 - How does flipping a function horizontally affect the calculation of its area under the curve?290 - What is the impact on the area under the curve when a function is flipped horizontally before being calculated?291 - How do you calculate the area under a curve when it can be broken down into several parts using double or triple integrals? 
(Refer to page 5)292 - Can you determine the formula for calculating the total area under a curve that has been divided into numerous sections by employing multiple integration techniques? Please refer to page 5 for guidance.293 - What does Fubini's theorem state about integrals of functions over sets in multiple dimensions?294 - Can you explain how Fubini's theorem relates to double and triple integrals?295 - What is the impact of modifying variables on multiple definite integrals? (Page 6)296 - How do you calculate the value of multiple indefinite integrals when the limits of integration change? (Page 6)297 - What does linear algebra study?298 - Can you explain the concepts of linear algebra?299 - How do discrete and continuous random variables differ from one another?300 - In what ways are discrete and continuous random variables distinct?301 - What is the difference between working with continuous and discrete random variables?302 - Can you explain how the properties of continuous and discrete random variables affect their use in statistical analysis?303 - What is a continuous random variable and can it be explained through an example?304 - How does the concept of continuous random variables relate to examples?305 - What is the likelihood of a dart landing precisely 2 centimeters away from the center of the board?306 - How likely is it for a dart to strike the board at a distance of exactly 2 centimeters from the center?307 - What number of decimal places must be considered to calculate the likelihood that a distance equals 2.00000..308 - To what degree of precision should we consider the distance as being equal to 2.00000..309 - What is the likelihood of selecting a random number whose final digit is either 7 or 3?310 - Can you determine the chance of choosing a random integer with its last digit being either 7 or 3?311 - How many darts landed in the 2.1 cm bucket when 100 darts were thrown?312 - What is the number of darts that struck the 2.1 cm target area 
out of 100 darts thrown?313 - What is the chance of selecting a random point inside interval B if interval A is smaller than B?314 - How likely is it to choose a random point located in interval B when interval A is contained within interval B?315 - What is the chance that a randomly chosen point will be located inside an interval measuring ε units in length surrounding 2?316 - Can you calculate the likelihood of selecting a random point that lies within an interval of ε units centered at 2?317 - What is the likelihood that a randomly chosen point will be located inside an interval measuring ε units in length centered at x?318 - Can you calculate the chance that a randomly picked point would fall within an interval having a length of ε and being situated near x?319 - What does the probability density function represent for a continuous random variable X?320 - Can you explain what the concept of a probability density function is in relation to a continuous random variable X?321 - Can you explain what a probability density function means on page 3?322 - What does it mean when we talk about a probability density function according to the information provided on page 3?323 - What does the term \"cumulative distribution function\" mean in statistics?324 - Can you explain what a cumulative distribution function represents on page 4?325 - How do you calculate the variance of a random variable?326 - Can you provide the formula for determining the variance of a random variable?327 - On page 7 of the article, what code was used to create the plot?328 - Can you tell me which code was employed for plotting on page 7 of the article?329 - How many plots does the article display on page 7?330 - On page 7 of the article, how many plots are depicted?331 - What libraries are utilized to create the graphs on page 7?332 - On page 7, which libraries are employed in producing the visualizations?333 - What does the author intend to convey through the first plot on page 7?334 - On page 7, 
what is the main message or theme that the author wants to communicate through the first plot?335 - What serves as the main objective of the second graph on page 7?336 - On page 7, what does the second diagram aim to achieve?337 - What serves as the main objective of the third graph on page 7?338 - On page 7, what does the third diagram aim to achieve?339 - What serves as the main objective of the fourth graph on page 7?340 - On page 7, what does the function of the fourth chart primarily achieve?341 - What is the purpose of plotting the integrand on Page 8?342 - On Page 8, what method is employed to visualize the necessary integrand for calculating variance?343 - Can you identify the specific function employed to determine the average value of a stochastic variable? (Referring to Page 8)344 - On page 8, what is the name of the mathematical tool utilized to compute the expected value of a random variable?345 - Can you identify the specific function employed to determine the variance of a random variable? (Page 8)346 - What does the formula on Page 8 represent in calculating the variance of a random variable?347 - What is the function that calculates the likelihood of a random variable falling within an epsilon-sized range of a particular value? (Page 8)348 - Can you identify the function employed to determine the chance of a random variable being situated inside an epsilon-sized interval surrounding a specific value? (Page 8)349 - What is the purpose of the function that estimates the likelihood of a random variable falling within an epsilon-near range of a particular value? 
(Page 8)350 - Can you explain what the function does on Page 8 that calculates the approximation of the probability of a random variable being close to a certain value?351 - Can you identify the function that depicts the probability distribution of two random variables being jointly distributed?352 - On page 8, what does the author refer to as the representation of the probabilities of two random variables occurring simultaneously?353 - What is the purpose of the function that calculates the area under a curve or surface in multiple dimensions?354 - Can you explain what the tool is called which computes the total value of a function across a specified range in higher dimensional spaces?355 - Can you tell me what function produces random numbers ranging from 0 up to 1?356 - What does the function do that generates random numbers within the range of 0 to 1?357 - What does the function on page 8 do to calculate the average value of a random variable?358 - On page 8, what is referred to as the method for determining the central tendency of a random variable through calculating its average value?359 - Can you identify the specific function employed to calculate the variance of a random variable? (Page 8)360 - What does the function on Page 8 do in computing the variance of a random variable?361 - What is the function that calculates the likelihood of a random variable falling within an epsilon-sized range of a particular value? (Page 8)362 - Can you identify the function employed to determine the chance of a random variable being situated inside an epsilon-sized interval surrounding a specific value? (Page 8)363 - What is the purpose of the function that estimates the likelihood of a random variable falling within an epsilon-near range of a particular value? 
(Page 8)364 - Can you explain what the function does on Page 8 that calculates the approximation of the probability of a random variable being close to a certain value?365 - Can you identify the function that depicts the probability distribution of two random variables being jointly distributed?366 - On page 8, what does the author refer to as the representation of the probabilities of two random variables occurring simultaneously?367 - What is the purpose of the function that calculates the area under a curve or surface in multiple dimensions?368 - Can you explain what the tool is called which computes the total value of a function across a specified range in higher dimensional spaces?369 - Can you tell me what function produces random numbers ranging from 0 up to 1?370 - What does the function do that generates random numbers within the range of 0 to 1?371 - What does the function on page 8 do to calculate the average value of a random variable?372 - On page 8, what is referred to as the method for determining the central tendency of a random variable through calculating its average value?373 - Can you identify the specific function employed to calculate the variance of a random variable? (Page 8)374 - What does the function on Page 8 do in computing the variance of a random variable?375 - What is the function that calculates the likelihood of a random variable falling within an epsilon-sized range of a particular value? (Page 8)376 - Can you identify the function employed to determine the chance of a random variable being situated inside an epsilon-sized interval surrounding a specific value? (Page 8)377 - What is the purpose of the function that estimates the likelihood of a random variable falling within an epsilon-near range of a particular value? 
(Page 8)378 - Can you explain what the function does on Page 8 that calculates the approximation of the probability of a random variable being close to a certain value?379 - Can you identify the function that depicts the probability distribution of two random variables being jointly distributed?380 - On page 8, what does the author refer to as the representation of the probabilities of two random variables occurring simultaneously?381 - What is the purpose of the function that calculates the area under a curve or surface in multiple dimensions?382 - Can you explain what the tool is called which computes the total value of a function across a specified range in higher dimensional spaces?383 - Can you tell me what function produces random numbers ranging from 0 up to 1?384 - What does the function do that generates random numbers within the range of 0 to 1?385 - What does the function on page 8 do to calculate the average value of a random variable?386 - On page 8, what is referred to as the method for determining the central tendency of a random variable through calculating its average value?387 - How do you calculate the probability mass function (PMF) of a discrete random variable?388 - What is the definition of the cumulative distribution function (CDF) and how can it be used to find the marginal distribution of a continuous random variable?389 - What method is used to determine the relationship between two randomly varying quantities?390 - How do you calculate the correlation coefficient between two probability distributions?391 - - What is the equation used to determine the relationship between two random variables? (Page 10)392 - - How do you calculate the correlation coefficient between two random variables? 
(Page 10)393 - What is the definition of covariance as a statistical measure of the relationship between two random variables?394 - Can you explain how the formula for calculating covariance represents the strength and direction of the linear relationship between two random variables?395 - Does the correlation coefficient provide a comprehensive understanding of the relationship between two random variables?396 - Can the variance help to identify any underlying patterns or trends in the data?397 - What is the difference between covariance and correlation?398 - Can you explain how the concepts of covariance and correlation relate to each other on page 10?399 - - How do you calculate the correlation coefficient? (Page 11)400 - - Can you explain how to determine the correlation coefficient on page 11?401 - * What does Chebyshev's inequality state?402 - * Can you explain Chebyshev's inequality on page 12?403 - What does linear algebra study?404 - Can you explain the concepts of linear algebra?405 - What does the maximum likelihood principle entail in the field of machine learning?406 - In what way does the maximum likelihood principle play a role in machine learning algorithms?407 - What does maximum likelihood estimation mean?408 - Can you explain what the concept of maximum likelihood estimation refers to?409 - What distinguishes log-likelihood from likelihood?410 - How does log-likelihood differ from regular likelihood?411 - How does the computational complexity vary when calculating the derivative of a likelihood function using the product rule?412 - What is the time and space required to compute the derivative of a likelihood function using the product rule in terms of its computational complexity?413 - How many operations does it take to calculate the gradient of the negative log-likelihood function?414 - What is the time required to compute the gradient of the negative log-likelihood function?415 - What is the connection between negative log-likelihood and 
information theory?416 - How does the concept of negative log-likelihood fit into the framework of information theory?417 - What is the reason behind switching from probabilities to densities when dealing with continuous data sets?418 - How does the concept of density allow us to work more effectively with continuous variables compared to probabilities?419 - What does linear algebra study?420 - Can you explain the concepts of linear algebra?421 - Which libraries are commonly utilized in probability within the field of machine learning?422 - What are some popular libraries employed in machine learning that incorporate probability concepts?423 - Can you explain what a Bernoulli random variable represents in probability theory?424 - What does it mean when we say that a random variable follows a Bernoulli distribution?425 - Which are the top three libraries used to develop machine learning models?426 - Can you name the most widely-used libraries for implementing machine learning algorithms?427 - What does the tensor look like on page 4?428 - Can you describe the form of the tensor found on page 4?429 - Can you explain what the Poisson distribution is on page 6?430 - What does the Poisson distribution represent according to the information provided on page 6?431 - What does this code do?432 - What is the function of the given code?433 - What is the PDF (probability density function) of a normal distribution?434 - Can you explain what the probability density function of a Gaussian distribution represents on page 8?435 - What is the process to create a probability density function plot of a Gaussian distribution in Pytorch?436 - Can you explain how to visualize the probability density function of a Gaussian distribution using Pytorch on page 8?437 - What is the process to create a probability density function plot of a Gaussian distribution in MXNet?438 - Can you explain how to visualize the probability density function of a Gaussian distribution using MXNet on page 8?439 
- What is the process to create a probability density function plot of a Gaussian distribution in TensorFlow?440 - Can you explain how to visualize the probability density function of a Gaussian distribution using TensorFlow on page 8?441 - What does the cumulative distribution function represent in a Gaussian distribution?442 - Can you explain how to calculate the cumulative distribution function of a Gaussian distribution?443 - What is the process to create a cumulative distribution function (CDF) plot of a Gaussian distribution in PyTorch? 444 - How can I visualize the CDF of a Gaussian distribution using PyTorch?445 - What is the process to create a cumulative distribution function graph for a Gaussian distribution in MXNet?446 - Can you explain how to visualize the cumulative distribution function of a Gaussian distribution using MXNet on page 8?447 - What is the process to create a cumulative distribution function plot of a Gaussian distribution in TensorFlow?448 - Can you explain how to visualize the cumulative distribution function of a Gaussian distribution using TensorFlow on page 8?449 - What is the probability distribution that maximizes entropy?450 - How can we determine the most random or unbiased probability distribution?451 - What does the central limit theorem state about the distribution of sample means as the sample size increases?452 - Can you explain how the central limit theorem applies to normal distributions and their relationship with sample means?453 - How does mean differ from variance in a Gaussian distribution?454 - Can you explain the distinction between mean and variance in a Gaussian distribution on page 8?455 - What is the process of generating samples from a Gaussian distribution in Pytorch?456 - Can you explain how to use Pytorch to sample data from a Gaussian distribution on page 8?457 - What is the process of generating samples from a Gaussian distribution in MXNet?458 - Can you explain how to use MXNet to sample data from a 
Gaussian distribution on page 8?459 - What is the process of generating samples from a Gaussian distribution in TensorFlow?460 - Can you explain how to use TensorFlow to create random samples from a Gaussian distribution on page 8?461 - What type of distribution does `tf.random.normal()` use to create tensors?462 - Can you specify the dimensions of the tensor generated by `tf.random.normal()` function in TensorFlow?463 - What does linear algebra study?464 - Can you explain the concepts of linear algebra?465 - * What is a Naive Bayes classifier in machine learning?466 - * Can you explain what a Naive Bayes algorithm is used for in classification problems?467 - What does the MNIST dataset involve?468 - Can you explain what the purpose of the MNIST dataset is?469 - What is the objective of transforming pixels with values above 128 into 1 and those below 128 into 0? (Page 3)470 - Can you explain why changing pixel values over 128 to 1 and under 128 to 0 serves a specific function on page 3?471 - What is the total number of images present in the MNIST dataset?472 - How does the MNIST dataset consist of images?473 - What does the MNIST train image look like?474 - Can you describe the form of the MNIST training image?475 - What data type does the `train_labels` tensor have?476 - Can you tell me what the dtype of the `train_labels` tensor is?477 - What is the maximum number of examples that can be accessed simultaneously using numpy's stack function?478 - How does numpy's stack function handle accessing multiple examples at once?479 - What does the \"show\\_images\" function do in D2L?480 - Can you explain what the purpose of the \"d2l show\\_images\" function is on page 3?481 - What is the equation used in the Naive Bayes algorithm?482 - Can you provide the mathematical expression for implementing a Naive Bayes Classifier?483 - What is the formula used to calculate P\\_xy in a Naive Bayes Classifier?484 - How can we determine the value of P\\_y in a Naive Bayes 
Classifier?485 - How does the time complexity of a Naive Bayes Classifier compare to other machine learning algorithms?486 - Can you explain how the time complexity of a Naive Bayes Classifier affects its performance in terms of processing speed and efficiency?487 - What is the objective of including pseudo counts in a document?488 - Why do we incorporate pseudo counts into text analysis?489 - What does the `bayes_pred` function do?490 - Can you explain what the main objective of the `bayes_pred` function is?491 - What does linear algebra study?492 - Can you explain the concepts of linear algebra?493 - When was the first recorded use of statistical methods?494 - Can you provide a historical account of the development of statistics as a discipline?495 - What is the history of modern statistics?496 - When was modern statistics developed?497 - What are some of the topics that statistics covers in modern times?498 - How has the field of statistics evolved to address contemporary issues and concerns?499 - What is the fundamental principle of statistical analysis?500 - Can you explain the central concept in the field of statistics?501 - What does statistics primarily concentrate on?502 - What is the primary subject matter of statistics?503 - Can you provide me with some typical instances of statistical estimators?504 - What are several frequent types of estimators used in statistics?505 - What is the appearance of the plot representing the true density function of a Gaussian random variable?506 - Can you describe how the graphical representation of the probability distribution density of a Gaussian random variable looks like on page 2?507 - What does calculating the average value of a data set represent in statistical analysis?508 - In statistics, what is the function of determining the mean of a sample?509 - What does a Pooled Naive StorageManager do in TensorFlow?510 - Can you explain the function of a Pooled Naive StorageManager in TensorFlow?511 - How do you 
calculate the mean squared error?512 - What equation can be used to determine the mean squared error?513 - What is the equation used to calculate the standard deviation of a statistical estimate?514 - Can you provide the mathematical expression for determining the standard deviation of a sample mean as an estimator?515 - What is the difference between comparing the estimated population mean and the expected sample mean in terms of measuring fluctuations in an estimator?516 - How do you measure the variability of an estimator when comparing it to the expected sample mean versus the estimated population mean?517 - How does the relationship between bias and variance affect model performance?518 - What are some techniques used to balance bias and variance in machine learning models?519 - What distinguishes high variance from unavoidable error?520 - How does high variability differ from inherent uncertainty?521 - What methods are available to assess the effectiveness of a model in programming language?522 - How do you measure the accuracy of an estimator within your code implementation?523 - What is the objective of performing hypothesis testing?524 - Why do we carry out hypothesis tests?525 - How do you determine statistical significance using a formula?526 - What calculation method can be used to ascertain statistical significance?527 - What does it mean to have statistical power in a study?528 - Can you explain what statistical power refers to in research?529 - What is the relationship between statistical power and the likelihood of committing a type I or type II error?530 - How does the level of statistical power affect the probability of incorrectly rejecting or failing to reject the null hypothesis?531 - How does statistical power aid in deciding on the appropriate sample size?532 - What role does statistical power play in establishing the required number of samples for analysis?533 - How can I determine the necessary sample size for a given effect size and 
desired level of statistical power?534 - What is the formula for calculating the required sample size when considering both effect size and statistical power in a study?535 - What does \"test statistic\" mean on page 7?536 - On page 7, what is defined as a \"test statistic\"?537 - What does \"p-value\" represent on page 7?538 - On page 7, what is defined as a \"p-value\"?539 - What does it mean when a test only goes in one direction?540 - Can you explain what is meant by an examination that has no return path?541 - What does it mean when a test has two sides?542 - Can you explain what a double-sided examination entails?543 - What is a basic outline of the process for conducting hypothesis testing?544 - Can you describe the main stages involved in carrying out a hypothesis test?545 - Can you explain what a confidence interval represents on page 8?546 - What does the term \"confidence interval\" refer to in statistics, as mentioned on page 8?547 - What is the equation employed to determine the 95% confidence range? (Page 9)548 - How do you calculate the 95% confidence interval using a formula? 
(Page 9)549 - * What does Pytorch refer to?550 - * Can you explain what Pytorch means on page 10?551 - What does MXNet stand for?552 - Can you explain what MXNet is used for in machine learning?553 - * What does TensorFlow do?554 - * Can you explain what TensorFlow is used for?555 - What are some commonly utilized assessment measures in statistical analysis?556 - Can you name some typical evaluation metrics employed in statistics?557 - What is the process of conducting a hypothesis test?558 - Can you explain how to perform a hypothesis test?559 - * What does a confidence interval represent on page 10?560 - * Can you explain what a confidence interval means in statistics, as presented on page 10?561 - How does a one-sided test differ from a two-sided test?562 - What are the key differences between conducting a one-sided or a two-sided hypothesis test?563 - What is the process typically followed when performing a hypothesis test?564 - Can you outline the main stages of carrying out a hypothesis test?565 - What does linear algebra study?566 - Can you explain the concepts of linear algebra?567 - What is the objective of using natural language processing (NLP) techniques during pre-training?568 - Can you explain how NLP preprocessing helps in improving model performance?569 - What methods do you use to initialize word vectors before fine-tuning them on a specific task?570 - Can you describe some popular approaches for preparing word representations prior to training them on downstream tasks?571 - What is the process of performing sentiment analysis on text data through the use of recurrent neural networks?572 - Can you explain how sentiment analysis can be performed by utilizing recurrent neural networks in analyzing textual information?573 - How can BERT be customized for tasks that require both sequence-level and token-level analysis?574 - Can you explain how to adapt BERT for use in sequence-level and token-level applications?575 - How does value iteration differ 
from Q-learning in reinforcement learning?576 - Can you explain the key differences between value iteration and Q-learning in the field of reinforcement learning?577 - What is the goal of optimizing hyperparameters in deep learning models?578 - How does hyperparameter tuning improve the performance of deep learning algorithms?579 - * What are GANs?580 - * Can you explain what a GAN is?581 - * What is the process of decomposing a large and complex data set into smaller, more manageable components in order to improve its performance in recommender systems?582 - * Can you explain how matrix factorization can be used to enhance the effectiveness of algorithms employed in recommender systems by breaking down large amounts of information into smaller parts that are easier to analyze and manipulate?583 - How does an autoencoder differ from a variational autoencoder in terms of its architecture and function in deep learning?584 - Can you explain the key differences between an autoencoder and a variational autoencoder in deep learning, particularly regarding their structure and purpose?585 - What do utility functions serve as in the field of deep learning?586 - Can you explain the role of classes within deep learning tool's utility functions?587 - How do you calculate self-information according to the formula on page 3?588 - Can you explain the formula used to determine self-information as presented on page 3?589 - What does the use of a negative logarithm achieve in the calculation of entropy?590 - How does incorporating a negative logarithm into the entropy formula affect its overall result?591 - What does it mean to calculate the joint entropy of two random variables?592 - How do you determine the joint entropy between two probability distributions?593 - What is the relationship between joint entropy and the amount of information conveyed by a set of two randomly generated variables?594 - In what way does the concept of joint entropy connect to the quantity of data 
contained within a couple of independent random variables? (Page 5)595 - How does joint entropy differ from conditional entropy?596 - Can you explain the distinction between joint entropy and conditional entropy on page 5?597 - How do you calculate differential conditional entropy on page 6?598 - Can you provide a formula for calculating differential conditional entropy as stated on page 6?599 - What is the formula to determine the joint probability distribution of two continuous random variables in order to compute their conditional entropy?600 - Can you explain how to use calculus to find the derivative of the log-likelihood function and apply it to the calculation of the conditional entropy between two continuous random variables?601 - How does conditional entropy differ from joint entropy in terms of their calculations and interpretations?602 - Can you explain the distinction between conditional entropy and joint entropy as they relate to information theory?603 - How do you calculate mutual information?604 - Can you provide a formula for determining mutual information?605 - How does mutual information relate to joint entropy?606 - Can you explain the connection between mutual information and joint entropy?607 - What does \"mutual information\" mean on page 8?608 - On page 8, what is defined as \"mutual information\"?609 - What does \"cross-entropy\" mean on page 10?610 - Can you explain what is meant by \"cross-entropy\" as it appears on page 10?611 - What is the definition of cross-entropy loss on page 11?612 - Can you explain what the cross-entropy loss means in machine learning as presented on page 11?613 - What is the relationship between the log-likelihood function and the cross-entropy loss? (Page 11)614 - Can you explain how the log-likelihood function is related to the cross-entropy loss on page 11?615 - * What does the Negative Log Likelihood measure represent in PyTorch? 
(Page 11)616 - * How do you calculate the Negative Log Likelihood measure using PyTorch on Page 11?617 - What is the process of converting categorical data into numerical values using one-hot encoding in TensorFlow?618 - What is the measure of disorder or randomness in a system that has four distinct states and an equal likelihood of being found in any of them?619 - How does the concept of entropy apply to a system consisting of four equally probable states?620 - What is the measure of disorder in a system that has three distinct states with different probabilities?621 - How does the entropy change when the likelihoods of the three states vary in a system with three possible outcomes?622 - What is the measure of disorder in a system that has two distinct outcomes but different probabilities? (Line 12)623 - How does the entropy of a system vary when it contains only two equally likely states or two differently probable states? (Page 12)624 - What is the measure of disorder in a system that has two distinct states with an equal chance of occurring? (Page 12)625 - How does the entropy of a system vary when it consists of only two equally likely states? (Page 12)626 - What is the measure of disorder or randomness in a system that has three distinct states with an equal likelihood of occurring? (Entropy Definition)627 - How does the concept of entropy relate to the number of possible outcomes when there are three equally likely states in a system? 
(Entropy Formula)628 - What is the measure of disorder in a system that has four distinct states with varying probabilities?629 - How does the entropy change when the likelihoods of different outcomes vary in a system with four potential states?630 - What is the measure of disorder in a system that has an equal number of five possible states but different probabilities?631 - How does the entropy change when the probability distribution among the five possible states varies?632 - What is the measure of disorder in a system that has an equal number of six possible states but different probabilities assigned to them?633 - How does the entropy change when the number of possible outcomes remains constant at six, but their likelihoods vary?634 - What is the measure of disorder in a system that has seven distinct states with varying probabilities?635 - How does the entropy change when there are seven different outcomes, but their likelihoods differ?636 - What is the measure of disorder in a system that has eight distinct states but different probabilities associated with them? (Page 12)637 - How does the number of possible outcomes affect the level of randomness or uncertainty in a system when there are only eight states and their likelihoods vary? 
(Page 12)638 - How does the unevenness in the probabilities of the nine possible states affect the entropy of the system?639 - Can you calculate the entropy of a system that has an equal number of possible states but different probabilities for each state?640 - What is the measure of disorder in a system that has an equal number of ten possible states but different probabilities assigned to them?641 - How does the entropy change when the number of possible states remains constant at ten while the likelihoods of these states vary?642 - What is the measure of disorder in a system that has an equal number of states but different probabilities?643 - How does the entropy change when the number of possible states increases while their probabilities remain constant?644 - What is the measure of disorder in a system that has twelve distinct states but different probabilities associated with them? (Page 12)645 - How does the number of possible outcomes affect the level of randomness or uncertainty in a system when there are twelve unique states and their likelihoods vary? (Page 12)646 - What is the measure of disorder in a system that has 13 distinct states with varying probabilities?647 - How does the entropy of a system change when it contains 13 different states with non-uniform likelihoods?648 - What is the measure of disorder in a system that has 14 distinct states with varying probabilities?649 - How does the entropy of a system change when it has an equal number of 14 possible states but different probabilities assigned to them?650 - What is the measure of disorder in a system that has 15 distinct states but different probabilities associated with them? (Page 12)651 - How does the entropy of a system vary when it contains 15 unique states and their corresponding probabilities are not equal? (Page 12)652 - What is the measure of disorder in a system that has 16 distinct states but different probabilities associated with them? 
(Page 12)653 - How does the entropy of a system vary when it contains 16 unique states and their corresponding probabilities are not equal? (Page 12)654 - What is the measure of disorder in a system that has 17 distinct states with varying probabilities?655 - How does the entropy of a system change when it contains 17 different states with non-uniform likelihoods?656 - What is the measure of disorder in a system that has 18 distinct states but different probabilities associated with them? (Page 12)657 - How does the entropy of a system vary when it contains 18 unique states and their corresponding probabilities are not equal? (Page 12)658 - What is the measure of disorder in a system that has 19 distinct states with varying probabilities?659 - How does the entropy change when there are 19 different outcomes in a system with non-equal likelihoods?Execution time: 11428.181218385696 seconds\n" + ] + } + ], + "source": [ + "from pykoi.chat.llm.instruct_pipeline import InstructionTextGenerationPipeline\n", + "import time\n", + "response_list = []\n", + "question_list = []\n", + "generate_text = InstructionTextGenerationPipeline(model=model_ft, tokenizer=tokenizer_ft)\n", + "\n", + "st = time.time()\n", + "\n", + "i=0\n", + "for question in prompt_list:\n", + " #print(i,\" \",end = '')\n", + " \n", + " print(i,\"-\",question, end='')\n", + " res = generate_text(question)\n", + " response = res[0][\"generated_text\"]\n", + " question_list.append(question)\n", + " response_list.append(response)\n", + " i = i+1\n", + "\n", + "et = time.time()\n", + "elapsed_time = et - st\n", + "print('Execution time:', elapsed_time, 'seconds')\n", + "import pandas as pd\n", + "OUTPUT_QR_FILE = os.getcwd()+'/../../data/question_generated_answer.csv'\n", + "\n", + "df = pd.DataFrame(list(zip(question_list, response_list)), columns =['Question', 'Answer'])\n", + "\n", + "df.to_csv(OUTPUT_QR_FILE, index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": 
"6f5fd364-88a5-40a0-a496-f83b0bcb1fc0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Linear algebra is a branch of mathematics that deals with mathematical objects such as vectors and matrices, and their operations such as addition, subtraction, scalar multiplication, and matrix multiplication. It provides a framework for solving systems of linear equations and finding eigenvalues and eigenvectors of matrices. In deep learning, linear algebra plays an important role in representing data using weighted sums of inputs and outputs.',\n", + " 'Linear algebra is a branch of mathematics that deals with mathematical objects such as vectors and matrices, and their operations such as addition, subtraction, scalar multiplication, and matrix multiplication. It provides a framework for solving systems of linear equations and finding eigenvalues and eigenvectors of matrices. In deep learning, linear algebra plays an important role in representing data using weighted sums of inputs and outputs.',\n", + " 'A vector can be interpreted as a point or direction in space. Mathematically, it is represented as a list of numbers, either written as a column or row vector. For example, a vector can be represented as (22.1.1) or (22.1.2).',\n", + " 'Vectors can be interpreted as points or directions in space. Mathematically, they are represented as lists of numbers, either written as columns or rows. For example, a vector can be represented as (22.1.1) or (22.1.2).',\n", + " \"Column vectors represent data examples while row vectors are used to form weighted sums. However, it's more conventional to treat each data example as a row vector in a matrix representing a tabular dataset.\",\n", + " \"Column vectors represent data examples while row vectors are used to form weighted sums. 
However, it's more conventional to treat each data example as a row vector in a matrix representing a tabular dataset.\",\n", + " 'We can use the components of the vectors to define the location of the points in space compared to a fixed reference called the origin. This can be seen in Fig. 22.1.1.',\n", + " 'We can use the components of the vectors to define the location of the points in space compared to a fixed reference called the origin. This can be seen in Fig. 22.1.1.',\n", + " 'Yes, every vector drawn in Fig. 22.1.2 represents the vector (3,2)^T.',\n", + " 'Yes, every vector drawn in Fig. 22.1.2 represents the vector (3,2)^T.']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response_list[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "68965e63-3f94-4887-91d3-0d6d7c5e9a4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "660" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(response_list)" + ] + }, + { + "cell_type": "markdown", + "id": "4d2aca7f-e19c-42ab-b7fe-97afe9641be7", + "metadata": {}, + "source": [ + "## Evaluation with Rouge score" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c68f03c2-51ad-49b0-bf53-75f0c9d3ed69", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: rouge_score in /opt/conda/envs/pykoi/lib/python3.10/site-packages (0.1.2)\n", + "Requirement already satisfied: absl-py in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from rouge_score) (2.1.0)\n", + "Requirement already satisfied: nltk in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from rouge_score) (3.8.1)\n", + "Requirement already satisfied: numpy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from rouge_score) (1.26.3)\n", + "Requirement already satisfied: six>=1.14.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from rouge_score) (1.16.0)\n", + "Requirement already satisfied: click in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->rouge_score) (8.1.7)\n", + "Requirement already satisfied: joblib in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->rouge_score) (1.3.2)\n", + "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->rouge_score) (2023.12.25)\n", + "Requirement already satisfied: tqdm in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->rouge_score) (4.66.1)\n" + ] + } + ], + "source": [ + "!pip install rouge_score\n", + "from rouge_score import rouge_scorer\n", + "\n", + "import evaluate\n", + "rouge = evaluate.load('rouge')\n", + "\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "14332ee7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['What does linear algebra study?',\n", + " 'Can you explain the concepts of linear algebra?',\n", + " 'What are the two ways to visualize vectors in geometry?',\n", 
+ " 'How can vectors be represented using lines and arrows in geometry?',\n", + " 'How do column vectors differ from row vectors?',\n", + " 'Can you explain the distinction between column and row vectors?',\n", + " 'What is the process of representing vectors as points in a three-dimensional space?',\n", + " 'Can you explain how to graph vectors using coordinate systems and point representations?',\n", + " 'Can you represent vectors using arrows on a flat surface?',\n", + " 'How can vectors be depicted as arrows within a two-dimensional space?']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question_list[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9c363697-2a5f-4ba7-bcd6-f88965c08319", + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_rouge(pred_list,ref_list):\n", + " \n", + " rouge_results = rouge.compute(predictions=pred_list,\n", + " references=ref_list,\n", + " use_aggregator=True)\n", + " avg_rougeLsum = np.mean(rouge_results[\"rougeLsum\"])\n", + " avg_rougeL = np.mean(rouge_results[\"rougeL\"])\n", + " avg_rouge2 = np.mean(rouge_results[\"rouge2\"])\n", + " avg_rouge1 = np.mean(rouge_results[\"rouge1\"])\n", + " \n", + " print(\"Average rouge score - \", avg_rougeLsum, avg_rougeL, avg_rouge2, avg_rouge1)\n", + " \n", + " return " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6a80aac5-5d32-4aa6-b04f-d03dca376bda", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average rouge score - 0.19681363156175896 0.19661530539933503 0.14625715957312024 0.21441399074146578\n" + ] + } + ], + "source": [ + "ref_list = reference_list\n", + "response_list = response_list\n", + "calculate_rouge(response_list, ref_list) " + ] + }, + { + "cell_type": "markdown", + "id": "e701fba8-453c-489c-87d2-9104ebfc0c12", + "metadata": {}, + "source": [ + "## Evaluation with semantic similarity score" + 
] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "de23c1d7-36f3-4a3e-9ebb-a2501100858e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: sentence-transformers in /opt/conda/envs/pykoi/lib/python3.10/site-packages (2.2.2)\n", + "Collecting sentence-transformers\n", + " Downloading sentence_transformers-2.3.1-py3-none-any.whl.metadata (11 kB)\n", + "Requirement already satisfied: transformers<5.0.0,>=4.32.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (4.36.2)\n", + "Requirement already satisfied: tqdm in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (4.66.1)\n", + "Requirement already satisfied: torch>=1.11.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (2.1.2)\n", + "Requirement already satisfied: numpy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (1.26.3)\n", + "Requirement already satisfied: scikit-learn in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (1.3.0)\n", + "Requirement already satisfied: scipy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (1.11.1)\n", + "Requirement already satisfied: nltk in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (3.8.1)\n", + "Requirement already satisfied: sentencepiece in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (0.1.99)\n", 
+ "Requirement already satisfied: huggingface-hub>=0.15.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (0.20.2)\n", + "Requirement already satisfied: Pillow in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sentence-transformers) (10.2.0)\n", + "Requirement already satisfied: filelock in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (3.13.1)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2023.10.0)\n", + "Requirement already satisfied: requests in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2.31.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (4.9.0)\n", + "Requirement already satisfied: packaging>=20.9 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (23.1)\n", + "Requirement already satisfied: sympy in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (1.12)\n", + "Requirement already satisfied: networkx in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (3.2.1)\n", + "Requirement already satisfied: jinja2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.2)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (12.1.105)\n", + "Requirement already satisfied: 
nvidia-cuda-runtime-cu12==12.1.105 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (12.1.105)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (12.1.105)\n", + "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (8.9.2.26)\n", + "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (12.1.3.1)\n", + "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (11.0.2.54)\n", + "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (10.3.2.106)\n", + "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (11.4.5.107)\n", + "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (12.1.0.106)\n", + "Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (2.18.1)\n", + "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (12.1.105)\n", + "Requirement already satisfied: triton==2.1.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (2.1.0)\n", + "Requirement already satisfied: nvidia-nvjitlink-cu12 in 
/opt/conda/envs/pykoi/lib/python3.10/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.11.0->sentence-transformers) (12.3.101)\n", + "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers) (2023.12.25)\n", + "Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers) (0.15.0)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers) (0.4.1)\n", + "Requirement already satisfied: click in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->sentence-transformers) (8.1.7)\n", + "Requirement already satisfied: joblib in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from nltk->sentence-transformers) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (2.1.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from 
requests->huggingface-hub>=0.15.1->sentence-transformers) (2023.11.17)\n", + "Requirement already satisfied: mpmath>=0.19 in /opt/conda/envs/pykoi/lib/python3.10/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)\n", + "Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.8/132.8 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: sentence-transformers\n", + " Attempting uninstall: sentence-transformers\n", + " Found existing installation: sentence-transformers 2.2.2\n", + " Uninstalling sentence-transformers-2.2.2:\n", + " Successfully uninstalled sentence-transformers-2.2.2\n", + "Successfully installed sentence-transformers-2.3.1\n" + ] + } + ], + "source": [ + "!pip install -U sentence-transformers\n", + "from sentence_transformers import SentenceTransformer, util\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0375e6b6-d6ac-4f52-ac48-09a57fe7b38e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "modules.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:00<00:00, 2.52MB/s]\n", + "README.md: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67.9k/67.9k [00:00<00:00, 13.9MB/s]\n", + "sentence_bert_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57.0/57.0 [00:00<00:00, 448kB/s]\n", + "config.json: 
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 619/619 [00:00<00:00, 4.83MB/s]\n", + "model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 670M/670M [00:02<00:00, 239MB/s]\n", + "tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 342/342 [00:00<00:00, 2.49MB/s]\n", + "vocab.txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 21.7MB/s]\n", + "tokenizer.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 712k/712k [00:00<00:00, 4.93MB/s]\n", + "special_tokens_map.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 987kB/s]\n", + "1_Pooling/config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 191/191 [00:00<00:00, 1.34MB/s]\n" + ] + } + ], + "source": [ + "model = SentenceTransformer('thenlper/gte-large') # use this one in production\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b5c737fe-149f-48e3-acaa-e388db22adcd", + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_semantic_sim(pred_list,ref_list):\n", + " \n", + " sem_score = []\n", + " average_sem_sim = 0\n", + " \n", + " for i in range(len(ref_list)):\n", + " \n", + " ref_embedding = model.encode(ref_list[i])\n", + " pred_embedding = 
model.encode(pred_list[i])\n", + " cos_sim = util.cos_sim(ref_embedding, pred_embedding)\n", + " #print(cos_sim[0][0].item())\n", + " \n", + " sem_score.append(cos_sim[0][0].item())\n", + " \n", + " average_sem_sim = np.mean(sem_score) \n", + " \n", + " #print(\"Average similarity: \", average_sem_sim)\n", + " \n", + " return average_sem_sim" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c59526b3-fe2c-4269-ae43-45285998d0a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.836418643322858" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_score = calculate_semantic_sim(reference_list,response_list)\n", + "avg_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac756486-9fdf-42c3-809c-eef684faec30", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pykoi", + "language": "python", + "name": "pykoi" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/example/rlhf/supervised_finetuning_demo.py b/example/rlhf/pre_training_demo.py similarity index 78% rename from example/rlhf/supervised_finetuning_demo.py rename to example/rlhf/pre_training_demo.py index e1d10f4..fed85b7 100644 --- a/example/rlhf/supervised_finetuning_demo.py +++ b/example/rlhf/pre_training_demo.py @@ -1,13 +1,13 @@ -"""Demo for the supervised fine tuning. +"""Demo for the pre-training. 
-python -m example.rlhf.supervised_finetuning_demo +python -m example.rlhf.pre_training_demo """ from pykoi.chat import QuestionAnswerDatabase from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID, QA_CSV_HEADER_QUESTION, QA_CSV_HEADER_VOTE_STATUS) -from pykoi.rlhf import RLHFConfig, SupervisedFinetuning +from pykoi.rlhf import RLHFConfig, PreTraining # get data from local database qa_database = QuestionAnswerDatabase() @@ -25,7 +25,7 @@ print(my_data_pd) print("My local database has {} samples in total".format(my_data_pd.shape[0])) -# run supervised finetuning +# run pre-training config = RLHFConfig(base_model_path="databricks/dolly-v2-3b", dataset_type="local_db") -rlhf_step1_sft = SupervisedFinetuning(config) +rlhf_step1_sft = PreTraining(config) rlhf_step1_sft.train_and_save("./models/rlhf_step1_sft") diff --git a/example/rlhf/supervised_finetuning_demo_d2l.py b/example/rlhf/supervised_finetuning_demo_d2l.py new file mode 100644 index 0000000..c544fc1 --- /dev/null +++ b/example/rlhf/supervised_finetuning_demo_d2l.py @@ -0,0 +1,43 @@ +"""Demo for the supervised fine tuning. + +python -m example.rlhf.supervised_finetuning_demo_d2l +""" + +from peft import LoraConfig +from pykoi.chat import QuestionAnswerDatabase +from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID, + QA_CSV_HEADER_QUESTION, + QA_CSV_HEADER_VOTE_STATUS) +from pykoi.rlhf import RLHFConfig, SupervisedFinetuning +from trl import DataCollatorForCompletionOnlyLM + + + +# run supervised finetuning +config = RLHFConfig(base_model_path="mistralai/Mistral-7B-Instruct-v0.1", + dataset_type="local_csv", dataset_name="data/chapter22_trnvalfromseed_data_processed.csv", + train_test_split_ratio=0, # ratio for test set DH:TODO: COMBINE TRAIN AND EVAL + max_seq_length=896, + per_device_eval_batch_size=1, + log_freq=20, + # dh: NOTE: 1 EPOCH iterates the dataset once. So log freq 20 means iterating 20 entries when training batch size = 1. 
+ # (i.e., log_freq = 0.12 epoch when the dataset has 166 entires). + save_freq=40000, + num_train_epochs=20, + max_steps=-1, # if a positive number is given, it will override num_train_epochs + device_map="auto", + lora_config_rl=LoraConfig( + r=512, + lora_alpha=1024, + lora_dropout=0.05, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", ], # "gate_proj","up_proj","down_proj",], #"lm_head",], + bias="none", + task_type="CAUSAL_LM" + ), + data_collator=DataCollatorForCompletionOnlyLM, + no_evaluation=True, + prepare_text="d2l", + split = "train[:10%]" + ) +rlhf_step1_sft = SupervisedFinetuning(config) +rlhf_step1_sft.train_and_save("./models/rlhf_step1_sft") diff --git a/pykoi/rlhf/config.py b/pykoi/rlhf/config.py index e7721f1..c3958cd 100644 --- a/pykoi/rlhf/config.py +++ b/pykoi/rlhf/config.py @@ -5,6 +5,7 @@ from accelerate import Accelerator from peft import LoraConfig, TaskType +import transformers @dataclass @@ -119,6 +120,7 @@ class RLHFConfig: default="./rlhf_checkpoints", metadata={"help": "Output directory for all model weights."}, ) + num_train_epochs: Optional[int] = field(default=5, metadata={"help": "supervised fine tuning training epochs"}) log_freq: Optional[int] = field(default=1, metadata={"help": "Logging frequency."}) eval_freq: Optional[int] = field( default=1000, metadata={"help": "Evaluation frequency."} @@ -182,6 +184,18 @@ class RLHFConfig: ), metadata={"help": "LoRA configuration."}, ) + data_collator: Optional[transformers.DataCollator] = field( + default=None, + metadata={"help": "The data collator to use for training."}, + ) + no_evaluation: Optional[bool] = field( + default=False, + metadata={"help": "Whether to disable evaluations during training."}, + ) + prepare_text: Optional[str] = field( + default="sample", + metadata={"help": "How to prepare the text for the model."}, + ) # Step 2 reward modeling parameters reward_model_path: Optional[str] = field( diff --git a/pykoi/rlhf/pre_traning.py b/pykoi/rlhf/pre_traning.py new 
file mode 100644 index 0000000..feb58bd --- /dev/null +++ b/pykoi/rlhf/pre_traning.py @@ -0,0 +1,225 @@ +"""pre-training.""" +import os +import time +from datetime import datetime +from typing import Optional + +import torch +from datasets import Dataset, load_dataset +from peft import PeftConfig, PeftModel +from transformers import (AutoModelForCausalLM, + AutoModelForSequenceClassification, AutoTokenizer, + TrainingArguments) +from trl import SFTTrainer +from trl.trainer.utils import ConstantLengthDataset + +from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID, + QA_CSV_HEADER_QUESTION, + QA_CSV_HEADER_VOTE_STATUS) +from pykoi.chat.db.qa_database import QuestionAnswerDatabase +from pykoi.rlhf.config import RLHFConfig +from pykoi.telemetry.events import SFTStartEvent, SFTStopEvent +from pykoi.telemetry.telemetry import Telemetry + + +class PreTraining: + """ + A class representing the pre-training trainer. + + Attributes: + rlhf_config (RLHFConfig): The RLHF configuration object. + tokenizer (AutoTokenizer): The tokenizer used for tokenizing the input data. + num_proc (int): The number of workers to use for data loading. + dataset (Dict[str, Dataset]): A dictionary containing the train and eval datasets. + torch_dtype (torch.dtype): The torch data type to use for training. + training_args (TrainingArguments): The training arguments for the trainer. + model (AutoModelForCausalLM): The model to train. + trainer (SFTTrainer): The trainer object used for training the model. + """ + + def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None: + """ + Initializes the SFTTrainer object. + + Args: + rlhf_config (RLHFConfig): The RLHF configuration object. + enbale_telemetry (bool): Whether to enable telemetry or not. 
+ """ + self._telemetry = Telemetry(enable_telemetry) + self._rlhf_config = rlhf_config + self.tokenizer = AutoTokenizer.from_pretrained(rlhf_config.base_model_path) + self.num_proc = ( + self._rlhf_config.num_workers if not self._rlhf_config.streaming else None + ) + self.dataset = self.create_datasets(self.tokenizer, self._rlhf_config) + self.torch_dtype = torch.bfloat16 if self._rlhf_config.bf16 else torch.float16 + # self.torch_dtype = torch.bfloat16 if bf16 else (torch.float16 if fp16 else torch.float32) + self.training_args = TrainingArguments( + output_dir=self._rlhf_config.output_dir, + dataloader_drop_last=True, + evaluation_strategy=self._rlhf_config.evaluation_strategy, + max_steps=self._rlhf_config.max_steps, + eval_steps=self._rlhf_config.eval_freq, + save_steps=self._rlhf_config.save_freq, + logging_steps=self._rlhf_config.log_freq, + per_device_train_batch_size=self._rlhf_config.per_device_train_batch_size, + per_device_eval_batch_size=self._rlhf_config.per_device_eval_batch_size, + learning_rate=self._rlhf_config.learning_rate, + lr_scheduler_type=self._rlhf_config.lr_scheduler_type_sft, + warmup_steps=self._rlhf_config.num_warmup_steps, + gradient_accumulation_steps=self._rlhf_config.gradient_accumulation_steps, + gradient_checkpointing=self._rlhf_config.gradient_checkpointing, + gradient_checkpointing_kwargs={ + "use_reentrant": self._rlhf_config.gradient_checkpointing_use_reentrant + }, + fp16=self._rlhf_config.fp16, + bf16=self._rlhf_config.bf16, + weight_decay=self._rlhf_config.weight_decay, + run_name="step1_pre_training", + ddp_find_unused_parameters=False, + ) + self.model = AutoModelForCausalLM.from_pretrained( + self._rlhf_config.base_model_path, + load_in_8bit=self._rlhf_config.load_in_8bit, + device_map=self._rlhf_config.device_map, + ) + self.trainer = SFTTrainer( + model=self.model, + args=self.training_args, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["eval"], + peft_config=self._rlhf_config.lora_config_rl, + 
packing=True, + ) + + def train(self): + """ + Trains the model using the SFTTrainer object. + """ + self.trainer.train() + + def load_lora( + self, + base_model_path: Optional[str] = None, + lora_model_path: Optional[str] = None, + ): + if base_model_path is None: + base_model_path = self._rlhf_config.base_model_path + + # Load lora config + if lora_model_path is None: + lora_config = self.trainer.model.config + else: + lora_config = PeftConfig.from_pretrained(lora_model_path) + + # Load the base tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(base_model_path) + if lora_config.task_type == "SEQ_CLS": + # peft is for reward model so load sequence classification + base_model = AutoModelForSequenceClassification.from_pretrained( + base_model_path, + num_labels=1, + torch_dtype=self._rlhf_config.torch_dtype, + ) + elif lora_config.task_type == "CAUSAL_LM": + base_model = AutoModelForCausalLM.from_pretrained( + base_model_path, + return_dict=True, + torch_dtype=self._rlhf_config.torch_dtype, + ) + else: + raise ValueError("Invalid task_type in lora_config") + + # Merge the base model and the Lora model + model = PeftModel.from_pretrained(base_model, lora_config) + return model, tokenizer + + def save(self, output_path=None): + if output_path is None: + output_path = os.path.join( + self._rlhf_config.output_dir, self._rlhf_config.sft_lora_path + ) + self.trainer.save_model(output_path) + + def train_and_save(self, output_path=None): + start_event = SFTStartEvent( + start_time=time.time(), date_time=datetime.utcfromtimestamp(time.time()) + ) + self._telemetry.capture(start_event) + self.trainer.train() + self.save(output_path) + self._telemetry.capture( + SFTStopEvent( + end_time=time.time(), + date_time=datetime.utcfromtimestamp(time.time()), + duration=time.time() - start_event.start_time, + ) + ) + + def prepare_sample_text(self, example): + """Prepare the text from a sample of the dataset.""" + text = ( + f"Question: 
{example[self._rlhf_config.question_title]}\n\n " + f" Answer: {example[self._rlhf_config.answer_title]}" + ) + return text + + def create_datasets(self, tokenizer, args): + if args.dataset_type == "local_db": + qa_database = QuestionAnswerDatabase() + my_data_pd = qa_database.retrieve_all_question_answers_as_pandas() + my_data_pd = my_data_pd[my_data_pd[QA_CSV_HEADER_VOTE_STATUS] == "up"] + my_data_pd = my_data_pd[ + [QA_CSV_HEADER_ID, QA_CSV_HEADER_QUESTION, QA_CSV_HEADER_ANSWER] + ] + print( + "My local database has {} up vote samples for pre-training".format( + my_data_pd.shape[0] + ) + ) + dataset = Dataset.from_dict(my_data_pd) + elif args.dataset_type == "local_csv": + dataset = load_dataset("csv", data_files=args.dataset_name) + dataset = dataset[args.split] # Convert DatasetDict to Dataset + elif args.dataset_type == "huggingface": + dataset = load_dataset( + args.dataset_name, + data_dir=args.dataset_subset_sft, + split=args.split, + use_auth_token=True, + num_proc=self.num_proc, + streaming=args.streaming, + ) + dataset = dataset[args.split] # Convert DatasetDict to Dataset + else: + raise FileNotFoundError( + "No (supported) data files or dataset script found" + f" {args.dataset_type}" + ) + + dataset = dataset.train_test_split( + test_size=args.train_test_split_ratio, seed=args.seed + ) + print( + f"Size of the train set: {len(dataset['train'])}. 
" + f" Size of the validation set: {len(dataset['test'])}" + ) + + train_dataset = ConstantLengthDataset( + tokenizer, + dataset["train"], + formatting_func=self.prepare_sample_text, + infinite=True, + seq_length=args.max_seq_length, + # chars_per_token=chars_per_token, + ) + eval_dataset = ConstantLengthDataset( + tokenizer, + dataset["test"], + formatting_func=self.prepare_sample_text, + infinite=False, + seq_length=args.max_seq_length, + # chars_per_token=chars_per_token, + ) + return {"train": train_dataset, "eval": eval_dataset} + \ No newline at end of file diff --git a/pykoi/rlhf/supervised_finetuning.py b/pykoi/rlhf/supervised_finetuning.py index 7a58a9f..971e511 100644 --- a/pykoi/rlhf/supervised_finetuning.py +++ b/pykoi/rlhf/supervised_finetuning.py @@ -11,8 +11,6 @@ AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments) from trl import SFTTrainer -from trl.trainer.utils import ConstantLengthDataset - from pykoi.chat.db.constants import (QA_CSV_HEADER_ANSWER, QA_CSV_HEADER_ID, QA_CSV_HEADER_QUESTION, QA_CSV_HEADER_VOTE_STATUS) @@ -20,6 +18,7 @@ from pykoi.rlhf.config import RLHFConfig from pykoi.telemetry.events import SFTStartEvent, SFTStopEvent from pykoi.telemetry.telemetry import Telemetry +from trl import DataCollatorForCompletionOnlyLM class SupervisedFinetuning: @@ -37,7 +36,10 @@ class SupervisedFinetuning: trainer (SFTTrainer): The trainer object used for training the model. """ - def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None: + def __init__( + self, + rlhf_config: RLHFConfig, + enable_telemetry: bool = True) -> None: """ Initializes the SFTTrainer object. 
@@ -47,10 +49,18 @@ def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> No """ self._telemetry = Telemetry(enable_telemetry) self._rlhf_config = rlhf_config - self.tokenizer = AutoTokenizer.from_pretrained(rlhf_config.base_model_path) + self.tokenizer = AutoTokenizer.from_pretrained( + rlhf_config.base_model_path) + # dh: add special tokens to tokenizer + self.tokenizer.pad_token = self.tokenizer.eos_token + END_KEY = "### End" + INSTRUCTION_KEY = "### Instruction:" + RESPONSE_KEY = "### Response:" + RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" + self.tokenizer.add_special_tokens( + {"additional_special_tokens": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]}) self.num_proc = ( - self._rlhf_config.num_workers if not self._rlhf_config.streaming else None - ) + self._rlhf_config.num_workers if not self._rlhf_config.streaming else None) self.dataset = self.create_datasets(self.tokenizer, self._rlhf_config) self.torch_dtype = torch.bfloat16 if self._rlhf_config.bf16 else torch.float16 # self.torch_dtype = torch.bfloat16 if bf16 else (torch.float16 if fp16 else torch.float32) @@ -70,26 +80,41 @@ def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> No gradient_accumulation_steps=self._rlhf_config.gradient_accumulation_steps, gradient_checkpointing=self._rlhf_config.gradient_checkpointing, gradient_checkpointing_kwargs={ - "use_reentrant": self._rlhf_config.gradient_checkpointing_use_reentrant - }, + "use_reentrant": self._rlhf_config.gradient_checkpointing_use_reentrant}, fp16=self._rlhf_config.fp16, bf16=self._rlhf_config.bf16, weight_decay=self._rlhf_config.weight_decay, run_name="step1_supervised_finetuning", ddp_find_unused_parameters=False, + num_train_epochs=self._rlhf_config.num_train_epochs, ) self.model = AutoModelForCausalLM.from_pretrained( self._rlhf_config.base_model_path, load_in_8bit=self._rlhf_config.load_in_8bit, device_map=self._rlhf_config.device_map, ) + # resize the token embeddings to include the added 
special tokens + self.model.resize_token_embeddings(len(self.tokenizer)) + if self._rlhf_config.data_collator.__name__ == "DataCollatorForCompletionOnlyLM": + # data collator that only predicts the answer part + response_template = RESPONSE_KEY + data_collator = self._rlhf_config.data_collator(response_template, + tokenizer=self.tokenizer) + else: + data_collator = None + self.trainer = SFTTrainer( model=self.model, args=self.training_args, train_dataset=self.dataset["train"], eval_dataset=self.dataset["eval"], peft_config=self._rlhf_config.lora_config_rl, - packing=True, + # TODO: DH: LoraConfig MAY BE IGNORED IF USING FROM_PRETRAINED + packing=False, # required for compatibility with the completiononly data collator + data_collator=data_collator, + formatting_func=self.prepare_text, + dataset_text_field=None, + max_seq_length=self._rlhf_config.max_seq_length, ) def train(self): @@ -103,6 +128,8 @@ def load_lora( base_model_path: Optional[str] = None, lora_model_path: Optional[str] = None, ): + #import pdb; pdb.set_trace() + # dh: not used if base_model_path is None: base_model_path = self._rlhf_config.base_model_path @@ -143,8 +170,9 @@ def save(self, output_path=None): def train_and_save(self, output_path=None): start_event = SFTStartEvent( - start_time=time.time(), date_time=datetime.utcfromtimestamp(time.time()) - ) + start_time=time.time(), + date_time=datetime.utcfromtimestamp( + time.time())) self._telemetry.capture(start_event) self.trainer.train() self.save(output_path) @@ -160,10 +188,79 @@ def prepare_sample_text(self, example): """Prepare the text from a sample of the dataset.""" text = ( f"Question: {example[self._rlhf_config.question_title]}\n\n " - f" Answer: {example[self._rlhf_config.answer_title]}" - ) + f" Answer: {example[self._rlhf_config.answer_title]}") return text + def prepare_blurb_qa_text(self, example): + """Prepare the text from a sample of the d2l dataset .""" + INTRO_BLURB = ( + "Below is an instruction that describes a task. 
Write a response that appropriately completes the request." + ) + INSTRUCTION_KEY = "### Instruction:" + INPUT_KEY = "Input:" + RESPONSE_KEY = "### Response:" + END_KEY = "### End" + RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" + DEFAULT_SEED = 42 + + # This is a training prompt that does not contain an input string. The instruction by itself has enough information + # to respond. For example, the instruction might ask for the year a + # historic figure was born. + PROMPT_NO_INPUT_FORMAT = """{intro} + {instruction_key} + {instruction} + {response_key} + {response} + {end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, + ) + + # This is a training prompt that contains an input string that serves as context for the instruction. For example, + # the input might be a passage from Wikipedia and the intruction is to + # extract some information from it. + PROMPT_WITH_INPUT_FORMAT = """{intro} + {instruction_key} + {instruction} + {input_key} + {input} + {response_key} + {response} + {end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + input_key=INPUT_KEY, + input="{input}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, + ) + if "context" in example: + context = example["context"] + else: + context = None + output_texts = [] + for i in range(len(example["instruction"])): + if context: + text = PROMPT_WITH_INPUT_FORMAT.format( + instruction=example["instruction"][i], + response=example["response"][i], + input=context) + else: + text = PROMPT_NO_INPUT_FORMAT.format( + instruction=example["instruction"][i], + response=example["response"][i]) + + output_texts.append(text) + return output_texts + + + def create_datasets(self, tokenizer, args): if args.dataset_type == "local_db": qa_database = QuestionAnswerDatabase() @@ -179,8 +276,17 @@ def create_datasets(self, tokenizer, 
args): ) dataset = Dataset.from_dict(my_data_pd) elif args.dataset_type == "local_csv": - dataset = load_dataset("csv", data_files=args.dataset_name) - dataset = dataset[args.split] # Convert DatasetDict to Dataset + # this way will load 1660 enetries + # dataset = load_dataset("csv", data_files=args.dataset_name) + # dataset = dataset["train"] # Convert DatasetDict to Dataset + + # this way will load 166 entries + + dataset = load_dataset( + "csv", + data_files=args.dataset_name, + split=args.split) + elif args.dataset_type == "huggingface": dataset = load_dataset( args.dataset_name, @@ -197,28 +303,28 @@ def create_datasets(self, tokenizer, args): f" {args.dataset_type}" ) - dataset = dataset.train_test_split( - test_size=args.train_test_split_ratio, seed=args.seed - ) - print( - f"Size of the train set: {len(dataset['train'])}. " - f" Size of the validation set: {len(dataset['test'])}" - ) + if args.prepare_text == "d2l": + self.prepare_text = self.prepare_blurb_qa_text + else: + self.prepare_text = self.prepare_sample_text + # No test set during training + if args.no_evaluation: + print( + f"Size of the train set: {len(dataset)}. " + ) + + train_dataset = dataset + eval_dataset = None + else: + dataset = dataset.train_test_split( + test_size=args.train_test_split_ratio, seed=args.seed + ) + print( + f"Size of the train set: {len(dataset['train'])}. " + f" Size of the validation set: {len(dataset['test'])}") + + train_dataset = dataset["train"] + + eval_dataset = dataset["test"] - train_dataset = ConstantLengthDataset( - tokenizer, - dataset["train"], - formatting_func=self.prepare_sample_text, - infinite=True, - seq_length=args.max_seq_length, - # chars_per_token=chars_per_token, - ) - eval_dataset = ConstantLengthDataset( - tokenizer, - dataset["test"], - formatting_func=self.prepare_sample_text, - infinite=False, - seq_length=args.max_seq_length, - # chars_per_token=chars_per_token, - ) return {"train": train_dataset, "eval": eval_dataset}