
feat: add tokens analysis to notebook
asawczyn committed Apr 9, 2024
1 parent c633615 commit 7acb9d9
Showing 2 changed files with 88 additions and 9 deletions.
96 changes: 87 additions & 9 deletions notebooks/2_analyse_text.ipynb
@@ -2,16 +2,55 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "6b666da3-f393-4d88-8036-e818937d2305",
"metadata": {},
"outputs": [],
"source": [
"import gc\n",
"\n",
"import torch\n",
"from datasets import load_from_disk\n",
"import string\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
"import matplotlib.pyplot as plt\n",
"from transformers import AutoTokenizer"
],
"metadata": {
"collapsed": false
},
"id": "6b666da3-f393-4d88-8036-e818937d2305",
"execution_count": null
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
},
"id": "ed482b952dbb39db"
},
{
"cell_type": "code",
"outputs": [],
"source": [
"import datasets"
],
"metadata": {
"collapsed": false
},
"id": "f0dc3a6d9bdb80d",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# datasets.config.IN_MEMORY_MAX_SIZE = 5e+6\n",
"datasets.config.IN_MEMORY_MAX_SIZE\n"
],
"metadata": {
"collapsed": false
},
"id": "f20c87eddc333e19",
"execution_count": null
},
{
"cell_type": "code",
@@ -23,6 +62,19 @@
"ds = load_from_disk(dataset_path=\"../data/datasets/pl/text/\")"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"tokenizer_mistral = AutoTokenizer.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.2\")\n",
"tokenizer_llama = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-hf\")"
],
"metadata": {
"collapsed": false
},
"id": "341b7f4c95e36331",
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -35,18 +87,44 @@
" dummy_tokens = text.split()\n",
"\n",
" item[\"chars\"] = len(text)\n",
" item[\"num_dummy_tokens\"] = len(dummy_tokens)\n",
" item[\"num_words\"] = len(dummy_tokens)\n",
" item[\"num_non_ws_tokens\"] = sum(\n",
" 1 for tok in dummy_tokens if any(char not in string.punctuation for char in tok.strip())\n",
" )\n",
"\n",
" item[\"num_tokens_mistral\"] = tokenizer_mistral.encode_plus(text, return_length=True, return_attention_mask=False)[\"length\"]\n",
" item[\"num_tokens_llama\"] = tokenizer_llama.encode_plus(text, return_length=True, return_attention_mask=False)[\"length\"]\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
" return item\n",
"\n",
"\n",
"ds = ds.map(tagger, num_proc=20)\n",
"ds = ds.map(tagger, num_proc=15)\n",
"ds.cleanup_cache_files()"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"ds[0]"
],
"metadata": {
"collapsed": false
},
"id": "4452471c69429a7e",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"ds.map(lambda examples: {\"l\" : tokenizer_mistral(examples[\"text\"], return_length=True, return_attention_mask=False)[\"length\"]}, batched=True, num_proc=5)"
],
"metadata": {
"collapsed": false
},
"id": "32d17394371464",
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
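For readers skimming the diff, here is a minimal standalone sketch of what the new cells add up to: load one of the tokenizers named above and map a per-example counter over the dataset. The model name, dataset path, and num_proc value are copied from the diff; the rest is an illustrative reduction, not the committed notebook verbatim. The committed cell also calls gc.collect() and torch.cuda.empty_cache() after each item; those are omitted here since tokenization runs on the CPU.

# Minimal standalone sketch of the notebook's new "tagger" cell.
# Hedged: model name, dataset path, and num_proc come from the diff;
# this is an illustrative reduction, not the committed notebook itself.
import string

from datasets import load_from_disk
from transformers import AutoTokenizer

tokenizer_mistral = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

def tagger(item):
    text = item["text"]
    words = text.split()

    item["chars"] = len(text)
    item["num_words"] = len(words)
    # Count whitespace-delimited tokens that contain at least one
    # non-punctuation character.
    item["num_non_ws_tokens"] = sum(
        1 for tok in words
        if any(ch not in string.punctuation for ch in tok.strip())
    )
    # return_length=True makes encode_plus report the token count directly,
    # so the list of input ids never needs to be inspected.
    item["num_tokens_mistral"] = tokenizer_mistral.encode_plus(
        text, return_length=True, return_attention_mask=False
    )["length"]
    return item

ds = load_from_disk(dataset_path="../data/datasets/pl/text/")
ds = ds.map(tagger, num_proc=15)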
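The last new cell tries the same count as a batched map, which avoids per-item Python overhead. A hedged sketch of that variant, continuing from the sketch above; the descriptive column name replaces the diff's single-letter "l" and is my choice, not the commit's:

# Batched variant of the token count (mirrors the lambda cell in the diff).
# Calling the tokenizer on a list of strings with return_length=True yields
# one length per example, so the whole column comes back from a single call.
ds = ds.map(
    lambda batch: {
        "num_tokens_mistral_batched": tokenizer_mistral(
            batch["text"], return_length=True, return_attention_mask=False
        )["length"]
    },
    batched=True,
    num_proc=5,
)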
1 change: 1 addition & 0 deletions requirements.txt
@@ -25,6 +25,7 @@ torch==2.2.1
transformers==4.38.2
langchain==0.1.13
sentence-transformers==2.5.1
seaborn==0.13.2

# dev
nbdev==2.3.13
