
feat: add tokens analysis to notebook
asawczyn committed Apr 9, 2024
1 parent c633615 commit 7acb9d9
Showing 2 changed files with 88 additions and 9 deletions.
96 changes: 87 additions & 9 deletions notebooks/2_analyse_text.ipynb
@@ -2,16 +2,55 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "6b666da3-f393-4d88-8036-e818937d2305",
"metadata": {},
"outputs": [],
"source": [
"import gc\n",
"\n",
"import torch\n",
"from datasets import load_from_disk\n",
"import string\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
"import matplotlib.pyplot as plt\n",
"from transformers import AutoTokenizer"
],
"metadata": {
"collapsed": false
},
"id": "6b666da3-f393-4d88-8036-e818937d2305",
"execution_count": null
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
},
"id": "ed482b952dbb39db"
},
{
"cell_type": "code",
"outputs": [],
"source": [
"import datasets"
],
"metadata": {
"collapsed": false
},
"id": "f0dc3a6d9bdb80d",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# datasets.config.IN_MEMORY_MAX_SIZE = 5e+6\n",
"datasets.config.IN_MEMORY_MAX_SIZE\n"
],
"metadata": {
"collapsed": false
},
"id": "f20c87eddc333e19",
"execution_count": null
},
{
"cell_type": "code",
@@ -23,6 +62,19 @@
"ds = load_from_disk(dataset_path=\"../data/datasets/pl/text/\")"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"tokenizer_mistral = AutoTokenizer.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.2\")\n",
"tokenizer_llama = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-hf\")"
],
"metadata": {
"collapsed": false
},
"id": "341b7f4c95e36331",
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -35,18 +87,44 @@
" dummy_tokens = text.split()\n",
"\n",
" item[\"chars\"] = len(text)\n",
" item[\"num_dummy_tokens\"] = len(dummy_tokens)\n",
" item[\"num_words\"] = len(dummy_tokens)\n",
" item[\"num_non_ws_tokens\"] = sum(\n",
" 1 for tok in dummy_tokens if any(char not in string.punctuation for char in tok.strip())\n",
" )\n",
"\n",
" item[\"num_tokens_mistral\"] = tokenizer_mistral.encode_plus(text, return_length=True, return_attention_mask=False)[\"length\"]\n",
" item[\"num_tokens_llama\"] = tokenizer_llama.encode_plus(text, return_length=True, return_attention_mask=False)[\"length\"]\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
" return item\n",
"\n",
"\n",
"ds = ds.map(tagger, num_proc=20)\n",
"ds = ds.map(tagger, num_proc=15)\n",
"ds.cleanup_cache_files()"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"ds[0]"
],
"metadata": {
"collapsed": false
},
"id": "4452471c69429a7e",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"ds.map(lambda examples: {\"l\" : tokenizer_mistral(examples[\"text\"], return_length=True, return_attention_mask=False)[\"length\"]}, batched=True, num_proc=5)"
],
"metadata": {
"collapsed": false
},
"id": "32d17394371464",
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
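For readers skimming the diff, here is a minimal standalone sketch of what the new cells add up to: load one of the tokenizers named above and map a per-example counter over the dataset. The model name, dataset path, and num_proc value are copied from the diff; the rest is an illustrative reduction, not the committed notebook verbatim. The committed cell also calls gc.collect() and torch.cuda.empty_cache() after each item; those are omitted here since tokenization runs on the CPU.

# Minimal standalone sketch of the notebook's new "tagger" cell.
# Hedged: model name, dataset path, and num_proc come from the diff;
# this is an illustrative reduction, not the committed notebook itself.
import string

from datasets import load_from_disk
from transformers import AutoTokenizer

tokenizer_mistral = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

def tagger(item):
    text = item["text"]
    words = text.split()

    item["chars"] = len(text)
    item["num_words"] = len(words)
    # Count whitespace-delimited tokens that contain at least one
    # non-punctuation character.
    item["num_non_ws_tokens"] = sum(
        1 for tok in words
        if any(ch not in string.punctuation for ch in tok.strip())
    )
    # return_length=True makes encode_plus report the token count directly,
    # so the list of input ids never needs to be inspected.
    item["num_tokens_mistral"] = tokenizer_mistral.encode_plus(
        text, return_length=True, return_attention_mask=False
    )["length"]
    return item

ds = load_from_disk(dataset_path="../data/datasets/pl/text/")
ds = ds.map(tagger, num_proc=15)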
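The last new cell tries the same count as a batched map, which avoids per-item Python overhead. A hedged sketch of that variant, continuing from the sketch above; the descriptive column name replaces the diff's single-letter "l" and is my choice, not the commit's:

# Batched variant of the token count (mirrors the lambda cell in the diff).
# Calling the tokenizer on a list of strings with return_length=True yields
# one length per example, so the whole column comes back from a single call.
ds = ds.map(
    lambda batch: {
        "num_tokens_mistral_batched": tokenizer_mistral(
            batch["text"], return_length=True, return_attention_mask=False
        )["length"]
    },
    batched=True,
    num_proc=5,
)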
1 change: 1 addition & 0 deletions requirements.txt
@@ -25,6 +25,7 @@ torch==2.2.1
transformers==4.38.2
langchain==0.1.13
sentence-transformers==2.5.1
seaborn==0.13.2

# dev
nbdev==2.3.13
