Commit

text analysis
laugustyniak committed Apr 9, 2024
1 parent fdd24cf commit 8cabe13
Showing 1 changed file with 215 additions and 0 deletions.
215 changes: 215 additions & 0 deletions nbs/Data/01_Analyze_Polish_Judgements_Texts.ipynb
@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9e365555",
"metadata": {},
"source": [
"# Analyze Text of Polish Judgements\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b666da3-f393-4d88-8036-e818937d2305",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"from datasets import load_from_disk\n",
"import string\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from juddges.settings import PL_JUDGEMENTS_PATH_TEXTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1f37c21-de73-48ee-8cc3-8f4f2d4ce735",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"ds = load_from_disk(dataset_path=PL_JUDGEMENTS_PATH_TEXTS)"
]
},
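{
"cell_type": "markdown",
"id": "peek-dataset-schema",
"metadata": {},
"source": [
"A quick look at the loaded dataset before computing statistics (a minimal sketch; it assumes the dataset exposes `content`, `_id`, and `type` columns, which the cells below rely on).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "peek-dataset-schema-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Dataset size and schema; the column names here mirror the ones used later in this notebook\n",
"print(ds)\n",
"ds.features"
]
},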
{
"cell_type": "code",
"execution_count": null,
"id": "c49a038b-3bd5-4124-89c2-a019c364fd22",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "33bd783c6a53402c91a8ee3cd8fb122f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=40): 0%| | 0/408423 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# | eval: false\n",
"def tagger(item):\n",
" text = item[\"content\"]\n",
" dummy_tokens = text.split()\n",
"\n",
" item[\"chars\"] = len(text)\n",
" item[\"num_dummy_tokens\"] = len(dummy_tokens)\n",
" item[\"num_non_ws_tokens\"] = sum(\n",
" 1 for tok in dummy_tokens if any(char not in string.punctuation for char in tok.strip())\n",
" )\n",
"\n",
" return item\n",
"\n",
"\n",
"ds = ds.map(tagger, num_proc=40)\n",
"ds.cleanup_cache_files()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7531b42-d802-41b6-b699-d1b16361e8ff",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"stats = (\n",
" ds.select_columns([\"_id\", \"type\", \"chars\", \"num_dummy_tokens\", \"num_non_ws_tokens\"])\n",
" .to_pandas()\n",
" .convert_dtypes(dtype_backend=\"pyarrow\")\n",
")\n",
"stats[\"type\"] = stats[\"type\"].astype(\"category\")\n",
"stats.head()"
]
},
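{
"cell_type": "markdown",
"id": "stats-summary-note",
"metadata": {},
"source": [
"Optional summary of the raw length measures (a small sketch using the columns computed by `tagger` above).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "stats-summary-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Descriptive statistics for the per-document length measures\n",
"stats[[\"chars\", \"num_dummy_tokens\", \"num_non_ws_tokens\"]].describe()"
]
},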
{
"cell_type": "code",
"execution_count": null,
"id": "6e4a7432-12fe-4b8b-a9c4-77a69a780ec2",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"ax = sns.histplot(\n",
" x=stats[\"num_non_ws_tokens\"],\n",
" log_scale=True,\n",
" bins=50,\n",
")\n",
"ax.set(title=\"#tokens distribution\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "480766df-9c10-4e15-808c-85a76d15166e",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"card_order = stats[\"type\"].value_counts().index.tolist()\n",
"data = stats[\"type\"].value_counts().plot.barh(logx=True, title=\"Types cardinality\")"
]
},
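{
"cell_type": "markdown",
"id": "per-type-median-note",
"metadata": {},
"source": [
"A tabular complement to the per-type length plot below (a minimal sketch over the `stats` frame built above).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "per-type-median-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Median number of non-whitespace tokens per judgement type\n",
"stats.groupby(\"type\", observed=True)[\"num_non_ws_tokens\"].median().sort_values(ascending=False)"
]
},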
{
"cell_type": "code",
"execution_count": null,
"id": "c295c4a0-6926-43c2-bb51-77e8a98ff4da",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# sns.displot(data=stats, x=\"num_non_ws_tokens\", col=\"type\", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind=\"hist\", bins=25)\n",
"\n",
"_, ax = plt.subplots(figsize=(8, 12))\n",
"ax.set(title=\"Per type text length ditribution\")\n",
"sns.boxenplot(data=stats, y=\"type\", x=\"num_non_ws_tokens\", order=card_order, log_scale=True)"
]
},
{
"cell_type": "markdown",
"id": "ea06ef3f-c12d-4da6-9fc6-45f1809dabad",
"metadata": {},
"source": [
"# Tokenize\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08c70fdc-0b03-4983-8da9-8d065161d3e7",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0af8c3ba-aa89-4e1a-bfcb-65b618c4559e",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-large\")\n",
"ds = ds.map(\n",
" lambda examples: tokenizer(examples[\"content\"], padding=False, truncation=False),\n",
" batched=True,\n",
" num_proc=20,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f822fae-f91c-4ee1-a114-97a021bf1e81",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"tokenized = []\n",
"for item in ds:\n",
" tokenized.append({\"num_tokens\": len(item[\"input_ids\"])})"
]
},
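{
"cell_type": "markdown",
"id": "batched-token-count-note",
"metadata": {},
"source": [
"The same token counts can also be collected with a batched `map`, which is usually faster than a Python loop on a dataset of this size (a sketch; `ds_with_counts` is just an illustrative name, and the `input_ids` column comes from the tokenization step above).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "batched-token-count-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Alternative: derive num_tokens per document with a batched map over the tokenized dataset\n",
"ds_with_counts = ds.map(\n",
"    lambda examples: {\"num_tokens\": [len(ids) for ids in examples[\"input_ids\"]]},\n",
"    batched=True,\n",
")"
]
},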
{
"cell_type": "code",
"execution_count": null,
"id": "cdac696f-056a-4b12-a48e-ac8f8dac9eeb",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"sns.histplot(tokenized, bins=50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c890ee73",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "python3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
