Commit

text analysis
laugustyniak committed Apr 9, 2024
1 parent fdd24cf commit 8cabe13
Showing 1 changed file with 215 additions and 0 deletions.
215 changes: 215 additions & 0 deletions nbs/Data/01_Analyze_Polish_Judgements_Texts.ipynb
@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9e365555",
"metadata": {},
"source": [
"# Analyze Text of Polish Judgements\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b666da3-f393-4d88-8036-e818937d2305",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"from datasets import load_from_disk\n",
"import string\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from juddges.settings import PL_JUDGEMENTS_PATH_TEXTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1f37c21-de73-48ee-8cc3-8f4f2d4ce735",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"ds = load_from_disk(dataset_path=PL_JUDGEMENTS_PATH_TEXTS)"
]
},
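{
"cell_type": "markdown",
"id": "peek-dataset-schema",
"metadata": {},
"source": [
"A quick look at the loaded dataset before computing statistics (a minimal sketch; it assumes the dataset exposes `content`, `_id`, and `type` columns, which the cells below rely on).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "peek-dataset-schema-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Dataset size and schema; the column names here mirror the ones used later in this notebook\n",
"print(ds)\n",
"ds.features"
]
},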
{
"cell_type": "code",
"execution_count": null,
"id": "c49a038b-3bd5-4124-89c2-a019c364fd22",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "33bd783c6a53402c91a8ee3cd8fb122f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=40): 0%| | 0/408423 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# | eval: false\n",
"def tagger(item):\n",
" text = item[\"content\"]\n",
" dummy_tokens = text.split()\n",
"\n",
" item[\"chars\"] = len(text)\n",
" item[\"num_dummy_tokens\"] = len(dummy_tokens)\n",
" item[\"num_non_ws_tokens\"] = sum(\n",
" 1 for tok in dummy_tokens if any(char not in string.punctuation for char in tok.strip())\n",
" )\n",
"\n",
" return item\n",
"\n",
"\n",
"ds = ds.map(tagger, num_proc=40)\n",
"ds.cleanup_cache_files()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7531b42-d802-41b6-b699-d1b16361e8ff",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"stats = (\n",
" ds.select_columns([\"_id\", \"type\", \"chars\", \"num_dummy_tokens\", \"num_non_ws_tokens\"])\n",
" .to_pandas()\n",
" .convert_dtypes(dtype_backend=\"pyarrow\")\n",
")\n",
"stats[\"type\"] = stats[\"type\"].astype(\"category\")\n",
"stats.head()"
]
},
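{
"cell_type": "markdown",
"id": "stats-summary-note",
"metadata": {},
"source": [
"Optional summary of the raw length measures (a small sketch using the columns computed by `tagger` above).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "stats-summary-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Descriptive statistics for the per-document length measures\n",
"stats[[\"chars\", \"num_dummy_tokens\", \"num_non_ws_tokens\"]].describe()"
]
},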
{
"cell_type": "code",
"execution_count": null,
"id": "6e4a7432-12fe-4b8b-a9c4-77a69a780ec2",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"ax = sns.histplot(\n",
" x=stats[\"num_non_ws_tokens\"],\n",
" log_scale=True,\n",
" bins=50,\n",
")\n",
"ax.set(title=\"#tokens distribution\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "480766df-9c10-4e15-808c-85a76d15166e",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"card_order = stats[\"type\"].value_counts().index.tolist()\n",
"data = stats[\"type\"].value_counts().plot.barh(logx=True, title=\"Types cardinality\")"
]
},
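{
"cell_type": "markdown",
"id": "per-type-median-note",
"metadata": {},
"source": [
"A tabular complement to the per-type length plot below (a minimal sketch over the `stats` frame built above).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "per-type-median-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Median number of non-whitespace tokens per judgement type\n",
"stats.groupby(\"type\", observed=True)[\"num_non_ws_tokens\"].median().sort_values(ascending=False)"
]
},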
{
"cell_type": "code",
"execution_count": null,
"id": "c295c4a0-6926-43c2-bb51-77e8a98ff4da",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# sns.displot(data=stats, x=\"num_non_ws_tokens\", col=\"type\", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind=\"hist\", bins=25)\n",
"\n",
"_, ax = plt.subplots(figsize=(8, 12))\n",
"ax.set(title=\"Per type text length ditribution\")\n",
"sns.boxenplot(data=stats, y=\"type\", x=\"num_non_ws_tokens\", order=card_order, log_scale=True)"
]
},
{
"cell_type": "markdown",
"id": "ea06ef3f-c12d-4da6-9fc6-45f1809dabad",
"metadata": {},
"source": [
"# Tokenize\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08c70fdc-0b03-4983-8da9-8d065161d3e7",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0af8c3ba-aa89-4e1a-bfcb-65b618c4559e",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-large\")\n",
"ds = ds.map(\n",
" lambda examples: tokenizer(examples[\"content\"], padding=False, truncation=False),\n",
" batched=True,\n",
" num_proc=20,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f822fae-f91c-4ee1-a114-97a021bf1e81",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"tokenized = []\n",
"for item in ds:\n",
" tokenized.append({\"num_tokens\": len(item[\"input_ids\"])})"
]
},
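{
"cell_type": "markdown",
"id": "batched-token-count-note",
"metadata": {},
"source": [
"The same token counts can also be collected with a batched `map`, which is usually faster than a Python loop on a dataset of this size (a sketch; `ds_with_counts` is just an illustrative name, and the `input_ids` column comes from the tokenization step above).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "batched-token-count-code",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"# Alternative: derive num_tokens per document with a batched map over the tokenized dataset\n",
"ds_with_counts = ds.map(\n",
"    lambda examples: {\"num_tokens\": [len(ids) for ids in examples[\"input_ids\"]]},\n",
"    batched=True,\n",
")"
]
},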
{
"cell_type": "code",
"execution_count": null,
"id": "cdac696f-056a-4b12-a48e-ac8f8dac9eeb",
"metadata": {},
"outputs": [],
"source": [
"# | eval: false\n",
"sns.histplot(tokenized, bins=50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c890ee73",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "python3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
