From 57c20d2638a116df64489c60258606b6707c049f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 6 Dec 2023 20:48:41 +0100 Subject: [PATCH 1/6] fix: updated docs --- docs/evaluation/utils.py | 11 +- docs/performance_ner.ipynb | 654 ++++++++++++++++++------------------- 2 files changed, 318 insertions(+), 347 deletions(-) diff --git a/docs/evaluation/utils.py b/docs/evaluation/utils.py index 99b8b548..7de29704 100644 --- a/docs/evaluation/utils.py +++ b/docs/evaluation/utils.py @@ -6,13 +6,12 @@ import numpy as np import pandas as pd import spacy +from evaluation.datasets import datasets from spacy.language import Language from spacy.scorer import Scorer from spacy.tokens import Doc from spacy.training import Example -from evaluation.datasets import datasets - def bootstrap( examples: List[Example], @@ -74,7 +73,13 @@ def compute_mean_and_ci(scores: List[Dict[str, Any]]) -> Dict[str, Any]: "MISC": "Misc.", } - labels = {label for score in scores for label in score["ents_per_type"]} + def get_ents_per_type(score): + x = score["ents_per_type"] + if x is None: + return [] + return x + + labels = {label for score in scores for label in get_ents_per_type(score)} for label in labels: label_f = [ diff --git a/docs/performance_ner.ipynb b/docs/performance_ner.ipynb index a3ba0659..2a13d263 100644 --- a/docs/performance_ner.ipynb +++ b/docs/performance_ner.ipynb @@ -86,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, "metadata": { "tags": [ "remove-input" @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "tags": [ "remove-cell" @@ -154,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "tags": [ "remove-cell" @@ -178,7 +178,7 @@ "dane (test): Loading prediction for da_core_news_md-3.5.0\n", "dane (test): Loading prediction for da_core_news_sm-3.5.0\n", "dane (test): Loading prediction for openai/gpt-3.5-turbo (02/05/23)\n", - "dane (test): Running openai/gpt-4 (02/05/23)\n" + "dane (test): Loading prediction for openai/gpt-4 (02/05/23)\n" ] } ], @@ -191,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": { "tags": [ "remove-cell" @@ -220,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": { "tags": [ "remove-cell" @@ -282,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": { "tags": [ "remove-cell" @@ -300,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": { "tags": [ "remove-input" @@ -311,140 +311,140 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
F1 score with 95% confidence interval calculated using bootstrapping with 500 samples.

| Models | Average | Location | Person | Organization | Misc. |
|---|---|---|---|---|---|
| da_dacy_large_trf-0.2.0 | 85.4 (81.2, 88.9) | 89.5 (84.0, 94.7) | 92.6 (89.0, 95.4) | 79.0 (72.5, 84.6) | 79.0 (70.8, 86.0) |
| da_dacy_medium_trf-0.2.0 | 84.9 (81.0, 88.5) | 86.8 (81.2, 92.3) | 92.7 (89.2, 95.6) | 78.7 (71.8, 85.0) | 78.7 (70.6, 86.1) |
| da_dacy_small_trf-0.2.0 | 82.7 (79.3, 85.9) | 84.2 (78.3, 89.8) | 92.2 (88.5, 95.1) | 75.9 (69.3, 81.7) | 75.7 (68.8, 81.8) |
| saattrupdan/nbailab-base-ner-scandi | 86.3 (82.4, 89.7) | 88.6 (83.0, 93.3) | 95.1 (92.4, 97.8) | 80.3 (73.6, 85.8) | 78.6 (69.4, 86.0) |
| alexandrainst/da-ner-base | 70.7 (66.2, 75.2) | 84.8 (77.8, 91.0) | 90.3 (86.3, 93.9) | 64.7 (57.0, 71.3) | |
| da_core_news_trf-3.5.0 | 79.0 (75.1, 82.3) | 82.1 (75.5, 88.5) | 91.6 (88.2, 94.5) | 68.0 (61.0, 75.2) | 69.0 (61.1, 77.3) |
| da_core_news_lg-3.5.0 | 74.6 (70.8, 78.1) | 81.6 (75.3, 88.2) | 85.5 (81.1, 89.9) | 62.7 (54.8, 70.3) | 64.4 (55.9, 72.8) |
| da_core_news_md-3.5.0 | 71.2 (66.9, 75.2) | 76.8 (69.9, 83.6) | 82.6 (77.8, 87.0) | 58.2 (49.6, 66.7) | 61.8 (52.6, 70.6) |
| da_core_news_sm-3.5.0 | 64.4 (59.7, 68.5) | 61.6 (52.2, 69.9) | 80.1 (74.9, 85.1) | 49.0 (39.0, 57.5) | 58.4 (49.8, 67.1) |
| openai/gpt-3.5-turbo (02/05/23) | 57.5 (52.3, 62.2) | 50.7 (41.9, 59.2) | 81.9 (76.8, 86.5) | 55.7 (47.1, 63.7) | |
| openai/gpt-4 (02/05/23) | 70.1 (66.0, 74.3) | 78.9 (71.5, 85.7) | 85.3 (80.4, 89.5) | 72.0 (65.4, 78.5) | |
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -524,26 +524,39 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": { "tags": [ "remove-cell" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "functools.partial(, model='gpt-4')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from functools import partial\n", "from evaluation.models import openai_model_loader_fine_ner\n", "MODELS_ = MODELS.copy()\n", "MODELS_[\"openai/gpt-3.5-turbo (02/05/23)\"] = partial(openai_model_loader_fine_ner, model=\"gpt-3.5-turbo\")\n", "MODELS_[\"openai/gpt-4 (02/05/23)\"] = partial(openai_model_loader_fine_ner, model=\"gpt-4\")\n", - "MODELS.pop(\"openai/gpt-3.5-turbo (02/05/23)\")\n", - "MODELS.pop(\"openai/gpt-4 (02/05/23)\")\n" + "\n", + "# don't test openai models on DANSK\n", + "MODELS_.pop(\"openai/gpt-3.5-turbo (02/05/23)\")\n", + "MODELS_.pop(\"openai/gpt-4 (02/05/23)\")\n" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": { "tags": [ "remove-cell" @@ -589,81 +602,7 @@ "dansk (test): Loading prediction for da_core_news_md-3.5.0\n", "dansk (train): Loading prediction for da_core_news_sm-3.5.0\n", "dansk (dev): Loading prediction for da_core_news_sm-3.5.0\n", - "dansk (test): Loading prediction for da_core_news_sm-3.5.0\n", - "dansk (test): Running openai/gpt-3.5-turbo (02/05/23)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-8622a47955f5c4cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n", - "Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-8622a47955f5c4cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n", - "Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-8622a47955f5c4cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dansk (test): Running openai/gpt-4 (02/05/23)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-8622a47955f5c4cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n", - "Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-8622a47955f5c4cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n", - "Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-8622a47955f5c4cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" - ] - }, - { - "ename": "ConnectionError", - "evalue": "HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NameResolutionError(\": Failed to resolve 'api.openai.com' ([Errno 8] nodename nor servname provided, or not known)\"))", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mgaierror\u001b[0m Traceback (most recent call last)", - "File 
[... intermediate traceback frames omitted: the request issued by spacy_llm's OpenAI backend via requests/urllib3 failed because the host 'api.openai.com' could not be resolved ...]
}\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[39m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(prep, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49msend_kwargs)\n\u001b[1;32m 591\u001b[0m \u001b[39mreturn\u001b[39;00m resp\n", - "File \u001b[0;32m~/.virtualenvs/dacy/lib/python3.10/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m start \u001b[39m=\u001b[39m preferred_clock()\n\u001b[1;32m 702\u001b[0m \u001b[39m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[39m=\u001b[39m adapter\u001b[39m.\u001b[39;49msend(request, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 705\u001b[0m \u001b[39m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 706\u001b[0m elapsed \u001b[39m=\u001b[39m preferred_clock() \u001b[39m-\u001b[39m start\n", - "File \u001b[0;32m~/.virtualenvs/dacy/lib/python3.10/site-packages/requests/adapters.py:519\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 515\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(e\u001b[39m.\u001b[39mreason, _SSLError):\n\u001b[1;32m 516\u001b[0m \u001b[39m# This branch is for urllib3 v1.22 and later.\u001b[39;00m\n\u001b[1;32m 517\u001b[0m \u001b[39mraise\u001b[39;00m SSLError(e, request\u001b[39m=\u001b[39mrequest)\n\u001b[0;32m--> 519\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m(e, request\u001b[39m=\u001b[39mrequest)\n\u001b[1;32m 521\u001b[0m \u001b[39mexcept\u001b[39;00m ClosedPoolError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 522\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m(e, request\u001b[39m=\u001b[39mrequest)\n", - "\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NameResolutionError(\": Failed to resolve 'api.openai.com' ([Errno 8] nodename nor servname provided, or not known)\"))" + "dansk (test): Loading prediction for da_core_news_sm-3.5.0\n" ] } ], @@ -682,52 +621,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "tags": [ "remove-cell" ] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - } - ], + "outputs": [], "source": [ "with Pool(8) as p:\n", " tables = p.starmap(\n", @@ -738,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "tags": [ "remove-cell" @@ -808,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "tags": [ "remove-input" @@ -820,23 +720,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -901,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "tags": [ "remove-cell" @@ -935,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "tags": [ "remove-cell" @@ -963,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "tags": [ "remove-input" @@ -974,165 +874,165 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
F1 score with 95% confidence interval calculated using bootstrapping with 100 samples (fine-grained models).

| | | Large 0.1.0 | Medium 0.1.0 | Small 0.1.0 |
|---|---|---|---|---|
| Entities | Event | 43.5 (27.0, 56.0) | 64.2 (50.0, 79.4) | 46.1 (27.8, 62.4) |
| | Facility | 69.8 (54.3, 84.4) | 72.3 (56.2, 84.6) | 55.5 (36.2, 70.5) |
| | GPE | 90.6 (87.2, 93.1) | 88.0 (82.7, 92.1) | 79.6 (73.0, 84.6) |
| | Language | 74.5 (60.0, 83.3) | 51.9 (23.3, 100.0) | 45.9 (13.3, 93.3) |
| | Law | 54.2 (38.1, 72.5) | 59.3 (37.4, 77.3) | 57.6 (39.6, 75.1) |
| | Location | 75.3 (66.9, 83.8) | 72.5 (62.1, 80.8) | 65.6 (55.4, 74.1) |
| | NORP | 84.8 (76.9, 90.8) | 78.2 (68.6, 85.8) | 73.3 (62.9, 81.5) |
| | Ordinal | 37.8 (22.5, 51.2) | 68.7 (49.1, 82.6) | 68.5 (47.6, 83.1) |
| | Organization | 79.5 (74.9, 83.1) | 80.5 (78.1, 84.2) | 79.1 (75.7, 82.3) |
| | Person | 85.9 (82.7, 88.8) | 84.8 (80.6, 88.2) | 86.8 (83.2, 90.1) |
| | Product | 62.4 (53.9, 72.0) | 62.6 (53.9, 71.6) | 59.5 (48.9, 67.9) |
| | Work of Art | 39.3 (25.5, 50.3) | 58.4 (48.7, 69.1) | 46.6 (36.2, 56.9) |
| Non-Entities | Cardinal | 87.0 (82.8, 90.3) | 80.5 (77.0, 84.4) | 89.2 (86.0, 91.7) |
| | Date | 77.3 (71.6, 81.8) | 77.6 (72.8, 82.2) | 78.8 (73.9, 83.4) |
| | Money | 99.3 (97.9, 100.0) | 98.6 (97.2, 100.0) | 95.2 (90.0, 98.2) |
| | Percent | 100.0 (100.0, 100.0) | 100.0 (100.0, 100.0) | 100.0 (100.0, 100.0) |
| | Quantity | 78.6 (59.8, 93.8) | 76.9 (63.9, 89.9) | 71.3 (50.0, 91.1) |
| | Time | 90.9 (83.8, 96.7) | 85.1 (74.0, 93.7) | 83.4 (68.0, 95.6) |
| Average | | 80.1 (78.2, 81.9) | 79.7 (77.7, 81.5) | 78.4 (76.3, 80.4) |
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1197,7 +1097,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": { "tags": [ "remove-cell" @@ -1210,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": { "tags": [ "remove-cell" @@ -1229,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": { "tags": [ "remove-cell" @@ -1244,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": { "tags": [ "remove-input" @@ -1256,23 +1156,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1396,7 +1296,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": { "tags": [ "remove-cell" @@ -1409,7 +1309,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 26, "metadata": { "tags": [ "remove-cell" @@ -1419,11 +1319,11 @@ "source": [ "tables = []\n", "for mdl_name in dansk:\n", - " if \"fine_grained\" in mdl_name:\n", - " continue\n", + " # if \"fine_grained\" in mdl_name:\n", + " # continue\n", " examples = dansk[mdl_name][\"test\"][\"examples\"]\n", - " examples += dansk[mdl_name][\"dev\"][\"examples\"]\n", - " examples += dansk[mdl_name][\"train\"][\"examples\"]\n", + " # examples += dansk[mdl_name][\"dev\"][\"examples\"]\n", + " # examples += dansk[mdl_name][\"train\"][\"examples\"]\n", "\n", " \n", " examples = convert_to_conll_2003(examples)\n", @@ -1435,7 +1335,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 27, "metadata": { "tags": [ "remove-cell" @@ -1450,7 +1350,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 28, "metadata": { "tags": [ "remove-input" @@ -1462,23 +1362,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 21, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1594,6 +1494,72 @@ "chart" ] }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{tabular}{lllll}\n", + "\\toprule\n", + " & Average F1 & Person F1 & Organization F1 & Location F1 \\\\\n", + "Model & & & & \\\\\n", + "\\midrule\n", + "da\\_dacy\\_large\\_trf-0.2.0 & 0.67 (0.63, 0.71) & 0.69 (0.62, 0.76) & 0.54 (0.47, 0.60) & 0.81 (0.76, 0.86) \\\\\n", + "da\\_dacy\\_medium\\_trf-0.2.0 & 0.56 (0.51, 0.61) & 0.60 (0.52, 0.67) & 0.42 (0.34, 0.52) & 0.70 (0.63, 0.76) \\\\\n", + "da\\_dacy\\_small\\_trf-0.2.0 & 0.57 (0.52, 0.62) & 0.66 (0.57, 0.74) & 0.41 (0.35, 0.48) & 0.70 (0.65, 0.75) \\\\\n", + "da\\_dacy\\_large\\_ner\\_fine\\_grained-0.1.0 & 0.85 (0.81, 0.88) & 0.86 (0.80, 0.90) & 0.79 (0.73, 0.85) & 0.93 (0.89, 0.96) \\\\\n", + "da\\_dacy\\_medium\\_ner\\_fine\\_grained-0.1.0 & 0.85 (0.81, 0.88) & 0.85 (0.79, 0.90) & 0.80 (0.76, 0.85) & 0.91 (0.86, 0.96) \\\\\n", + "da\\_dacy\\_small\\_ner\\_fine\\_grained-0.1.0 & 0.83 (0.8, 0.86) & 0.87 (0.82, 0.92) & 0.79 (0.74, 0.83) & 0.85 (0.78, 0.92) \\\\\n", + "saattrupdan/nbailab-base-ner-scandi & 0.64 (0.6, 0.68) & 0.66 (0.57, 0.72) & 0.52 (0.45, 0.59) & 0.75 (0.69, 0.81) \\\\\n", + "alexandrainst/da-ner-base & 0.67 (0.63, 0.73) & 0.70 (0.61, 0.76) & 0.55 (0.47, 0.63) & 0.77 (0.71, 0.83) \\\\\n", + "da\\_core\\_news\\_trf-3.5.0 & 0.6 (0.55, 0.65) & 0.63 (0.55, 0.71) & 0.44 (0.35, 0.52) & 0.74 (0.68, 0.79) \\\\\n", + "da\\_core\\_news\\_lg-3.5.0 & 0.54 (0.48, 0.59) & 0.57 (0.47, 0.65) & 0.39 (0.32, 0.46) & 0.67 (0.59, 0.76) \\\\\n", + "da\\_core\\_news\\_md-3.5.0 & 0.52 (0.46, 0.57) & 0.60 (0.51, 0.68) & 0.34 (0.28, 0.41) & 0.67 (0.59, 0.74) \\\\\n", + "da\\_core\\_news\\_sm-3.5.0 & 0.34 (0.29, 0.39) & 0.34 (0.26, 0.42) & 0.22 (0.17, 0.30) & 0.48 (0.39, 0.56) \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\n" + ] + } + ], + "source": [ + "# add ci to average\n", + "\n", + "df = tables\n", + "df = df[df[\"Domain\"] != \"dannet\"] # type: ignore\n", + "df = df[df[\"Domain\"].notnull()]\n", + "\n", + "df[\"Average F1\"] = df[\"Average\"].round(2).astype(str) + \" (\" + df[\"Average Lower CI\"].round(2).astype(str) + \", \" + df[\"Average Upper CI\"].round(2).astype(str) + \")\"\n", + "\n", + "df.drop([\"Average Lower CI\", \"Average Upper CI\"], axis=1, inplace=True)\n", + "df.drop([\"Number of docs\", \"Average\"], axis=1, inplace=True)\n", + "\n", + "\n", + "# filter all but average\n", + "df = df[df[\"Domain\"] == \"All\"]\n", + "df.drop([\"Domain\"], axis=1, inplace=True)\n", + "df.set_index(\"Model\", inplace=True)\n", + "df\n", + "\n", + "# convert to latex using styler\n", + "style = df.style.format_index(escape=\"latex\", axis=1).format_index(\n", + " escape=\"latex\", axis=0\n", + ")\n", + "\n", + "# print latex\n", + "latex = style.to_latex(\n", + " hrules=True,\n", + " convert_css=True,\n", + " )\n", + "\n", + "print(latex)\n", + "\n" + ] + }, { "attachments": {}, "cell_type": "markdown", From 6e0f04f73f7c4581f8a3922807e2558e6a873743 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 6 Dec 2023 20:50:42 +0100 Subject: [PATCH 2/6] Update cruft reference --- .cruft.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.cruft.json b/.cruft.json index 0aba82e7..a732c277 100644 --- a/.cruft.json +++ b/.cruft.json @@ -1,5 +1,5 @@ { - 
"template": "https://github.com/MartinBernstorff/swift-python-cookiecutter", + "template": "https://github.com/KennethEnevoldsen/swift-python-cookiecutter", "commit": "7fdb02999e8596c525377c208ca902645d134f97", "checkout": null, "context": { @@ -16,8 +16,8 @@ "_copy_without_render": [ "*.github" ], - "_template": "https://github.com/MartinBernstorff/swift-python-cookiecutter" + "_template": "https://github.com/KennethEnevoldsen/swift-python-cookiecutter" } }, "directory": null -} +} \ No newline at end of file From 6cbeb42270bf6c8d97af36352465decc2b0a718d Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 6 Dec 2023 21:04:17 +0100 Subject: [PATCH 3/6] Updated cruft template --- .cookiecutter.json | 1 - .cruft.json | 5 +- .github/dependabot.yml | 16 - .github/workflows/check_for_rej.yml | 27 -- .github/workflows/cruft.yml | 66 ---- .github/workflows/dependabot_automerge.yml | 30 -- .github/workflows/lint.yml | 28 ++ .github/workflows/pre-commit.yml | 79 ---- .github/workflows/static_type_checks.yml | 71 +--- .github/workflows/tests.yml | 44 +-- .pre-commit-config.yaml | 31 -- CONTRIBUTING.md | 27 +- README.md | 4 +- makefile | 39 ++ pyproject.toml | 43 +-- tasks.py | 427 --------------------- training/main/requirements.txt | 3 - 17 files changed, 102 insertions(+), 839 deletions(-) delete mode 100644 .github/dependabot.yml delete mode 100644 .github/workflows/check_for_rej.yml delete mode 100644 .github/workflows/cruft.yml delete mode 100644 .github/workflows/dependabot_automerge.yml create mode 100644 .github/workflows/lint.yml delete mode 100644 .github/workflows/pre-commit.yml delete mode 100644 .pre-commit-config.yaml create mode 100644 makefile delete mode 100644 tasks.py diff --git a/.cookiecutter.json b/.cookiecutter.json index 2665f074..9d03ac9f 100644 --- a/.cookiecutter.json +++ b/.cookiecutter.json @@ -8,7 +8,6 @@ "email": "kennethcenevoldsen@gmail.com", "friendly_name": "DaCy", "github_user": "centre-for-humanities-computing", - "license": "MIT", "package_name": "dacy", "project_name": "dacy", "version": "2.4.2" diff --git a/.cruft.json b/.cruft.json index a732c277..a14abc9a 100644 --- a/.cruft.json +++ b/.cruft.json @@ -1,6 +1,6 @@ { "template": "https://github.com/KennethEnevoldsen/swift-python-cookiecutter", - "commit": "7fdb02999e8596c525377c208ca902645d134f97", + "commit": "e96eb05162a0e45a8ad5aa446c72229372e79cdb", "checkout": null, "context": { "cookiecutter": { @@ -12,7 +12,6 @@ "github_user": "centre-for-humanities-computing", "version": "2.4.2", "copyright_year": "2023", - "license": "MIT", "_copy_without_render": [ "*.github" ], @@ -20,4 +19,4 @@ } }, "directory": null -} \ No newline at end of file +} diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index ee72a897..00000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,16 +0,0 @@ -# Configuration: https://dependabot.com/docs/config-file/ -# Docs: https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically - -version: 2 -updates: - - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests - schedule: - interval: "weekly" - day: "monday" - time: "13:00" - timezone: "Europe/Copenhagen" - open-pull-requests-limit: 20 - commit-message: - prefix: "deps:" - include: "scope" diff --git a/.github/workflows/check_for_rej.yml b/.github/workflows/check_for_rej.yml deleted file mode 100644 index ed200fcc..00000000 --- a/.github/workflows/check_for_rej.yml +++ /dev/null @@ 
-1,27 +0,0 @@ -# .rej files occur when cruft update could not merge two files. -# They need to be handled, but are easy to miss if there's no CI -name: Check for .rej files - -on: - pull_request: - types: [opened, synchronize] - -jobs: - check-for-rej-files: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Check for .rej files - run: | - files=`find . -type f -name "*.rej"` - count=`echo $files | grep -o "\.rej" | wc -l` - if [[ $count != 0 ]]; then - echo "Found .rej files in the repository." - echo $files | - exit 1 - else - echo "No .rej files found in the repository." - fi diff --git a/.github/workflows/cruft.yml b/.github/workflows/cruft.yml deleted file mode 100644 index 714de363..00000000 --- a/.github/workflows/cruft.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: Cruft Check - -on: - pull_request: - branches: - - main - -jobs: - cruft-check: - runs-on: ubuntu-latest - permissions: - pull-requests: write - - steps: - # Avoid infinite loop where main - # Feature PR -> cruft check from main -> - # Cruft update PR -> cruft check from main -> - # Cruft update PR ... - - name: Check if pull request is from a fork - run: | - if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then - echo "Pull request is from a fork and does not have permissions for PR creation. Exiting gracefully." - exit 0 - elif [ "${{github.event.pull_request.title}}" == "ci - update cruft" ]; then - echo "Pull request is already a cruft update. Exiting gracefully." - exit 0 - else - echo "Pull request is not from a fork, continuing." - fi - - - name: Checkout code - uses: actions/checkout@v3 - with: - ref: main - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: "3.9" - - - name: Install Cruft - run: pip install cruft - - - name: Update cruft - id: cruft_check - run: | - cruft_output=$(cruft update --skip-apply-ask) - if echo "$cruft_output" | grep -q "Good work!"; then - echo "$cruft_output" - echo "cruft_updated=true" >> $GITHUB_OUTPUT - else - echo "$cruft_output" - echo "cruft_updated=false" >> $GITHUB_OUTPUT - fi - - - name: Create Pull Request - uses: peter-evans/create-pull-request@v4 - if: ${{ steps.cruft_check.outputs.cruft_updated == 'true' && github.event.pull_request.title != 'ci - update cruft' }} - continue-on-error: true - with: - title: "ci - update cruft" - branch: "update-cruft" - body: "🌲 Cruft updates" - token: ${{ secrets.PAT }} - commit-message: "ci: update cruft" - labels: "dependencies" # This makes the PR exempt from the stale bot diff --git a/.github/workflows/dependabot_automerge.yml b/.github/workflows/dependabot_automerge.yml deleted file mode 100644 index 22d2ecd7..00000000 --- a/.github/workflows/dependabot_automerge.yml +++ /dev/null @@ -1,30 +0,0 @@ -# GitHub action to automerge dependabot PRs. Only merges if tests passes the -# branch protections in the repository settings. 
-# You can set branch protections in the repository under Settings > Branches > Add rule -name: automerge-bot-prs - -on: pull_request - -permissions: - contents: write - pull-requests: write - -jobs: - dependabot-automerge: - runs-on: ubuntu-latest - # if actor is dependabot or pre-commit-ci[bot] then run - if: ${{ github.actor == 'dependabot[bot]' }} - - steps: - # Checkout action is required for token to persist - - name: Enable auto-merge for Dependabot PRs - run: gh pr merge --auto --merge "$PR_URL" # Use Github CLI to merge automatically the PR - env: - PR_URL: ${{github.event.pull_request.html_url}} - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Auto approve dependabot PRs - if: ${{ github.actor == 'dependabot[bot]' }} - uses: hmarr/auto-approve-action@v3.1.0 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..d27e4d45 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,28 @@ +# GitHub action to run linting + +name: run-pre-commit + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: "3.9" + cache: "pip" + + - name: Install pre-commit + run: make install + + - name: Lint + id: lint + run: | + make lint diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml deleted file mode 100644 index 79671c4a..00000000 --- a/.github/workflows/pre-commit.yml +++ /dev/null @@ -1,79 +0,0 @@ -# GitHub action to check if pre-commit has been run. Runs from .pre-commit-config.yaml, where the pre-commit actions are. - -name: run-pre-commit - -on: - pull_request: - branches: [main] - push: - branches: [main] - -jobs: - pre-commit: - permissions: - pull-requests: write - concurrency: - group: "${{ github.workflow }} @ ${{ github.ref }}" - cancel-in-progress: true - if: ${{ github.actor != 'dependabot[bot]' }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.event.pull_request.head.ref }} - token: ${{ secrets.PAT }} - - - uses: actions/setup-python@v4 - with: - python-version: "3.9" - - - name: Install pre-commit - run: pip install pre-commit - - - name: Run pre-commit - id: pre_commit - continue-on-error: true - run: | - if pre-commit run --color always --all-files; then - echo "Pre-commit check passed" - echo "pre_commit_failed=0" >> $GITHUB_OUTPUT - else - echo "Pre-commit check failed" - echo "pre_commit_failed=1" >> $GITHUB_OUTPUT - exit 1 - fi - - # Have this step before commit in case the PR is from a fork. In this case, we want the - # add-pr-comment to fail, because it makes it means that the contributer is directed here, - # and are given the informative error message, instead of directed to a "could not commit error message". - - uses: mshick/add-pr-comment@v2 - if: ${{ steps.pre_commit.outputs.pre_commit_failed == 1 && github.event_name == 'pull_request' }} - id: add_comment - with: - message: | - Looks like some formatting rules failed. - - ✨ The action has attempted automatic fixes ✨ - - If any were succesful, they were committed to the branch. - We suggest using `git pull --rebase` to apply them locally. 
- - If some errors could not be fixed automatically, you can: - - 🏎️ Get results locally by running `pre-commit run --all-files` - 🕵️ Examine the results in the `Run pre-commit` section of this workflow `pre-commit` - - We also recommend setting up the `ruff` and `black` extensions to auto-format on save in your chosen editor. - - - name: Commit formatting - if: ${{ steps.pre_commit.outputs.pre_commit_failed == 1 && github.event_name == 'pull_request' }} - run: | - git config user.name github-actions - git config user.email github-actions@github.com - git commit -am "style: linting" - git push --no-verify - - - name: Fail workflow - if: ${{ steps.pre_commit.outputs.pre_commit_failed == 1 && github.event_name == 'pull_request' }} - run: exit 1 diff --git a/.github/workflows/static_type_checks.yml b/.github/workflows/static_type_checks.yml index a1baa9c2..04faedde 100644 --- a/.github/workflows/static_type_checks.yml +++ b/.github/workflows/static_type_checks.yml @@ -1,6 +1,3 @@ -# We do not include static_type_checks as a pre-commit hook because pre-commit hooks -# are installed in their own virtual environment, so static_type_checks cannot -# use stubs from imports name: static_type_checks on: @@ -12,11 +9,6 @@ on: jobs: static_type_checks: runs-on: ubuntu-latest - permissions: - pull-requests: write - concurrency: - group: "${{ github.workflow }} @ ${{ github.ref }}" - cancel-in-progress: true strategy: matrix: os: [ubuntu-latest] @@ -24,74 +16,19 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Cache tox - uses: actions/cache@v3.2.6 - id: cache_tox - with: - path: | - .tox - key: ${{ runner.os }}-${{ matrix.python-version }}-static-type-checks - - name: Set up Python uses: actions/setup-python@v4 id: setup_python with: python-version: ${{ matrix.python-version}} + cache: pip - name: Install dependencies shell: bash run: | - pip install invoke tox pyright + make install - name: Run static type checker - id: pyright - continue-on-error: true - run: | - if inv static-type-checks; then - echo "pyright check passed" - echo "pyright_failed=0" >> $GITHUB_OUTPUT - else - echo "pyright check failed" - echo "pyright_failed=1" >> $GITHUB_OUTPUT - fi - - - name: Find Comment - uses: peter-evans/find-comment@v2 - id: find_comment - if: ${{github.event_name == 'pull_request'}} - continue-on-error: true - with: - issue-number: ${{ github.event.pull_request.number }} - comment-author: "github-actions[bot]" - body-includes: ✨ Looks like pyright failed ✨ - - - uses: mshick/add-pr-comment@v2 - if: ${{ steps.pyright.outputs.pyright_failed == 1 && github.event_name == 'pull_request'}} - id: add_comment - with: - message: | - ✨ Looks like pyright failed ✨ - - If you want to fix this, we recommend doing it locally by either: - - a) Enabling pyright in VSCode and going through the errors in the problems tab - - `VSCode settings > Python > Analysis: Type checking mode > "basic"` - - b) Debugging via the command line - - 1. Installing pyright, which is included in the dev dependencies: `pip install -e ".[dev]"` - 2. Diagnosing the errors by running `pyright .` - - - uses: mshick/add-pr-comment@v2 - if: ${{ steps.pyright.outputs.pyright_failed == 0 && steps.find_comment.outputs.comment-id != '' && github.event_name == 'pull_request'}} - with: - message-id: ${{ steps.find_comment.outputs.comment-id }} - message: | - 🌟 pyright succeeds! 
🌟 - - - name: Show pyright output - id: fail_run - if: ${{steps.pyright.outputs.pyright_failed == 1}} + shell: bash run: | - inv static-type-checks # Rerunning pyright isn't optimal computationally, but typically takes no more than a couple of seconds, and this ensures that the errors are in the failing step + make static-type-checks diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8931ad8d..3a1814c5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,6 +1,7 @@ -# This workflow will install Python dependencies, run pytests and run notebooks -# then it will in python 3.9 (ubuntu-latest) create a badge with the coverage -# and add it to the PR. This badge will be updated if the PR is updated. +# This workflow will: +# 1) install Python dependencies +# 2) run make test + name: Tests on: @@ -30,15 +31,6 @@ jobs: steps: - uses: actions/checkout@v3 - - - name: Cache tox - uses: actions/cache@v3.2.6 - id: cache_tox - with: - path: | - .tox - key: ${{ runner.os }}-${{ matrix.python-version }}-tests-1 - - name: Set up Python uses: actions/setup-python@v4 with: @@ -48,31 +40,9 @@ jobs: - name: Install dependencies shell: bash run: | - pip install invoke tox + make install - - name: Run and write pytest + - name: Run tests shell: bash run: | - # Specifying two sets of "--pytest-args" is required for invoke to parse it as a list - export DACY_CACHE_DIR=/tmp/dacy_cache - inv test --pytest-args="--durations=0" --pytest-args="--junitxml=pytest.xml --cov-report=term-missing --cov=src/" - - - - name: Test report on failures - uses: EnricoMi/publish-unit-test-result-action@v2 - id: test_report_with_annotations - if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.9' && github.actor != 'dependabot[bot]' && github.event_name == 'pull_request' && (success() || failure()) }} # Do not run for dependabot, run whether tests failed or succeeded - with: - comment_mode: "failures" - files: | - pytest.xml - - - name: Pytest coverage comment - id: coverage-comment - uses: MishaKav/pytest-coverage-comment@main - if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.9' && github.actor != 'dependabot[bot]' && github.event_name == 'pull_request' && (success() || failure()) }} - with: - create-new-comment: false - report-only-changed-files: false - pytest-coverage-path: pytest-coverage.txt - junitxml-path: ./pytest.xml + make test \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index b9887b16..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,31 +0,0 @@ -default_stages: [commit] - -repos: - - repo: https://github.com/psf/black - rev: 23.3.0 - hooks: - - id: black - - - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.265 - hooks: - - id: ruff - args: - [ - "--extend-select", - "F401", - "--extend-select", - "F841", - "--fix", - "--exit-non-zero-on-fix", - ] - - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 - hooks: - - id: check-yaml - - - repo: https://github.com/repo-helper/pyproject-parser - rev: v0.9.0b2 - hooks: - - id: check-pyproject diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b65d3ab..7237cfe6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,7 +11,7 @@ Here is a list of important resources for contributors: - [Issue Tracker] - [Code of Conduct] -[mit license]: https://opensource.org/licenses/MIT +[Apache-2.0 license]: https://opensource.org/license/apache-2-0/ [source code]: 
https://github.com/centre-for-humanities-computing/dacy [documentation]: https://dacy.readthedocs.io/ [issue tracker]: https://github.com/centre-for-humanities-computing/dacy/issues @@ -37,24 +37,22 @@ Request features on the [Issue Tracker]. ## How to set up your development environment -Install the package with development requirements: +To install all the development dependencies, you can use the [make] command: ```console -$ pip install -e ."[dev,tests]" +$ make install ``` + ## How to test the project Run the full test suite: ```console -$ pytest +$ make test ``` -Unit tests are located in the _tests_ directory, -and are written using the [pytest] testing framework. - -[pytest]: https://pytest.readthedocs.io/ +Unit tests are located in the _tests_ directory. ## How to submit changes @@ -62,23 +60,24 @@ Open a [pull request] to submit changes to this project. Your pull request needs to meet the following guidelines for acceptance: -- The Nox test suite must pass without errors and warnings. -- Include unit tests. This project maintains 100% code coverage. +- The test suite should ideally pass without errors and warnings. +- Ideally add tests for your changes. - If your changes add functionality, update the documentation accordingly. Feel free to submit early, though—we can always iterate on this. -To run linting and code formatting checks before committing your change, you can install pre-commit as a Git hook by running the following command: +To run linting and code formatting checks before committing your change, you can run the following [make] command: ```console -$ nox --session=pre-commit -- install +$ make lint ``` -It is recommended to open an issue before starting work on anything. +It is recommended to open an issue before starting work on any major changes. This will allow a chance to talk it over with the owners and validate your approach. [pull request]: https://github.com/centre-for-humanities-computing/dacy/pulls +[make]: https://makefiletutorial.com -[code of conduct]: CODE_OF_CONDUCT.md +[code of conduct]: CODE_OF_CONDUCT.md \ No newline at end of file diff --git a/README.md b/README.md index 85e5896e..8744f1ef 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,14 @@ [![PyPI](https://img.shields.io/pypi/v/dacy.svg)][pypi status] [![pip downloads](https://img.shields.io/pypi/dm/dacy.svg)](https://pypi.org/project/dacy/) [![Python Version](https://img.shields.io/pypi/pyversions/dacy)][pypi status] -[![Black](https://img.shields.io/badge/code%20style-black-000000.svg)][black] +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)]([ruff]) [![documentation](https://github.com/centre-for-humanities-computing/dacy/actions/workflows/documentation.yml/badge.svg)][documentation] [![Tests](https://github.com/centre-for-humanities-computing/dacy/actions/workflows/tests.yml/badge.svg)][tests] [pypi status]: https://pypi.org/project/dacy/ [documentation]: https://centre-for-humanities-computing.github.io/DaCy/ [tests]: https://github.com/centre-for-humanities-computing/dacy/actions?workflow=Tests -[black]: https://github.com/psf/black +[ruff]: https://github.com/astral-sh/ruff diff --git a/makefile b/makefile new file mode 100644 index 00000000..e94def5b --- /dev/null +++ b/makefile @@ -0,0 +1,39 @@ +install: + @echo "--- 🚀 Installing project ---" + pip install -e ".[dev, docs, tests]" + +static-type-check: + @echo "--- 🔍 Running static type check ---" + pyright . 
+ +lint: + @echo "--- 🧹 Running linters ---" + pyproject-parser check pyproject.toml # check pyproject.toml + ruff format . # running ruff formatting + ruff . --fix # running ruff linting + +test: + @echo "--- 🧪 Running tests ---" + pytest tests/ + +pr: + @echo "--- 🚀 Running PR checks ---" + make lint + make static-type-check + make test + @echo "Ready to make a PR" + +build-docs: + @echo "--- 📚 Building docs ---" + @echo "Builds the docs and puts them in the 'site' folder" + mkdocs build + +view-docs: + @echo "--- 👀 Viewing docs ---" + mkdocs serve + +update-from-template: + @echo "--- 🔄 Updating from template ---" + @echo "This will update the project from the template, make sure to resolve any .rej files" + cruft update + diff --git a/pyproject.toml b/pyproject.toml index 3370978d..818c224e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,11 +50,9 @@ name = "Apache License 2.0" [project.optional-dependencies] dev = [ "cruft>=2.0.0", - "pyright==1.1.305", - "pyright-polite>=0.0.1", - "pre-commit>=2.20.0", + "pyright==1.1.328", "ruff>=0.0.262", - "black[jupyter]>=23.3.0", + "pyproject-parser[cli, readme]>=0.9.1", ] tests = ["pytest>=7.1.2", "pytest-cov>=3.0.0", "pytest-instafail>=0.4.2"] docs = [ @@ -112,6 +110,7 @@ exclude = [".*venv*", ".tox"] pythonPlatform = "Darwin" [tool.ruff] +extend-include = ["*.ipynb"] # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. select = [ "A", @@ -150,6 +149,7 @@ ignore = [ "F841", "RET504", "ANN202", + "COM812", ] ignore-init-module-imports = true # Allow autofix for all enabled rules (when `--fix`) is provided. @@ -190,6 +190,8 @@ exclude = [ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" target-version = "py38" +tool.ruff.lint.pydocstyle] +convention = "google" [tool.ruff.flake8-annotations] mypy-init-return = true suppress-none-returning = true @@ -206,35 +208,4 @@ version_toml = ["pyproject.toml:project.version"] build_command = "python -m pip install build; python -m build" [tool.setuptools] -include-package-data = true - - -[tool.tox] -legacy_tox_ini = """ -[tox] -envlist = py{39,310} - -[testenv] -description: run unit tests -extras = tests -use_develop = true -commands = - pytest {posargs:test} - -[testenv:type] -allowlist_externals = pyright -description: run type checks -extras = tests, dev -basepython = py39 # Setting these explicitly avoid recreating env if your shell is set to a different version -use_develop = true -commands = - pyright src/ - -[testenv:docs] -description: build docs -extras = docs -basepython = py39 # Setting these explicitly avoid recreating env if your shell is set to a different version -use_develop = true -commands = - sphinx-build -b html docs docs/_build/html -""" +include-package-data = true \ No newline at end of file diff --git a/tasks.py b/tasks.py deleted file mode 100644 index 20b84722..00000000 --- a/tasks.py +++ /dev/null @@ -1,427 +0,0 @@ -""" -This project uses Invoke (pyinvoke.org) for task management. -Install it via: - -``` -pip install invoke -``` - -And then run: - -``` -inv --list -``` - -If you do not wish to use invoke you can simply delete this file. 
-""" - - -import platform -import re -import shutil -from pathlib import Path -from typing import List, Optional - -from invoke import Context, Result, task - -# Extract supported python versions from the pyproject.toml classifiers key -SUPPORTED_PYTHON_VERSIONS = [ - line.split("::")[-1].strip().replace('"', "").replace(",", "") - for line in Path("pyproject.toml").read_text().splitlines() - if "Programming Language :: Python ::" in line -] - -NOT_WINDOWS = platform.system() != "Windows" - - -def echo_header(msg: str): - print(f"\n--- {msg} ---") - - -class MsgType: - # Emojis have to be encoded as bytes to not break the terminal on Windows - @property - def DOING(self) -> str: - return b"\xf0\x9f\xa4\x96".decode() if NOT_WINDOWS else "DOING:" - - @property - def GOOD(self) -> str: - return b"\xe2\x9c\x85".decode() if NOT_WINDOWS else "DONE:" - - @property - def FAIL(self) -> str: - return b"\xf0\x9f\x9a\xa8".decode() if NOT_WINDOWS else "FAILED:" - - @property - def WARN(self) -> str: - return b"\xf0\x9f\x9a\xa7".decode() if NOT_WINDOWS else "WARNING:" - - @property - def SYNC(self) -> str: - return b"\xf0\x9f\x9a\x82".decode() if NOT_WINDOWS else "SYNCING:" - - @property - def PY(self) -> str: - return b"\xf0\x9f\x90\x8d".decode() if NOT_WINDOWS else "" - - @property - def CLEAN(self) -> str: - return b"\xf0\x9f\xa7\xb9".decode() if NOT_WINDOWS else "CLEANING:" - - @property - def TEST(self) -> str: - return b"\xf0\x9f\xa7\xaa".decode() if NOT_WINDOWS else "TESTING:" - - @property - def COMMUNICATE(self) -> str: - return b"\xf0\x9f\x93\xa3".decode() if NOT_WINDOWS else "COMMUNICATING:" - - @property - def EXAMINE(self) -> str: - return b"\xf0\x9f\x94\x8d".decode() if NOT_WINDOWS else "VIEWING:" - - -msg_type = MsgType() - - -def git_init(c: Context, branch: str = "main"): - """Initialize a git repository if it does not exist yet.""" - # If no .git directory exits - if not Path(".git").exists(): - echo_header(f"{msg_type.DOING} Initializing Git repository") - c.run(f"git init -b {branch}") - c.run("git add .") - c.run("git commit -m 'Init'") - print(f"{msg_type.GOOD} Git repository initialized") - else: - print(f"{msg_type.GOOD} Git repository already initialized") - - -def setup_venv( - c: Context, - python_path: str, - venv_name: Optional[str] = None, -) -> str: - """Create a virtual environment if it does not exist yet. - - Args: - c: The invoke context. - python_path: The python executable to use. - venv_name: The name of the virtual environment. Defaults to ".venv". 
- """ - if venv_name is None: - venv_name = ".venv" - - if not Path(venv_name).exists(): - echo_header( - f"{msg_type.DOING} Creating virtual environment using {msg_type.PY}:{python_path}", - ) - c.run(f"{python_path} -m venv {venv_name}") - print(f"{msg_type.GOOD} Virtual environment created") - else: - print(f"{msg_type.GOOD} Virtual environment already exists") - return venv_name - - -def _add_commit(c: Context, msg: Optional[str] = None): - print(f"{msg_type.DOING} Adding and committing changes") - c.run("git add .") - - if msg is None: - msg = input("Commit message: ") - - c.run(f'git commit -m "{msg}"', pty=NOT_WINDOWS, hide=True) - print(f"{msg_type.GOOD} Changes added and committed") - - -def is_uncommitted_changes(c: Context) -> bool: - git_status_result: Result = c.run( - "git status --porcelain", - pty=NOT_WINDOWS, - hide=True, - ) - - uncommitted_changes = git_status_result.stdout != "" - return uncommitted_changes - - -def add_and_commit(c: Context, msg: Optional[str] = None): - """Add and commit all changes.""" - if is_uncommitted_changes(c): - uncommitted_changes_descr = c.run( - "git status --porcelain", - pty=NOT_WINDOWS, - hide=True, - ).stdout - - echo_header( - f"{msg_type.WARN} Uncommitted changes detected", - ) - - for line in uncommitted_changes_descr.splitlines(): - print(f" {line.strip()}") - print("\n") - _add_commit(c, msg=msg) - - -def branch_exists_on_remote(c: Context) -> bool: - branch_name = Path(".git/HEAD").read_text().split("/")[-1].strip() - - branch_exists_result: Result = c.run( - f"git ls-remote --heads origin {branch_name}", - hide=True, - ) - - return branch_name in branch_exists_result.stdout - - -def update_branch(c: Context): - echo_header(f"{msg_type.SYNC} Syncing branch with remote") - - if not branch_exists_on_remote(c): - c.run("git push --set-upstream origin HEAD") - else: - print("Pulling") - c.run("git pull") - print("Pushing") - c.run("git push") - - -def create_pr(c: Context): - c.run( - "gh pr create --web", - pty=NOT_WINDOWS, - ) - - -def update_pr(c: Context): - echo_header(f"{msg_type.COMMUNICATE} Syncing PR") - # Get current branch name - branch_name = Path(".git/HEAD").read_text().split("/")[-1].strip() - pr_result: Result = c.run( - "gh pr list --state OPEN", - pty=False, - hide=True, - ) - - if branch_name not in pr_result.stdout: - create_pr(c) - else: - open_web = input("Open in browser? [y/n] ") - if "y" in open_web.lower(): - c.run("gh pr view --web", pty=NOT_WINDOWS) - - -def exit_if_error_in_stdout(result: Result): - # Find N remaining using regex - - if "error" in result.stdout: - errors_remaining = re.findall(r"\d+(?=( remaining))", result.stdout)[ - 0 - ] # testing - if errors_remaining != "0": - exit(0) - - -def pre_commit(c: Context, auto_fix: bool): - """Run pre-commit checks.""" - - # Essential to have a clean working directory before pre-commit to avoid committing - # heterogenous files under a "style: linting" commit - if is_uncommitted_changes(c): - print( - f"{msg_type.WARN} Your git working directory is not clean. 
Stash or commit before running pre-commit.", - ) - exit(1) - - echo_header(f"{msg_type.CLEAN} Running pre-commit checks") - pre_commit_cmd = "pre-commit run --all-files" - result = c.run(pre_commit_cmd, pty=NOT_WINDOWS, warn=True) - - exit_if_error_in_stdout(result) - - if ("fixed" in result.stdout or "reformatted" in result.stdout) and auto_fix: - _add_commit(c, msg="style: Auto-fixes from pre-commit") - - print(f"{msg_type.DOING} Fixed errors, re-running pre-commit checks") - second_result = c.run(pre_commit_cmd, pty=NOT_WINDOWS, warn=True) - exit_if_error_in_stdout(second_result) - else: - if result.return_code != 0: - print(f"{msg_type.FAIL} Pre-commit checks failed") - exit(1) - - -@task -def static_type_checks(c: Context): - echo_header(f"{msg_type.CLEAN} Running static type checks") - c.run("tox -e type", pty=NOT_WINDOWS) - - -@task -def install( - c: Context, - pip_args: str = "", - msg: bool = True, - venv_path: Optional[str] = None, -): - """Install the project in editable mode using pip install""" - if msg: - echo_header(f"{msg_type.DOING} Installing project") - - extras = ".[dev,tests,docs]" if NOT_WINDOWS else ".[dev,tests,docs]" - install_cmd = f"pip install -e {extras} {pip_args}" - - if venv_path is not None and NOT_WINDOWS: - with c.prefix(f"source {venv_path}/bin/activate"): - c.run(install_cmd) - return - - c.run(install_cmd) - - -def get_python_path(preferred_version: str) -> Optional[str]: - """Get path to python executable.""" - preferred_version_path = shutil.which(f"python{preferred_version}") - - if preferred_version_path is not None: - return preferred_version_path - - print( - f"{msg_type.WARN}: python{preferred_version} not found, continuing with default python version", - ) - return shutil.which("python") - - -@task -def setup(c: Context, python_path: Optional[str] = None): - """Confirm that a git repo exists and setup a virtual environment. - - Args: - c: Invoke context - python_path: Path to the python executable to use for the virtual environment. Uses the return value of `which python` if not provided. - """ - git_init(c) - - if python_path is None: - # get path to python executable - python_path = get_python_path(preferred_version="3.9") - if not python_path: - print(f"{msg_type.FAIL} Python executable not found") - exit(1) - venv_name = setup_venv(c, python_path=python_path) - - install(c, pip_args="--upgrade", msg=False, venv_path=venv_name) - - if venv_name is not None: - print( - f"{msg_type.DOING} Activate your virtual environment by running: \n\n\t\t source {venv_name}/bin/activate \n", - ) - - -@task -def update(c: Context): - """Update dependencies.""" - echo_header(f"{msg_type.DOING} Updating project") - install(c, pip_args="--upgrade", msg=False) - - -@task(iterable="pytest_args") -def test( - c: Context, - python_versions: List[str] = (SUPPORTED_PYTHON_VERSIONS[0],), # type: ignore - pytest_args: List[str] = [], # noqa -): - """Run tests""" - # Invoke requires lists as type hints, but does not support lists as default arguments. - # Hence this super weird type hint and default argument for the python_versions arg. 
- echo_header(f"{msg_type.TEST} Running tests") - - python_version_strings = [f"py{v.replace('.', '')}" for v in python_versions] - python_version_arg_string = ",".join(python_version_strings) - - if not pytest_args: - pytest_args = [ - "tests", - "-rfE", - "--failed-first", - "-p no:cov", - "--disable-warnings", - "-q", - ] - - pytest_arg_str = " ".join(pytest_args) - - test_result: Result = c.run( - f"tox -e {python_version_arg_string} -- {pytest_arg_str}", - warn=True, - pty=NOT_WINDOWS, - ) - - # If "failed" in the pytest results - failed_tests = [line for line in test_result.stdout if line.startswith("FAILED")] - - if len(failed_tests) > 0: - print("\n\n\n") - echo_header("Failed tests") - print("\n\n\n") - echo_header("Failed tests") - - for line in failed_tests: - # Remove from start of line until /test_ - line_sans_prefix = line[line.find("test_") :] - - # Keep only that after :: - line_sans_suffix = line_sans_prefix[line_sans_prefix.find("::") + 2 :] - print(f"FAILED {msg_type.FAIL} #{line_sans_suffix} ") - - if test_result.return_code != 0: - exit(test_result.return_code) - - -def test_for_rej(): - # Get all paths in current directory or subdirectories that end in .rej - rej_files = list(Path(".").rglob("*.rej")) - - if len(rej_files) > 0: - print(f"\n{msg_type.FAIL} Found .rej files leftover from cruft update.\n") - for file in rej_files: - print(f" /{file}") - print("\nResolve the conflicts and try again. \n") - exit(1) - - -@task -def lint(c: Context, auto_fix: bool = False): - """Lint the project.""" - test_for_rej() - pre_commit(c=c, auto_fix=auto_fix) - static_type_checks(c) - - -@task -def pr(c: Context, auto_fix: bool = False): - """Run all checks and update the PR.""" - add_and_commit(c) - lint(c, auto_fix=auto_fix) - test(c, python_versions=SUPPORTED_PYTHON_VERSIONS) - update_branch(c) - update_pr(c) - - -@task -def docs(c: Context, view: bool = False, view_only: bool = False): - """ - Build and view docs. If neither build or view are specified, both are run. 
- """ - if not view_only: - echo_header(f"{msg_type.DOING}: Building docs") - c.run("tox -e docs") - - if view or view_only: - echo_header(f"{msg_type.EXAMINE}: Opening docs in browser") - # check the OS and open the docs in the browser - if platform.system() == "Windows": - c.run("start docs/_build/html/index.html") - else: - c.run("open docs/_build/html/index.html") diff --git a/training/main/requirements.txt b/training/main/requirements.txt index ae058fa2..a44d4e36 100644 --- a/training/main/requirements.txt +++ b/training/main/requirements.txt @@ -14,6 +14,3 @@ wandb >= 0.14.2 # for dataset handling conllu>=4.5.2 wikidata>=0.7.0 - -# style -black>=23.3.0 \ No newline at end of file From ee7e318ee49f5a020d2225c4111e27802a655e06 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 6 Dec 2023 21:09:53 +0100 Subject: [PATCH 4/6] lint: ran ruff --- docs/performance_ner.ipynb | 114 +++++++++++++++++--------- docs/tutorials/basic.ipynb | 24 +++--- docs/tutorials/hate-speech.ipynb | 9 +- docs/tutorials/robustness.ipynb | 7 +- docs/tutorials/sentiment.ipynb | 16 ++-- docs/tutorials/textdescriptives.ipynb | 9 +- makefile | 1 - pyproject.toml | 10 +-- 8 files changed, 117 insertions(+), 73 deletions(-) diff --git a/docs/performance_ner.ipynb b/docs/performance_ner.ipynb index 2a13d263..5424eb85 100644 --- a/docs/performance_ner.ipynb +++ b/docs/performance_ner.ipynb @@ -186,7 +186,7 @@ "dane = {}\n", "for mdl_name, model_getter in MODELS.items():\n", " mdl_results = apply_models(mdl_name, model_getter, dataset=\"dane\", splits=[\"test\"])\n", - " dane[mdl_name] = mdl_results[\"test\"]\n" + " dane[mdl_name] = mdl_results[\"test\"]" ] }, { @@ -214,8 +214,7 @@ " for ent in ents:\n", " ent.label_ = mapping[ent.label_]\n", "\n", - " e.x.ents = ents\n", - " " + " e.x.ents = ents" ] }, { @@ -231,6 +230,7 @@ "import pandas as pd\n", "from evaluation.utils import create_dataframe\n", "\n", + "\n", "def highlight_max(s: pd.Series) -> list:\n", " \"\"\"Highlight the maximum in a Series with bold text.\"\"\"\n", " # convert to str for comparison\n", @@ -291,10 +291,15 @@ "outputs": [], "source": [ "from multiprocessing import Pool\n", + "\n", "with Pool(8) as p:\n", " tables = p.starmap(\n", " create_dataframe,\n", - " [(dane[mdl][\"examples\"], mdl, 1, 500) for mdl in dane if \"fine_grained\" not in mdl],\n", + " [\n", + " (dane[mdl][\"examples\"], mdl, 1, 500)\n", + " for mdl in dane\n", + " if \"fine_grained\" not in mdl\n", + " ],\n", " )" ] }, @@ -545,13 +550,18 @@ "source": [ "from functools import partial\n", "from evaluation.models import openai_model_loader_fine_ner\n", + "\n", "MODELS_ = MODELS.copy()\n", - "MODELS_[\"openai/gpt-3.5-turbo (02/05/23)\"] = partial(openai_model_loader_fine_ner, model=\"gpt-3.5-turbo\")\n", - "MODELS_[\"openai/gpt-4 (02/05/23)\"] = partial(openai_model_loader_fine_ner, model=\"gpt-4\")\n", + "MODELS_[\"openai/gpt-3.5-turbo (02/05/23)\"] = partial(\n", + " openai_model_loader_fine_ner, model=\"gpt-3.5-turbo\"\n", + ")\n", + "MODELS_[\"openai/gpt-4 (02/05/23)\"] = partial(\n", + " openai_model_loader_fine_ner, model=\"gpt-4\"\n", + ")\n", "\n", "# don't test openai models on DANSK\n", "MODELS_.pop(\"openai/gpt-3.5-turbo (02/05/23)\")\n", - "MODELS_.pop(\"openai/gpt-4 (02/05/23)\")\n" + "MODELS_.pop(\"openai/gpt-4 (02/05/23)\")" ] }, { @@ -610,12 +620,10 @@ "dansk = {}\n", "for mdl_name, model_getter in MODELS_.items():\n", " if \"openai\" in mdl_name:\n", - " splits=[\"test\"]\n", + " splits = [\"test\"]\n", " else:\n", - " splits=[\"train\", \"dev\", 
\"test\"]\n", - " mdl_results = apply_models(\n", - " mdl_name, model_getter, dataset=\"dansk\", splits=splits\n", - " )\n", + " splits = [\"train\", \"dev\", \"test\"]\n", + " mdl_results = apply_models(mdl_name, model_getter, dataset=\"dansk\", splits=splits)\n", " dansk[mdl_name] = mdl_results" ] }, @@ -632,8 +640,12 @@ "with Pool(8) as p:\n", " tables = p.starmap(\n", " create_dataframe,\n", - " [(dansk[mdl][\"test\"][\"examples\"], mdl, 1, 100, 2000) for mdl in dansk if \"fine_grained\" in mdl],\n", - " )\n" + " [\n", + " (dansk[mdl][\"test\"][\"examples\"], mdl, 1, 100, 2000)\n", + " for mdl in dansk\n", + " if \"fine_grained\" in mdl\n", + " ],\n", + " )" ] }, { @@ -1123,8 +1135,13 @@ " if \"fine_grained\" not in mdl_name:\n", " continue\n", "\n", - " table = evaluate_generalization(examples=dansk[mdl_name][\"test\"][\"examples\"], mdl_name=mdl_name, n_rep=100, n_samples=1000)\n", - " tables.append(table)\n" + " table = evaluate_generalization(\n", + " examples=dansk[mdl_name][\"test\"][\"examples\"],\n", + " mdl_name=mdl_name,\n", + " n_rep=100,\n", + " n_samples=1000,\n", + " )\n", + " tables.append(table)" ] }, { @@ -1325,9 +1342,14 @@ " # examples += dansk[mdl_name][\"dev\"][\"examples\"]\n", " # examples += dansk[mdl_name][\"train\"][\"examples\"]\n", "\n", - " \n", " examples = convert_to_conll_2003(examples)\n", - " table = evaluate_generalization(mdl_name, examples, n_rep=100, n_samples=1000, create_row_fn=create_row_conll2003)\n", + " table = evaluate_generalization(\n", + " mdl_name,\n", + " examples,\n", + " n_rep=100,\n", + " n_samples=1000,\n", + " create_row_fn=create_row_conll2003,\n", + " )\n", " tables.append(table)\n", "\n", "tables = pd.concat(tables, axis=0)" @@ -1344,7 +1366,7 @@ "outputs": [], "source": [ "df = tables\n", - "df = df[df[\"Domain\"] != \"dannet\"] # type: ignore\n", + "df = df[df[\"Domain\"] != \"dannet\"] # type: ignore\n", "df = df[df[\"Domain\"].notnull()]" ] }, @@ -1486,7 +1508,8 @@ "chart = base + error_bars\n", "\n", "chart = chart.add_params(selection, param_checkbox).properties(\n", - " width=400, height=300,\n", + " width=400,\n", + " height=300,\n", " title=\"Generalization to Unseen Domains\",\n", ")\n", "\n", @@ -1530,10 +1553,17 @@ "# add ci to average\n", "\n", "df = tables\n", - "df = df[df[\"Domain\"] != \"dannet\"] # type: ignore\n", + "df = df[df[\"Domain\"] != \"dannet\"] # type: ignore\n", "df = df[df[\"Domain\"].notnull()]\n", "\n", - "df[\"Average F1\"] = df[\"Average\"].round(2).astype(str) + \" (\" + df[\"Average Lower CI\"].round(2).astype(str) + \", \" + df[\"Average Upper CI\"].round(2).astype(str) + \")\"\n", + "df[\"Average F1\"] = (\n", + " df[\"Average\"].round(2).astype(str)\n", + " + \" (\"\n", + " + df[\"Average Lower CI\"].round(2).astype(str)\n", + " + \", \"\n", + " + df[\"Average Upper CI\"].round(2).astype(str)\n", + " + \")\"\n", + ")\n", "\n", "df.drop([\"Average Lower CI\", \"Average Upper CI\"], axis=1, inplace=True)\n", "df.drop([\"Number of docs\", \"Average\"], axis=1, inplace=True)\n", @@ -1552,12 +1582,11 @@ "\n", "# print latex\n", "latex = style.to_latex(\n", - " hrules=True,\n", - " convert_css=True,\n", - " )\n", + " hrules=True,\n", + " convert_css=True,\n", + ")\n", "\n", - "print(latex)\n", - "\n" + "print(latex)" ] }, { @@ -1637,6 +1666,8 @@ "outputs": [], "source": [ "from collections import defaultdict\n", + "\n", + "\n", "def augmentation_specific_examples(examples):\n", " aug_group = defaultdict(list)\n", " for example in examples:\n", @@ -1678,7 +1709,7 @@ "\n", " aug_group = 
augmentation_specific_examples(examples)\n", " for aug_name, _examples in aug_group.items():\n", - " _examples = convert_to_conll_2003(_examples) # also removes misc.\n", + " _examples = convert_to_conll_2003(_examples) # also removes misc.\n", " table = create_dataframe(_examples, mdl, n_rep=100, n_samples=1000)\n", " table[\"Augmentation\"] = aug_name\n", " tables.append(table)" @@ -1694,7 +1725,7 @@ }, "outputs": [], "source": [ - "df = pd.concat(tables)\n" + "df = pd.concat(tables)" ] }, { @@ -1709,7 +1740,6 @@ "source": [ "# create the table\n", "def create_table(df, model_order: list[str], baseline=df_average):\n", - "\n", " table_df = df[[\"Models\", \"Augmentation\", \"Average\"]]\n", "\n", " table_df = table_df.pivot(index=\"Models\", columns=\"Augmentation\", values=\"Average\")\n", @@ -1722,7 +1752,6 @@ " # order the columns\n", " table_df = table_df[[\"Baseline\"] + list(table_df.columns[:-1])]\n", "\n", - "\n", " # create augmentation superheader\n", "\n", " aug_superheader = [(\"\", \"Baseline\")]\n", @@ -1736,7 +1765,9 @@ " s = s.apply(underline_second_max, axis=0, subset=df.columns[1:])\n", "\n", " # Add a caption\n", - " s = s.set_caption(\"F1 score for each augmentation with 95% confidence interval calculated over 100 repetitions\")\n", + " s = s.set_caption(\n", + " \"F1 score for each augmentation with 95% confidence interval calculated over 100 repetitions\"\n", + " )\n", "\n", " # Center the header and left align the model names\n", " s = s.set_properties(subset=df.columns[1:], **{\"text-align\": \"right\"})\n", @@ -1753,8 +1784,7 @@ " s = s.hide(axis=\"index\")\n", " # smaller font\n", " s = s.set_table_attributes('style=\"font-size: 0.65em\"')\n", - " return s\n", - "\n" + " return s" ] }, { @@ -1996,7 +2026,7 @@ "\n", " aug_group = augmentation_specific_examples(examples)\n", " for aug_name, _examples in aug_group.items():\n", - " _examples = convert_to_conll_2003(_examples) # also removes misc.\n", + " _examples = convert_to_conll_2003(_examples) # also removes misc.\n", " table = create_dataframe(_examples, mdl, n_rep=100, n_samples=1000)\n", " table[\"Augmentation\"] = aug_name\n", " tables.append(table)" @@ -2226,7 +2256,9 @@ " examples = dane[mdl_name][\"examples\"]\n", " n_words = sum(len(e.y) for e in examples)\n", " wps = n_words / total_time\n", - " rows.append({\"Model\": mdl_name, \"Words per second\": wps, \"Total time (sec)\": total_time})\n", + " rows.append(\n", + " {\"Model\": mdl_name, \"Words per second\": wps, \"Total time (sec)\": total_time}\n", + " )\n", "\n", "speed = pd.DataFrame(rows)" ] @@ -2347,21 +2379,25 @@ " is_min = s == s.min()\n", " return [\"font-weight: bold\" if v else \"\" for v in is_min]\n", "\n", + "\n", "def highlight_max(s):\n", " \"\"\"highlight the minimum in a series with bold\"\"\"\n", " is_max = s == s.max()\n", " return [\"font-weight: bold\" if v else \"\" for v in is_max]\n", "\n", - "style= style.apply(highlight_min, axis=0, subset=[\"Total time (sec)\"])\n", + "\n", + "style = style.apply(highlight_min, axis=0, subset=[\"Total time (sec)\"])\n", "style = style.apply(highlight_max, axis=0, subset=[\"Words per second\"])\n", "\n", - "style = style.set_properties(subset=[\"Words per second\", \"Total time (sec)\"], **{\"text-align\": \"right\"})\n", + "style = style.set_properties(\n", + " subset=[\"Words per second\", \"Total time (sec)\"], **{\"text-align\": \"right\"}\n", + ")\n", "# set decimal places\n", "style = style.format({\"Words per second\": \"{:.1f}\", \"Total time (sec)\": \"{:.2f}\"})\n", "\n", "style 
= style.hide(axis=\"index\")\n", "style = style.set_properties(subset=[\"Model\"], **{\"text-align\": \"left\"})\n", - "style\n" + "style" ] }, { diff --git a/docs/tutorials/basic.ipynb b/docs/tutorials/basic.ipynb index e3076941..01568b5f 100644 --- a/docs/tutorials/basic.ipynb +++ b/docs/tutorials/basic.ipynb @@ -46,6 +46,7 @@ ], "source": [ "import dacy\n", + "\n", "for model in dacy.models():\n", " print(model)" ] @@ -352,20 +353,18 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "displacy.render(doc, style=\"ent\")\n", "\n", "\n", - "client = Client() # start wikidata client\n", + "client = Client() # start wikidata client\n", "for entity in doc.ents:\n", " print(entity, \":\", entity.kb_id_)\n", "\n", " # print the short description derived from wikidata\n", - " wikidata_entry = client.get(entity.kb_id_, load=True)\n", + " wikidata_entry = client.get(entity.kb_id_, load=True)\n", " print(wikidata_entry.description.get(\"en\"))\n", " print(wikidata_entry.description.get(\"da\"))\n", - " print(\" \")\n", - " \n" + " print(\" \")" ] }, { @@ -443,7 +442,7 @@ "# add the ner component from the state-of-the-art fine-grained model\n", "nlp.add_pipe(\"dacy/ner-fine-grained\", config={\"size\": \"small\"})\n", "# or if you only want to do just NER\n", - "# nlp = dacy.load(\"da_dacy_small_ner_fine_grained-0.1.0\")\n" + "# nlp = dacy.load(\"da_dacy_small_ner_fine_grained-0.1.0\")" ] }, { @@ -485,7 +484,9 @@ } ], "source": [ - "doc = nlp(\"Denne model samt 3 andre blev trænet d. 7. marts af Center for Humantities Computing i Aarhus kommune\")\n", + "doc = nlp(\n", + " \"Denne model samt 3 andre blev trænet d. 7. marts af Center for Humantities Computing i Aarhus kommune\"\n", + ")\n", "\n", "displacy.render(doc, style=\"ent\")" ] @@ -535,8 +536,7 @@ "source": [ "print(\"Token POS-tag\")\n", "for token in doc:\n", - " print(f\"{token}:\\t {token.pos_}\")\n", - "\n" + " print(f\"{token}:\\t {token.pos_}\")" ] }, { @@ -713,7 +713,9 @@ } ], "source": [ - "doc = nlp(\"Sætnings segmentering er en vigtig del af sprogprocessering - Det kan bl.a. benyttes til at opdele lange tekster i mindre bidder uden at miste meningen i hvert sætning.\")\n", + "doc = nlp(\n", + " \"Sætnings segmentering er en vigtig del af sprogprocessering - Det kan bl.a. benyttes til at opdele lange tekster i mindre bidder uden at miste meningen i hvert sætning.\"\n", + ")\n", "\n", "for sent in doc.sents:\n", " print(sent)" @@ -839,7 +841,7 @@ "text = \"Den 4. november 2020 fik minkavler Henning Christensen og hele familien et chok. Efter et pressemøde, fik han at vide at alle mink i Danmark skulle aflives. 
Dermed fik han fjernet hans livsgrundlag\"\n", "doc = nlp(text)\n", "print(\"Coreference clusters:\")\n", - "print(doc.spans)\n" + "print(doc.spans)" ] }, { diff --git a/docs/tutorials/hate-speech.ipynb b/docs/tutorials/hate-speech.ipynb index 22c1bf0f..01260862 100644 --- a/docs/tutorials/hate-speech.ipynb +++ b/docs/tutorials/hate-speech.ipynb @@ -82,7 +82,7 @@ "import dacy\n", "import spacy\n", "\n", - "nlp = spacy.blank(\"da\") # create an empty pipeline\n", + "nlp = spacy.blank(\"da\") # create an empty pipeline\n", "\n", "# add the hate speech models\n", "nlp.add_pipe(\"dacy/hatespeech_detection\")\n", @@ -118,10 +118,7 @@ } ], "source": [ - "texts = [\n", - " \"senile gamle idiot\", \n", - " \"hej har du haft en god dag\"\n", - "]\n", + "texts = [\"senile gamle idiot\", \"hej har du haft en god dag\"]\n", "\n", "# apply the pipeline\n", "docs = nlp.pipe(texts)\n", @@ -131,7 +128,7 @@ " print(doc._.is_offensive)\n", " # print type of hate-speech if it is hate-speech\n", " if doc._.is_offensive == \"offensive\":\n", - " print(\"\\t\", doc._.hate_speech_type)\n" + " print(\"\\t\", doc._.hate_speech_type)" ] } ], diff --git a/docs/tutorials/robustness.ipynb b/docs/tutorials/robustness.ipynb index 4832141d..30d48e3e 100644 --- a/docs/tutorials/robustness.ipynb +++ b/docs/tutorials/robustness.ipynb @@ -360,13 +360,16 @@ } ], "source": [ - "\n", "lower_aug = create_lower_casing_augmenter(level=1)\n", "female_name_dict = female_names()\n", "# Augmenter that replaces names with random Danish female names. Keep the format of the name as is (force_pattern_size=False)\n", "# but replace the name with one of the two defined patterns\n", "\n", - "patterns = [[\"firstname\"], [\"firstname\", \"lastname\"], [\"firstname\", \"firstname\", \"lastname\"]]\n", + "patterns = [\n", + " [\"firstname\"],\n", + " [\"firstname\", \"lastname\"],\n", + " [\"firstname\", \"firstname\", \"lastname\"],\n", + "]\n", "female_aug = create_per_replace_augmenter_v1(female_name_dict, patterns, level=0.1)\n", "\n", "spacy_aug = score(\n", diff --git a/docs/tutorials/sentiment.ipynb b/docs/tutorials/sentiment.ipynb index cdeadef5..a680afe7 100644 --- a/docs/tutorials/sentiment.ipynb +++ b/docs/tutorials/sentiment.ipynb @@ -79,7 +79,7 @@ "import dacy\n", "import spacy\n", "\n", - "nlp = spacy.blank(\"da\") # an empty spacy pipeline\n", + "nlp = spacy.blank(\"da\") # an empty spacy pipeline\n", "# could also be a dacy pipeline, e.g. nlp = dacy.load(\"large\")\n", "nlp.add_pipe(\"dacy/subjectivity\")" ] @@ -159,7 +159,7 @@ } ], "source": [ - "nlp = spacy.blank(\"da\") # an empty spacy pipeline\n", + "nlp = spacy.blank(\"da\") # an empty spacy pipeline\n", "# could also be a dacy pipeline, e.g. nlp = dacy.load(\"large\")\n", "nlp.add_pipe(\"dacy/polarity\")" ] @@ -256,10 +256,10 @@ } ], "source": [ - "nlp = spacy.blank(\"da\") # an empty spacy pipeline\n", + "nlp = spacy.blank(\"da\") # an empty spacy pipeline\n", "# could also be a dacy pipeline, e.g. nlp = dacy.load(\"large\")\n", - "nlp.add_pipe(\"dacy/emotionally_laden\") # for emotianal/non-emotional\n", - "nlp.add_pipe(\"dacy/emotion\") # for type of emotion" + "nlp.add_pipe(\"dacy/emotionally_laden\") # for emotianal/non-emotional\n", + "nlp.add_pipe(\"dacy/emotion\") # for type of emotion" ] }, { @@ -298,7 +298,7 @@ " \"Ej den bil er såå flot\",\n", " \"Fuck det er bare så FUCKING træls!\",\n", " \"Har i set at Tesla har landet en raket på månen? 
Det er vildt!!\",\n", - " \"der er et træ i haven\"\n", + " \"der er et træ i haven\",\n", "]\n", "\n", "docs = nlp.pipe(texts)\n", @@ -393,7 +393,9 @@ ], "source": [ "for token in doc:\n", - " print(f\"{token._.polarity} | Valence: {token._.valence} | Negation: {token._.is_negation}\")" + " print(\n", + " f\"{token._.polarity} | Valence: {token._.valence} | Negation: {token._.is_negation}\"\n", + " )" ] }, { diff --git a/docs/tutorials/textdescriptives.ipynb b/docs/tutorials/textdescriptives.ipynb index 6090b3ea..0892c5c3 100644 --- a/docs/tutorials/textdescriptives.ipynb +++ b/docs/tutorials/textdescriptives.ipynb @@ -183,7 +183,8 @@ ], "source": [ "import dacy\n", - "nlp = dacy.load(\"small\") # load the latest version of the small model\n", + "\n", + "nlp = dacy.load(\"small\") # load the latest version of the small model\n", "\n", "nlp.add_pipe(\"textdescriptives/readability\")\n", "nlp.add_pipe(\"textdescriptives/dependency_distance\")" @@ -230,6 +231,7 @@ ], "source": [ "import textdescriptives as td\n", + "\n", "# extract the metrics as a dataframe\n", "metrics = td.extract_df(doc, include_text=False)" ] @@ -523,6 +525,7 @@ ], "source": [ "import seaborn as sns\n", + "\n", "sns.boxplot(x=\"label\", y=\"lix\", data=df)" ] }, @@ -563,7 +566,9 @@ "# encode the label as a boolean\n", "df[\"is_ham\"] = df[\"label\"] == \"ham\"\n", "# compute the correlation between all metrics and the label\n", - "metrics_correlations = metrics.corrwith(df[\"is_ham\"]).sort_values(key=abs, ascending=False)\n", + "metrics_correlations = metrics.corrwith(df[\"is_ham\"]).sort_values(\n", + " key=abs, ascending=False\n", + ")\n", "metrics_correlations[:10]" ] }, diff --git a/makefile b/makefile index e94def5b..c10f5b5e 100644 --- a/makefile +++ b/makefile @@ -8,7 +8,6 @@ static-type-check: lint: @echo "--- 🧹 Running linters ---" - pyproject-parser check pyproject.toml # check pyproject.toml ruff format . # running ruff formatting ruff . 
--fix # running ruff linting diff --git a/pyproject.toml b/pyproject.toml index 818c224e..0627c5e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,9 +50,8 @@ name = "Apache License 2.0" [project.optional-dependencies] dev = [ "cruft>=2.0.0", - "pyright==1.1.328", - "ruff>=0.0.262", - "pyproject-parser[cli, readme]>=0.9.1", + "pyright>=1.1.328", + "ruff>=0.0.270", ] tests = ["pytest>=7.1.2", "pytest-cov>=3.0.0", "pytest-instafail>=0.4.2"] docs = [ @@ -190,8 +189,9 @@ exclude = [ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" target-version = "py38" -tool.ruff.lint.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "google" + [tool.ruff.flake8-annotations] mypy-init-return = true suppress-none-returning = true @@ -208,4 +208,4 @@ version_toml = ["pyproject.toml:project.version"] build_command = "python -m pip install build; python -m build" [tool.setuptools] -include-package-data = true \ No newline at end of file +include-package-data = true From cac4d92fc50569f5658dfae2039584c1c5ddbbff Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 6 Dec 2023 21:17:59 +0100 Subject: [PATCH 5/6] changed type checker to pyright --- makefile | 2 +- pyproject.toml | 2 +- src/dacy/datasets/dane.py | 8 ++++---- src/dacy/datasets/names.py | 14 +++++++------- src/dacy/download.py | 12 ++++++------ src/dacy/hate_speech/wrapped_models.py | 10 +++++----- src/dacy/load.py | 4 ++-- src/dacy/ner/fine_grained.py | 6 +++--- src/dacy/score/input_length.py | 6 +++--- src/dacy/score/score.py | 10 +++++----- src/dacy/sentiment/wrapped_models.py | 10 +++++----- 11 files changed, 42 insertions(+), 42 deletions(-) diff --git a/makefile b/makefile index c10f5b5e..d662f628 100644 --- a/makefile +++ b/makefile @@ -4,7 +4,7 @@ install: static-type-check: @echo "--- 🔍 Running static type check ---" - pyright . + pyright src/ lint: @echo "--- 🧹 Running linters ---" diff --git a/pyproject.toml b/pyproject.toml index 0627c5e2..0c62e71b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ name = "Apache License 2.0" [project.optional-dependencies] dev = [ "cruft>=2.0.0", - "pyright>=1.1.328", + "pyright>=1.1.339", "ruff>=0.0.270", ] tests = ["pytest>=7.1.2", "pytest-cov>=3.0.0", "pytest-instafail>=0.4.2"] diff --git a/src/dacy/datasets/dane.py b/src/dacy/datasets/dane.py index d3dfb3c6..e1332aae 100644 --- a/src/dacy/datasets/dane.py +++ b/src/dacy/datasets/dane.py @@ -14,13 +14,13 @@ def dane( # noqa - save_path: Optional[PathLike] = None, - splits: List[str] = ["train", "dev", "test"], # noqa + save_path: Optional[PathLike] = None, # type: ignore + splits: List[str] = ["train", "dev", "test"], # noqa # type: ignore redownload: bool = False, n_sents: int = 1, open_unverified_connection: bool = False, **kwargs, # noqa -) -> Union[List[Corpus], Corpus]: +) -> Union[List[Corpus], Corpus]: # type: ignore """Reads the DaNE dataset as a spacy Corpus. 
Args: @@ -110,5 +110,5 @@ def dane( # noqa for split in splits: corpora.append(Corpus(save_path / paths[split])) # type: ignore if len(corpora) == 1: - return corpora[0] + return corpora[0] # type: ignore return corpora diff --git a/src/dacy/datasets/names.py b/src/dacy/datasets/names.py index 8c0f716a..06365ae4 100644 --- a/src/dacy/datasets/names.py +++ b/src/dacy/datasets/names.py @@ -8,10 +8,10 @@ def load_names( min_count: int = 0, - ethnicity: Optional[str] = None, - gender: Optional[str] = None, + ethnicity: Optional[str] = None, # type: ignore + gender: Optional[str] = None, # type: ignore min_prop_gender: float = 0, -) -> Dict[str, List[str]]: +) -> Dict[str, List[str]]: # type: ignore """Loads the names lookup table. Danish are from Danmarks statistik (2021). Muslim names are from Meldgaard (2005), https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/. @@ -64,7 +64,7 @@ def load_names( } -def muslim_names() -> Dict[str, List[str]]: +def muslim_names() -> Dict[str, List[str]]: # type: ignore """Returns a dictionary of Muslim names. Returns: @@ -81,7 +81,7 @@ def muslim_names() -> Dict[str, List[str]]: return load_names(ethnicity="muslim") -def danish_names() -> Dict[str, List[str]]: +def danish_names() -> Dict[str, List[str]]: # type: ignore """Returns a dictionary of Danish names. Returns: @@ -98,7 +98,7 @@ def danish_names() -> Dict[str, List[str]]: return load_names(ethnicity="danish") -def female_names() -> Dict[str, List[str]]: +def female_names() -> Dict[str, List[str]]: # type: ignore """Returns a dictionary of Danish female names. Returns: @@ -114,7 +114,7 @@ def female_names() -> Dict[str, List[str]]: return load_names(ethnicity="danish", gender="female", min_prop_gender=0.5) -def male_names() -> Dict[str, List[str]]: +def male_names() -> Dict[str, List[str]]: # type: ignore """Returns a dictionary of Danish male names. 
Returns: diff --git a/src/dacy/download.py b/src/dacy/download.py index b602101a..45f98323 100644 --- a/src/dacy/download.py +++ b/src/dacy/download.py @@ -4,7 +4,7 @@ from pathlib import Path from spacy.util import get_installed_models -from tqdm import tqdm +from tqdm import tqdm # type: ignore DACY_DEFAULT_PATH = Path.home() / ".dacy" @@ -40,10 +40,10 @@ def get_latest_version(model: str) -> str: versions = [mdl.split("-")[-1] for mdl in models_url if mdl.startswith(model)] versions = sorted( versions, - key=lambda s: [int(u) for u in s.split(".")], + key=lambda s: [int(u) for u in s.split(".")], # type: ignore reverse=True, ) - return versions[0] + return versions[0] # type: ignore def models() -> list[str]: @@ -69,7 +69,7 @@ def download_url(url: str, output_path: str) -> None: unit="B", unit_scale=True, miniters=1, - desc=url.split("/")[-1], + desc=url.split("/")[-1], # type: ignore ) as t: urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to) @@ -104,7 +104,7 @@ def download_model( if model in {"small", "medium", "large"}: latest_version = get_latest_version(model) model = f"da_dacy_{model}_trf-{latest_version}" - mdl_version = model.split("-")[-1] + mdl_version = model.split("-")[-1] # type: ignore if model not in models_url: raise ValueError( @@ -112,7 +112,7 @@ def download_model( + " list of all models", ) - mdl = model.split("-")[0] + mdl = model.split("-")[0] # type: ignore if mdl in get_installed_models() and not force and version(mdl) == mdl_version: return mdl install(models_url[model]) diff --git a/src/dacy/hate_speech/wrapped_models.py b/src/dacy/hate_speech/wrapped_models.py index b4818437..86383ea0 100644 --- a/src/dacy/hate_speech/wrapped_models.py +++ b/src/dacy/hate_speech/wrapped_models.py @@ -76,11 +76,11 @@ def make_offensive_transformer( nlp: Language, name: str, model: Model[List[Doc], FullTransformerBatch], - set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], + set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], # type: ignore max_batch_items: int, doc_extension_trf_data: str, doc_extension_prediction: str, - labels: List[str], + labels: List[str], # type: ignore ) -> SequenceClassificationTransformer: if not Doc.has_extension("is_offensive"): warn( @@ -107,11 +107,11 @@ def make_offensive_transformer( # offensive if Doc.has_extension("is_offensive"): - def label_getter(doc) -> Optional[str]: # noqa + def label_getter(doc) -> Optional[str]: # noqa # type: ignore if doc._.is_offensive == "offensive": - prob = getattr(doc._, f"{doc_extension_prediction}_prob") + prob = getattr(doc._, f"{doc_extension_prediction}_prob") # type: ignore if prob["prob"] is not None: - return labels[int(prob["prob"].argmax())] + return labels[int(prob["prob"].argmax())] # type: ignore return doc._.is_offensive Doc.set_extension(doc_extension_prediction, getter=label_getter, force=True) diff --git a/src/dacy/load.py b/src/dacy/load.py index 3e25c99f..52cedb6e 100644 --- a/src/dacy/load.py +++ b/src/dacy/load.py @@ -12,7 +12,7 @@ def load( model: str, force: bool = False, - **kwargs: Any, + **kwargs: Any, # type: ignore ) -> Language: """Load a DaCy model as a SpaCy text processing pipeline. If the model is not downloaded it will also download the model. @@ -38,7 +38,7 @@ def load( return spacy.load(path, **kwargs) -def where_is_my_dacy(verbose: bool = True) -> Union[str, Path]: +def where_is_my_dacy(verbose: bool = True) -> Union[str, Path]: # type: ignore """Returns a path to where DaCy models are located. 
The default the model location can be configured with the environmental variable `DACY_CACHE_DIR`. diff --git a/src/dacy/ner/fine_grained.py b/src/dacy/ner/fine_grained.py index 049c62d7..8ff01252 100644 --- a/src/dacy/ner/fine_grained.py +++ b/src/dacy/ner/fine_grained.py @@ -18,10 +18,10 @@ def create_finegrained_ner_component( nlp: Language, name: str, - size: Literal["small", "medium", "large"], + size: Literal["small", "medium", "large"], # type: ignore transformer_name: str, - version: Optional[str], -) -> Callable[[Doc], Doc]: + version: Optional[str], # type: ignore +) -> Callable[[Doc], Doc]: # type: ignore """Create a fine grained NER component using the dacy models. Args: diff --git a/src/dacy/score/input_length.py b/src/dacy/score/input_length.py index 78eb68fe..32007a70 100644 --- a/src/dacy/score/input_length.py +++ b/src/dacy/score/input_length.py @@ -10,11 +10,11 @@ def n_sents_score( - n_sents: Union[int, List[int]], - apply_fn: Callable, + n_sents: Union[int, List[int]], # type: ignore + apply_fn: Callable, # type: ignore dataset: str = "dane", split: str = "test", - score_fn: List[Union[str, Callable]] = ["token", "pos", "ents", "dep"], # noqa + score_fn: List[Union[str, Callable]] = ["token", "pos", "ents", "dep"], # noqa # type: ignore verbose: bool = True, **kwargs, # noqa ) -> pd.DataFrame: diff --git a/src/dacy/score/score.py b/src/dacy/score/score.py index cb0913cc..634de5cb 100644 --- a/src/dacy/score/score.py +++ b/src/dacy/score/score.py @@ -3,7 +3,7 @@ from copy import copy from functools import partial -from time import time +from time import time # type: ignore from typing import Callable, Iterable import pandas as pd @@ -17,7 +17,7 @@ from ..utils import flatten_dict -def no_misc_getter(doc: Doc, attr: str) -> Iterable[Span]: +def no_misc_getter(doc: Doc, attr: str) -> Iterable[Span]: # type: ignore """A utility getter for scoring entities without including MISC. 
Args: @@ -27,7 +27,7 @@ def no_misc_getter(doc: Doc, attr: str) -> Iterable[Span]: Returns: Iterable[Span] """ - spans = getattr(doc, attr) + spans = getattr(doc, attr) # type: ignore for span in spans: if span.label_ == "MISC": continue @@ -35,7 +35,7 @@ def no_misc_getter(doc: Doc, attr: str) -> Iterable[Span]: def dep_getter(token, attr): # noqa - dep = getattr(token, attr) + dep = getattr(token, attr) # type: ignore dep = token.vocab.strings.as_string(dep).lower() return dep @@ -149,7 +149,7 @@ def __score(augmenter): # noqa: ANN001 corpus_ = copy(corpus) corpus_.augmenter = augmenter scores_ls = [] - for _i in range(k): + for _i in range(k): # type: ignore s = time() examples = apply_fn(corpus_(nlp)) # type: ignore speed = time() - s diff --git a/src/dacy/sentiment/wrapped_models.py b/src/dacy/sentiment/wrapped_models.py index a4cae25a..5e8ada9e 100644 --- a/src/dacy/sentiment/wrapped_models.py +++ b/src/dacy/sentiment/wrapped_models.py @@ -133,11 +133,11 @@ def make_emotion_transformer( nlp: Language, name: str, model: Model[List[Doc], FullTransformerBatch], - set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], + set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], # type: ignore max_batch_items: int, doc_extension_trf_data: str, doc_extension_prediction: str, - labels: List[str], + labels: List[str], # type: ignore ) -> SequenceClassificationTransformer: if not Doc.has_extension("dacy/emotionally_laden"): warn( @@ -164,11 +164,11 @@ def make_emotion_transformer( # an emotion if Doc.has_extension("dacy/emotionally_laden"): - def label_getter(doc) -> Optional[str]: # noqa: ANN001 + def label_getter(doc) -> Optional[str]: # noqa: ANN001 # type: ignore if doc._.emotionally_laden == "emotional": - prob = getattr(doc._, f"{doc_extension_prediction}_prob") + prob = getattr(doc._, f"{doc_extension_prediction}_prob") # type: ignore if prob["prob"] is not None: - return labels[int(prob["prob"].argmax())] + return labels[int(prob["prob"].argmax())] # type: ignore return doc._.emotionally_laden Doc.set_extension(doc_extension_prediction, getter=label_getter, force=True) From 72afda1c2eccfb1b3b17887cc3e2c5bc97e2c0fc Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 6 Dec 2023 21:25:10 +0100 Subject: [PATCH 6/6] ci: fix misspelled cmmand --- .github/workflows/static_type_checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static_type_checks.yml b/.github/workflows/static_type_checks.yml index 04faedde..9975945a 100644 --- a/.github/workflows/static_type_checks.yml +++ b/.github/workflows/static_type_checks.yml @@ -31,4 +31,4 @@ jobs: - name: Run static type checker shell: bash run: | - make static-type-checks + make static-type-check