Es mapping #3

Open · wants to merge 4 commits into base: dev

79 changes: 41 additions & 38 deletions cli.py
@@ -1,3 +1,5 @@
+from csv import DictReader
+import io
 import json
 import pprint
 import re
@@ -85,10 +87,7 @@ def search(query, indexes, term):
     else:
         body = query

-    config = {
-        "index": indexes if indexes else app.all_indexes,
-        "body": body
-    }
+    config = {"index": indexes if indexes else app.all_indexes, "body": body}

     result = app.elasticsearch.search(**config)
     print("\n", "=" * 12, " RESULT ", "=" * 12)
@@ -129,53 +128,58 @@ def index(years):
     """

     # BUILD THE METADATA DICT FROM THE GITHUB TSV FILE
-    response = requests.get(app.config['METADATA_FILE_URL'])
+    response = requests.get(app.config["METADATA_FILE_URL"])
     metadata = {}
-    lines = response.text.splitlines()
-    header = lines.pop(0).split('\t')
-    for line in lines:
-        _d = {}
-        # replace empty strings with null values
-        _values = [v if v != "" else None for v in line.split('\t')]
-        for i, k in enumerate(header):
-            # filter indexable columns
-            if k in app.config['METADATA_FILE_INDEXABLE_COLUMNS']:
-                # brutally try to cast values as integer
-                try:
-                    _d[k] = int(_values[i])
-                except (TypeError, ValueError):
-                    _d[k] = _values[i]
-
-        metadata[_d['id']] = _d
-        # remove id from nested metadata object
-        metadata[_d['id']].pop("id")
-
-    _DTS_URL = app.config['DTS_URL']
+    reader = DictReader(io.StringIO(response.text), delimiter="\t")
+    for row in reader:
+        try:
+            metadata[row["id"]] = {
+                "author_name": row["author_name"],
+                "author_firstname": row["author_firstname"],
+                "title_rich": row["title_rich"],
+                "promotion_year": int(row["promotion_year"]) if row["promotion_year"] else None,
+                "topic_notBefore": int(row["topic_notBefore"]) if row["topic_notBefore"] else None,
+                "topic_notAfter": int(row["topic_notAfter"]) if row["topic_notAfter"] else None,
+                "author_gender": int(row["author_gender"]) if row["author_gender"] else None,
+                # 1/2, verify that there is no other value
+                "author_is_enc_teacher": 1 if row["author_is_enc_teacher"] == "1" else None,
+            }
+        except Exception as exc:
+            print(f"ERROR while indexing {row['id']}, {exc}")
+
+    _DTS_URL = app.config["DTS_URL"]

     # INDEX THE DOCUMENTS
+    all_docs = []
     try:
-        _index_name = app.config['DOCUMENT_INDEX']
+        _index_name = app.config["DOCUMENT_INDEX"]
         if years == "all":
-            years = app.config['ALL_YEARS']
-        start_year, end_year = (int(y) for y in years.split('-'))
+            years = app.config["ALL_YEARS"]
+        start_year, end_year = (int(y) for y in years.split("-"))
         for year in range(start_year, end_year + 1):

-            _ids = [d for d in metadata.keys() if str(year) in d and "_PREV" not in d and "_NEXT" not in d]
+            _ids = [
+                d
+                for d in metadata.keys()
+                if str(year) in d and "_PREV" not in d and "_NEXT" not in d
+            ]

             for encpos_id in _ids:
-                response = requests.get(f'{_DTS_URL}/document?id={encpos_id}')
+                response = requests.get(f"{_DTS_URL}/document?id={encpos_id}")
                 print(encpos_id, response.status_code)

                 content = extract_body(response.text)
                 content = remove_html_tags(content)
+                all_docs.append("\n".join([
+                    json.dumps(
+                        {"index": {"_index": _index_name, "_id": encpos_id}}
+                    ),
+                    json.dumps(
+                        {"content": content, "metadata": metadata[encpos_id]}
+                    )
+                ]))

-                app.elasticsearch.index(
-                    index=_index_name,
-                    id=encpos_id,
-                    body={
-                        "content": content,
-                        "metadata": metadata[encpos_id]
-                    })
+        app.elasticsearch.bulk(body=all_docs, request_timeout=60 * 10)

     except Exception as e:
         print('Indexation error: ', str(e))
@@ -192,4 +196,3 @@ def index(years):
     cli.add_command(index)
     cli.add_command(search)
     return cli
-
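
Note on the bulk switch: replacing per-document `index()` calls with a single `bulk()` call requires the body to follow Elasticsearch's NDJSON convention, i.e. one action line followed by one source line per document. A minimal sketch of the payload the loop above assembles (index name, ID and content are illustrative placeholders):

```python
import json

# Illustrative values; the real ones come from app.config and the DTS API.
_index_name = "encpos_document"
encpos_id = "ENCPOS_1849_01"
content = "thesis abstract text"
metadata = {"promotion_year": 1849}

# One action line + one source line, joined by "\n" --
# exactly the shape the loop appends to all_docs.
pair = "\n".join([
    json.dumps({"index": {"_index": _index_name, "_id": encpos_id}}),
    json.dumps({"content": content, "metadata": metadata}),
])

# The Python client serializes a list body by joining its items with
# newlines, so a list of such pre-joined pairs is a valid bulk payload.
print(pair)
```

If payload size ever becomes a concern, `elasticsearch.helpers.bulk` would handle chunking and error reporting automatically, at the cost of restructuring `all_docs` into plain action dicts.
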
23 changes: 13 additions & 10 deletions elasticsearch/_global.conf.json
@@ -4,6 +4,10 @@
   },
   "analysis": {
     "filter": {
+      "my_stop_french": {
+        "type": "stop",
+        "stopwords": "_french_"
+      },
       "french_elision": {
         "type": "elision",
         "articles_case": true,
@@ -29,25 +33,24 @@
         "type": "html_strip"
       }
     },
+    "normalizer": {
+      "keyword": {
+        "filter": [
+          "icu_folding"
+        ]
+      }
+    },
     "analyzer": {
       "folding": {
         "tokenizer": "standard",
-        "stopwords": "_french_",
         "filter": [
           "french_elision",
-          "icu_folding"
+          "icu_folding",
+          "my_stop_french"
         ],
         "char_filter": [
           "html_stripper"
         ]
       },
-      "keyword": {
-        "tokenizer": "keyword",
-        "stopwords": "_french_",
-        "filter": [
-          "french_elision",
-          "icu_folding"
-        ]
-      }
     }
   }
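
A quick way to sanity-check the new `my_stop_french` filter and the `keyword` normalizer is the `_analyze` API. A sketch, assuming a local cluster and an index created from this configuration (connection and index name are illustrative):

```python
from elasticsearch import Elasticsearch

# Illustrative connection and index name.
es = Elasticsearch("http://localhost:9200")

# The folding analyzer should strip elisions, fold accents, and now also
# drop French stopwords.
tokens = es.indices.analyze(
    index="encpos_document",
    analyzer="folding",
    text="L'École des chartes et les élèves",
)
print([t["token"] for t in tokens["tokens"]])
# Expected shape: ["ecole", "chartes", "eleves"]

# The keyword normalizer only folds accents and case, keeping one token.
normalized = es.indices.analyze(
    index="encpos_document",
    normalizer="keyword",
    text="Émile Châtelain",
)
print([t["token"] for t in normalized["tokens"]])
# Expected shape: ["emile chatelain"]
```

This is what replaces the deleted `keyword` analyzer: a normalizer guarantees a single token, which is what the `keyword`-typed subfields in the document mapping expect.
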
40 changes: 32 additions & 8 deletions elasticsearch/encpos_document.conf.json
@@ -6,27 +6,51 @@
       "analyzer": "folding",
       "term_vector": "with_positions_offsets"
     },
+    "metadata_all": {
+      "type": "text"
+    },
     "metadata": {
       "properties": {
         "author_firstname": {
           "type": "text",
           "fielddata": true,
+          "copy_to": "metadata_all",
           "fields": {
             "keyword": {
-              "type": "text",
-              "analyzer": "keyword",
-              "fielddata": "true"
+              "type": "keyword",
+              "normalizer": "keyword"
             }
           }
         },
+        "author_name": {
+          "type": "text",
+          "fielddata": true,
+          "copy_to": "metadata_all",
+          "fields": {
+            "keyword": {
+              "type": "keyword",
+              "normalizer": "keyword"
+            }
+          }
+        },
+        "promotion_year": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
+        "topic_notAfter": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
+        "topic_notBefore": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
         "title_rich": {
           "type": "text",
+          "copy_to": "metadata_all",
           "fields": {
             "keyword": {
-              "type": "text",
-              "analyzer": "keyword",
-              "fielddata": "true"
+              "type": "keyword",
+              "normalizer": "keyword",
+              "ignore_above": 256
             }
           }
         }
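
With this mapping, `copy_to: metadata_all` gives a single catch-all text field for loose metadata search, while the normalized `keyword` subfields allow exact, accent-insensitive matching and aggregation without resorting to `fielddata` on text. A hypothetical query sketch (connection and index name assumed, as above):

```python
from elasticsearch import Elasticsearch

# Illustrative connection; the app reads its index name from config.
es = Elasticsearch("http://localhost:9200")

result = es.search(
    index="encpos_document",
    body={
        # metadata_all receives a copy of every metadata field, including
        # the numeric ones, so one match query covers them all.
        "query": {"match": {"metadata_all": "1849"}},
        "aggs": {
            # The keyword subfield is normalized with icu_folding, so
            # "Émile" and "Emile" land in the same bucket.
            "by_author": {
                "terms": {"field": "metadata.author_name.keyword"}
            }
        },
    },
)
print(result["hits"]["total"])
```
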
2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,7 +5,7 @@ elasticsearch==8.12.1
 Flask==1.1.2
 itsdangerous==1.1.0
 Jinja2==2.11.3
-lxml==4.6.3
+lxml==4.9.4
 MarkupSafe==1.1.1
 python-dotenv==0.17.0
 requests