Es mapping #3

Open · wants to merge 4 commits into base: dev

79 changes: 41 additions & 38 deletions cli.py
@@ -1,3 +1,5 @@
+from csv import DictReader
+import io
 import json
 import pprint
 import re
@@ -85,10 +87,7 @@ def search(query, indexes, term):
     else:
         body = query

-    config = {
-        "index": indexes if indexes else app.all_indexes,
-        "body": body
-    }
+    config = {"index": indexes if indexes else app.all_indexes, "body": body}

     result = app.elasticsearch.search(**config)
     print("\n", "=" * 12, " RESULT ", "=" * 12)
@@ -129,53 +128,58 @@ def index(years):
     """

     # BUILD THE METADATA DICT FROM THE GITHUB TSV FILE
-    response = requests.get(app.config['METADATA_FILE_URL'])
+    response = requests.get(app.config["METADATA_FILE_URL"])
     metadata = {}
-    lines = response.text.splitlines()
-    header = lines.pop(0).split('\t')
-    for line in lines:
-        _d = {}
-        # replace empty strings with null values
-        _values = [v if v != "" else None for v in line.split('\t')]
-        for i, k in enumerate(header):
-            # filter indexable columns
-            if k in app.config['METADATA_FILE_INDEXABLE_COLUMNS']:
-                # brutally try to cast values as integer
-                try:
-                    _d[k] = int(_values[i])
-                except (TypeError, ValueError):
-                    _d[k] = _values[i]
-
-        metadata[_d['id']] = _d
-        # remove id from nested metadata object
-        metadata[_d['id']].pop("id")
-
-    _DTS_URL = app.config['DTS_URL']
+    reader = DictReader(io.StringIO(response.text), delimiter="\t")
+    for row in reader:
+        try:
+            metadata[row["id"]] = {
+                "author_name": row["author_name"],
+                "author_firstname": row["author_firstname"],
+                "title_rich": row["title_rich"],
+                "promotion_year": int(row["promotion_year"]) if row["promotion_year"] else None,
+                "topic_notBefore": int(row["topic_notBefore"]) if row["topic_notBefore"] else None,
+                "topic_notAfter": int(row["topic_notAfter"]) if row["topic_notAfter"] else None,
+                "author_gender": int(row["author_gender"]) if row["author_gender"] else None,
+                # 1/2, verify that there is no other value
+                "author_is_enc_teacher": 1 if row["author_is_enc_teacher"] == "1" else None,
+            }
+        except Exception as exc:
+            print(f"ERROR while indexing {row['id']}, {exc}")
+
+    _DTS_URL = app.config["DTS_URL"]

     # INDEX THE DOCUMENTS
+    all_docs = []
     try:
-        _index_name = app.config['DOCUMENT_INDEX']
+        _index_name = app.config["DOCUMENT_INDEX"]
         if years == "all":
-            years = app.config['ALL_YEARS']
-        start_year, end_year = (int(y) for y in years.split('-'))
+            years = app.config["ALL_YEARS"]
+        start_year, end_year = (int(y) for y in years.split("-"))
         for year in range(start_year, end_year + 1):

-            _ids = [d for d in metadata.keys() if str(year) in d and "_PREV" not in d and "_NEXT" not in d]
+            _ids = [
+                d
+                for d in metadata.keys()
+                if str(year) in d and "_PREV" not in d and "_NEXT" not in d
+            ]

             for encpos_id in _ids:
-                response = requests.get(f'{_DTS_URL}/document?id={encpos_id}')
+                response = requests.get(f"{_DTS_URL}/document?id={encpos_id}")
                 print(encpos_id, response.status_code)

                 content = extract_body(response.text)
                 content = remove_html_tags(content)
+                all_docs.append("\n".join([
+                    json.dumps(
+                        {"index": {"_index": _index_name, "_id": encpos_id}}
+                    ),
+                    json.dumps(
+                        {"content": content, "metadata": metadata[encpos_id]}
+                    )
+                ]))

-                app.elasticsearch.index(
-                    index=_index_name,
-                    id=encpos_id,
-                    body={
-                        "content": content,
-                        "metadata": metadata[encpos_id]
-                    })
+        app.elasticsearch.bulk(body=all_docs, request_timeout=60 * 10)

     except Exception as e:
         print('Indexation error: ', str(e))
@@ -192,4 +196,3 @@ def index(years):
     cli.add_command(index)
     cli.add_command(search)
     return cli
-
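
Note on the bulk switch: replacing per-document `index()` calls with a single `bulk()` call requires the body to follow Elasticsearch's NDJSON convention, i.e. one action line followed by one source line per document. A minimal sketch of the payload the loop above assembles (index name, ID and content are illustrative placeholders):

```python
import json

# Illustrative values; the real ones come from app.config and the DTS API.
_index_name = "encpos_document"
encpos_id = "ENCPOS_1849_01"
content = "thesis abstract text"
metadata = {"promotion_year": 1849}

# One action line + one source line, joined by "\n" --
# exactly the shape the loop appends to all_docs.
pair = "\n".join([
    json.dumps({"index": {"_index": _index_name, "_id": encpos_id}}),
    json.dumps({"content": content, "metadata": metadata}),
])

# The Python client serializes a list body by joining its items with
# newlines, so a list of such pre-joined pairs is a valid bulk payload.
print(pair)
```

If payload size ever becomes a concern, `elasticsearch.helpers.bulk` would handle chunking and error reporting automatically, at the cost of restructuring `all_docs` into plain action dicts.
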
23 changes: 13 additions & 10 deletions elasticsearch/_global.conf.json
@@ -4,6 +4,10 @@
   },
   "analysis": {
     "filter": {
+      "my_stop_french": {
+        "type": "stop",
+        "stopwords": "_french_"
+      },
       "french_elision": {
         "type": "elision",
         "articles_case": true,
@@ -29,25 +33,24 @@
         "type": "html_strip"
       }
     },
+    "normalizer": {
+      "keyword": {
+        "filter": [
+          "icu_folding"
+        ]
+      }
+    },
     "analyzer": {
       "folding": {
         "tokenizer": "standard",
-        "stopwords": "_french_",
         "filter": [
           "french_elision",
-          "icu_folding"
+          "icu_folding",
+          "my_stop_french"
         ],
         "char_filter": [
           "html_stripper"
         ]
       },
-      "keyword": {
-        "tokenizer": "keyword",
-        "stopwords": "_french_",
-        "filter": [
-          "french_elision",
-          "icu_folding"
-        ]
-      }
     }
   }
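
A quick way to sanity-check the new `my_stop_french` filter and the `keyword` normalizer is the `_analyze` API. A sketch, assuming a local cluster and an index created from this configuration (connection and index name are illustrative):

```python
from elasticsearch import Elasticsearch

# Illustrative connection and index name.
es = Elasticsearch("http://localhost:9200")

# The folding analyzer should strip elisions, fold accents, and now also
# drop French stopwords.
tokens = es.indices.analyze(
    index="encpos_document",
    analyzer="folding",
    text="L'École des chartes et les élèves",
)
print([t["token"] for t in tokens["tokens"]])
# Expected shape: ["ecole", "chartes", "eleves"]

# The keyword normalizer only folds accents and case, keeping one token.
normalized = es.indices.analyze(
    index="encpos_document",
    normalizer="keyword",
    text="Émile Châtelain",
)
print([t["token"] for t in normalized["tokens"]])
# Expected shape: ["emile chatelain"]
```

This is what replaces the deleted `keyword` analyzer: a normalizer guarantees a single token, which is what the `keyword`-typed subfields in the document mapping expect.
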
40 changes: 32 additions & 8 deletions elasticsearch/encpos_document.conf.json
@@ -6,27 +6,51 @@
       "analyzer": "folding",
       "term_vector": "with_positions_offsets"
     },
+    "metadata_all": {
+      "type": "text"
+    },
     "metadata": {
       "properties": {
         "author_firstname": {
           "type": "text",
           "fielddata": true,
+          "copy_to": "metadata_all",
           "fields": {
             "keyword": {
-              "type": "text",
-              "analyzer": "keyword",
-              "fielddata": "true"
+              "type": "keyword",
+              "normalizer": "keyword"
             }
           }
         },
+        "author_name": {
+          "type": "text",
+          "fielddata": true,
+          "copy_to": "metadata_all",
+          "fields": {
+            "keyword": {
+              "type": "keyword",
+              "normalizer": "keyword"
+            }
+          }
+        },
+        "promotion_year": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
+        "topic_notAfter": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
+        "topic_notBefore": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
         "title_rich": {
           "type": "text",
+          "copy_to": "metadata_all",
           "fields": {
             "keyword": {
-              "type": "text",
-              "analyzer": "keyword",
-              "fielddata": "true"
+              "type": "keyword",
+              "normalizer": "keyword",
+              "ignore_above": 256
             }
           }
         }
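
With this mapping, `copy_to: metadata_all` gives a single catch-all text field for loose metadata search, while the normalized `keyword` subfields allow exact, accent-insensitive matching and aggregation without resorting to `fielddata` on text. A hypothetical query sketch (connection and index name assumed, as above):

```python
from elasticsearch import Elasticsearch

# Illustrative connection; the app reads its index name from config.
es = Elasticsearch("http://localhost:9200")

result = es.search(
    index="encpos_document",
    body={
        # metadata_all receives a copy of every metadata field, including
        # the numeric ones, so one match query covers them all.
        "query": {"match": {"metadata_all": "1849"}},
        "aggs": {
            # The keyword subfield is normalized with icu_folding, so
            # "Émile" and "Emile" land in the same bucket.
            "by_author": {
                "terms": {"field": "metadata.author_name.keyword"}
            }
        },
    },
)
print(result["hits"]["total"])
```
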
2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,7 +5,7 @@ elasticsearch==8.12.1
 Flask==1.1.2
 itsdangerous==1.1.0
 Jinja2==2.11.3
-lxml==4.6.3
+lxml==4.9.4
 MarkupSafe==1.1.1
 python-dotenv==0.17.0
 requests