From 625c7c10b184377b7800eecbe92096e69fcdff6d Mon Sep 17 00:00:00 2001 From: Pablo Saiz Date: Wed, 13 Sep 2023 13:05:50 +0200 Subject: [PATCH 1/2] migrate: Migration of the stats. --- invenio_stats/cli.py | 62 +++++++++++++++++++ .../file_download/os-v1/file-download-v1.json | 3 + .../file_download/os-v2/file-download-v1.json | 8 ++- .../file_download/v7/file-download-v1.json | 3 + .../record_view/os-v1/record-view-v1.json | 3 + .../record_view/os-v2/record-view-v1.json | 17 ++++- .../record_view/v7/record-view-v1.json | 3 + 7 files changed, 97 insertions(+), 2 deletions(-) diff --git a/invenio_stats/cli.py b/invenio_stats/cli.py index bd49086c..96157dd8 100644 --- a/invenio_stats/cli.py +++ b/invenio_stats/cli.py @@ -13,7 +13,11 @@ import click from dateutil.parser import parse as dateutil_parse +from datetime import datetime +from flask import current_app from flask.cli import with_appcontext +from invenio_search.engine import search +from invenio_search.proxies import current_search_client from werkzeug.local import LocalProxy from .proxies import current_stats @@ -164,3 +168,61 @@ def _aggregations_list_bookmarks( click.echo("{}:".format(a)) for b in bookmarks: click.echo(" - {}".format(b.date)) + + +@stats.command("migrate_zenodo") +@with_appcontext +def _migrate(): + """Migrate the statistics from zenodo.""" + print("Checking if there are any `legacy` indices") + my_date = datetime.utcnow().isoformat() + painless = f'ctx._source.parent_recid=ctx._source.conceptrecid;ctx._source.updated_timestamp="{my_date}";' + # Removing obsolete fields + for f in [ + "conceptdoi", + "resource_type", + "access_right", + "bucket_id", + "file_key", + "referrer", + "size", + "file_id", + "conceptrecid", + "doi", + "owners", + "is_parent", + "communities" + ]: + painless += f'ctx._source.remove("{f}");' + print(painless) + legacy_indices = current_search_client.cat.indices("legacy*", format="json") + i = 0 + total = len(legacy_indices) + for my_index in sorted(legacy_indices, key=lambda d: d["index"]): + print("%i/%i Doing index: %s" % (i, total, my_index["index"])) + i += 1 + target = my_index["index"].replace("legacy-zenodo", "zenodo-prod") + source = my_index["index"] + try: + old = current_search_client.count({}, source) + new = current_search_client.count({}, target) + if old == new: + print("\tThe target has the same number of entries. Skipping") + continue + except search.exceptions.NotFoundError: + pass + try: + current_search_client.reindex( + { + "conflicts": "proceed", + "source": {"index": my_index["index"]}, + "dest": {"index": target, "op_type": "create"}, + "script": { + "lang": "painless", + "source": painless, + }, + } + ) + except Exception as d: + print("NOPE") + print(d) diff --git a/invenio_stats/contrib/file_download/os-v1/file-download-v1.json b/invenio_stats/contrib/file_download/os-v1/file-download-v1.json index b7965963..95392809 100644 --- a/invenio_stats/contrib/file_download/os-v1/file-download-v1.json +++ b/invenio_stats/contrib/file_download/os-v1/file-download-v1.json @@ -54,6 +54,9 @@ }, "size": { "type": "double" + }, + "parent_recid": { + "type": "keyword" } } }, diff --git a/invenio_stats/contrib/file_download/os-v2/file-download-v1.json b/invenio_stats/contrib/file_download/os-v2/file-download-v1.json index b7965963..f1d9a31d 100644 --- a/invenio_stats/contrib/file_download/os-v2/file-download-v1.json +++ b/invenio_stats/contrib/file_download/os-v2/file-download-v1.json @@ -18,7 +18,7 @@ } ], "date_detection": false, - "dynamic": false, + "dynamic": "strict", "numeric_detection": false, "properties": { "timestamp": { @@ -54,6 +54,12 @@ }, "size": { "type": "double" + }, + "user_id": { + "type": "double" + }, + "parent_recid": { + "type": "keyword" } } }, diff --git a/invenio_stats/contrib/file_download/v7/file-download-v1.json b/invenio_stats/contrib/file_download/v7/file-download-v1.json index b7965963..95392809 100644 --- a/invenio_stats/contrib/file_download/v7/file-download-v1.json +++ b/invenio_stats/contrib/file_download/v7/file-download-v1.json @@ -54,6 +54,9 @@ }, "size": { "type": "double" + }, + "parent_recid": { + "type": "keyword" } } }, diff --git a/invenio_stats/contrib/record_view/os-v1/record-view-v1.json b/invenio_stats/contrib/record_view/os-v1/record-view-v1.json index 4cee71cb..c311cfc0 100644 --- a/invenio_stats/contrib/record_view/os-v1/record-view-v1.json +++ b/invenio_stats/contrib/record_view/os-v1/record-view-v1.json @@ -40,6 +40,9 @@ }, "unique_session_id": { "type": "keyword" + }, + "parent_recid": { + "type": "keyword" } } }, diff --git a/invenio_stats/contrib/record_view/os-v2/record-view-v1.json b/invenio_stats/contrib/record_view/os-v2/record-view-v1.json index 4cee71cb..ec1ce3eb 100644 --- a/invenio_stats/contrib/record_view/os-v2/record-view-v1.json +++ b/invenio_stats/contrib/record_view/os-v2/record-view-v1.json @@ -7,7 +7,7 @@ }, "mappings": { "date_detection": false, - "dynamic": false, + "dynamic": "strict", "numeric_detection": false, "properties": { "timestamp": { @@ -35,11 +35,26 @@ "is_robot": { "type": "boolean" }, + "machine": { + "type": "boolean" + }, "unique_id": { "type": "keyword" }, "unique_session_id": { "type": "keyword" + }, + "communities": { + "type": "keyword" + }, + "countries": { + "type": "keyword" + }, + "owners": { + "type": "keyword" + }, + "parent_recid": { + "type": "keyword" } } }, diff --git a/invenio_stats/contrib/record_view/v7/record-view-v1.json b/invenio_stats/contrib/record_view/v7/record-view-v1.json index 4cee71cb..c311cfc0 100644 --- a/invenio_stats/contrib/record_view/v7/record-view-v1.json +++ b/invenio_stats/contrib/record_view/v7/record-view-v1.json @@ -40,6 +40,9 @@ }, "unique_session_id": { "type": "keyword" + }, + "parent_recid": { + "type": "keyword" } } }, From 6e0108fa7fb2b503eeeb2289d300336e9925edb4 Mon Sep 17 00:00:00 2001 From: Pablo Saiz Date: Wed, 11 Oct 2023 15:08:39 +0200 Subject: [PATCH 2/2] Remove is_parent --- invenio_stats/cli.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/invenio_stats/cli.py b/invenio_stats/cli.py index 96157dd8..f0a1f228 100644 --- a/invenio_stats/cli.py +++ b/invenio_stats/cli.py @@ -182,23 +182,21 @@ def _migrate(): "conceptdoi", "resource_type", "access_right", - "bucket_id", - "file_key", "referrer", "size", - "file_id", "conceptrecid", "doi", - "owners", "is_parent", - "communities" + "owners", + "communities", ]: painless += f'ctx._source.remove("{f}");' - print(painless) legacy_indices = current_search_client.cat.indices("legacy*", format="json") i = 0 total = len(legacy_indices) for my_index in sorted(legacy_indices, key=lambda d: d["index"]): + if my_index["index"] == "legacy-zenodo-stats-bookmarks": + continue print("%i/%i Doing index: %s" % (i, total, my_index["index"])) i += 1 target = my_index["index"].replace("legacy-zenodo", "zenodo-prod") @@ -212,10 +210,31 @@ def _migrate(): except search.exceptions.NotFoundError: pass try: + print( + { + "conflicts": "proceed", + "source": { + "index": my_index["index"], + "query": { + "bool": {"must_not": [{"term": {"is_parent": True}}]} + }, + }, + "dest": {"index": target, "op_type": "create"}, + "script": { + "lang": "painless", + "source": painless, + }, + } + ) current_search_client.reindex( { "conflicts": "proceed", - "source": {"index": my_index["index"]}, + "source": { + "index": my_index["index"], + "query": { + "bool": {"must_not": [{"term": {"is_parent": True}}]} + }, + }, "dest": {"index": target, "op_type": "create"}, "script": { "lang": "painless",