implement the suggested modifications by Petr
khaledk2 committed Nov 28, 2023
1 parent f4c7ffe commit 1b4d77d
Showing 6 changed files with 142 additions and 37 deletions.
18 changes: 13 additions & 5 deletions omero_search_engine/api/stats/urls.py
@@ -41,14 +41,22 @@ def search_terms():
    each resource has a sheet inside the Excel workbook
    """
    logs_folder = search_omero_app.config.get("SEARCHENGINE_LOGS_FOLDER")
-    content = get_search_terms(logs_folder, return_file_content=True)
+    max_top_values = request.args.get("return_values")
+    if not max_top_values:
+        max_top_values = 5
+    elif max_top_values.isdigit():
+        max_top_values = int(max_top_values)
+    else:
+        if max_top_values.lower() != "all":
+            max_top_values = 5
+        else:
+            max_top_values = "all"
+    content = get_search_terms(logs_folder, max_top_values, return_file_content=True)
    headers = {
        "Content-Disposition": "attachment; filename=searchterms.xlsx",
        "Content-type": "application/vnd.ms-excel",
    }
-    return Response(
-        content.getvalue(), mimetype="application/vnd.ms-excel", headers=headers
-    )
+    return Response(content.getvalue(), headers=headers)
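For context, a client-side sketch of how the new ``return_values`` query parameter could be exercised; the host, port, and mount path are assumptions, not taken from this commit:

    # Hypothetical client call; adjust the URL to the actual deployment.
    import requests

    base = "http://localhost:5577/api/stats"  # assumed mount point
    # "return_values" accepts a positive integer or the literal "all";
    # anything else falls back to the default of five top values.
    resp = requests.get(base + "/search_terms", params={"return_values": 10})
    with open("searchterms.xlsx", "wb") as f:
        f.write(resp.content)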


@stats.route("/metadata", methods=["GET"])
@@ -63,8 +71,8 @@ def get_metadata():
    if not os.path.isdir(base_folder):
        base_folder = os.path.expanduser("~")
    metadata = os.path.join(base_folder, "metadata.xlsx")
-    base_url = request.base_url.replace("stats/metadata", "v1/resources/")
    if not os.path.isfile(metadata):
+        base_url = request.url.replace("stats/metadata", "v1/resources/")
        if "/searchengine/searchengine" in base_url:
            base_url = base_url.replace("/searchengine/searchengine", "/searchengine")
        get_omero_stats(base_url=base_url)
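The replacement above also guards against a doubled path segment when the app is served under a ``/searchengine`` prefix; a small illustrative sketch (the hostname is made up):

    url = "https://example.org/searchengine/searchengine/stats/metadata"
    base_url = url.replace("stats/metadata", "v1/resources/")
    if "/searchengine/searchengine" in base_url:
        base_url = base_url.replace("/searchengine/searchengine", "/searchengine")
    print(base_url)  # https://example.org/searchengine/v1/resources/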
@@ -9,6 +9,13 @@
"Organism"
],
"screen":[
"name",
"Imaging Method",
"Publication Title",
"Publication Authors",
"Study Type",
"License",
"Organism",
"Screen Technology Type",
"Screen Type"
],
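These per-resource key lists gate which search terms are counted; a hypothetical loader for the config shown above (the file name is assumed, as it is not visible in this diff):

    import json

    with open("known_keys.json") as f:  # assumed file name
        known = json.load(f)

    def is_known_key(resource, name):
        # Keys are compared case-insensitively, as elsewhere in this commit.
        return name.strip().lower() in (k.lower() for k in known.get(resource, []))

    print(is_known_key("screen", "Imaging Method"))  # True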
14 changes: 9 additions & 5 deletions omero_search_engine/validation/results_validator.py
@@ -632,7 +632,7 @@ def test_no_images():


def get_omero_stats(base_url=None):
-    values = ["Resource", "Attribute", "No. buckets", "Buckets' URL", "Total number"]
+    columns = ["Resource", "Attribute", "No. of unique values", "Attribute's URL"]
    base_folder = "/etc/searchengine/"
    if not os.path.isdir(base_folder):
        base_folder = os.path.expanduser("~")
@@ -684,14 +684,18 @@ def get_omero_stats(base_url=None):
        # print("Value is empty string", dat["Key"])
    writer = pd.ExcelWriter(metadata_file, engine="xlsxwriter")
    for resource, data_ in sorted(all_data.items(), reverse=False):
-        df = pd.DataFrame(data_, columns=values)
-        df2 = df.sort_values(by=["Resource", "No. buckets"], ascending=[True, False])
+        if len(columns) == 5:
+            del columns[4]
+        columns.insert(4, "Total number of %s" % resource)
+        df = pd.DataFrame(data_, columns=columns)
+        df2 = df.sort_values(
+            by=["No. of unique values", "Attribute"], ascending=[False, False]
+        )
        df2.to_excel(writer, sheet_name=resource, index=False)
        worksheet = writer.sheets[resource]
        from tools.utils.logs_analyser import adjust_colunms_width

-        adjust_colunms_width(worksheet, values, df2)
-
+        adjust_colunms_width(worksheet, columns, df2)
    writer.save()
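
The loop above renames the fifth header to ``Total number of <resource>`` for each sheet by mutating a shared list; an equivalent standalone sketch using a per-resource copy (names and data are illustrative):

    import pandas as pd

    base_cols = ["Resource", "Attribute", "No. of unique values", "Attribute's URL"]

    def stats_frame(resource, rows):
        # Copy per resource instead of del/insert on a shared list.
        cols = base_cols + ["Total number of %s" % resource]
        df = pd.DataFrame(rows, columns=cols)
        return df.sort_values(
            by=["No. of unique values", "Attribute"], ascending=[False, False]
        )

    demo = stats_frame("image", [["image", "Gene Symbol", 12, "<bucket url>", 345]])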


12 changes: 6 additions & 6 deletions tools/instructions.rst
@@ -12,18 +12,18 @@ The cluster itself is up if at least two nodes are running.

* The searchEngine functions can be tested using the ``check_searchengine_health.sh`` script. The script takes about 15 minutes to run. The script output is saved to a text file ``check_report.txt`` in the ``/data/searchengine/searchengine/`` folder.

-* It is possible to stop an elasticsearch cluster node using this script::
+* Stop an elasticsearch cluster node using this script (replace n with an integer which represents the node number, e.g. 1, 2, 3)::

    bash stop_node.sh n
-  where n is an integer, e.g. 1,2, 3.

* The ``backup_elasticsearch_data.sh`` script is used to back up the Elasticsearch data.

-* It is possible to index or re-index the data using this bash ``scrpt index_data.sh``.
+* Index or re-index the data using the ``index_data.sh`` script.

-* It is possible to restore the Elasticsearch data from the backup (snapshot) using the following command::
+* Restore the Elasticsearch data from the backup (snapshot) using the following command::

    bash restore_elasticsearch_data.sh

-* It may take up to 15 minutes to restore the data.
+  It may take up to 15 minutes to restore the data.

-* The ``check_indexing_process.sh`` script is used to check the indexing data progress.
+* Check the progress of the data indexing using the ``check_indexing_process.sh`` script.
121 changes: 104 additions & 17 deletions tools/utils/logs_analyser.py
@@ -4,6 +4,10 @@
import logging
import pandas as pd

+from omero_search_engine.api.v1.resources.resource_analyser import (
+    get_resource_attributes,
+)
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

"""
@@ -13,7 +13,17 @@
"""


-def get_search_terms(folder_name, return_file_content=False):
+def get_search_terms(folder_name, max_top_values, return_file_content=False):
+    logging.info("checking the available keys for each resource")
+    res_av_keys_ = get_resource_attributes("all")
+    logging.info("prepare keys to validate the search terms")
+    res_av_keys = {}
+    for res, values_ in res_av_keys_.items():
+        values = []
+        res_av_keys[res] = values
+        for val in values_:
+            values.append(val.lower().strip())
+
    logging.info("checking files inside: %s" % folder_name)
    resourses = {}
    for root, dirs, files in os.walk(folder_name):
@@ -23,18 +37,19 @@ def get_search_terms(folder_name, return_file_content=False):
            if file_name.endswith("engine_gunilog.log"):
                file_name = os.path.join(root, file_name)
                logging.info("2..... checking %s" % file_name)
-                analyse_log_file(file_name, resourses)
+                analyse_log_file(file_name, resourses, res_av_keys)
    logging.info("Write the reports")
    contents = write_reports(
        resourses,
        return_file_content,
        os.path.join(folder_name, "report.csv"),
+        max_top_values,
    )
    if return_file_content:
        return contents

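A hypothetical direct call to the updated helper (the log folder path is made up):

    content = get_search_terms(
        "/var/log/searchengine", max_top_values=5, return_file_content=True
    )
    if content:
        with open("searchterms.xlsx", "wb") as f:
            f.write(content.getvalue())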

-def analyse_log_file(file_name, resourses):
+def analyse_log_file(file_name, resourses, res_av_keys):
    # file_name="/mnt/d/logs/engine_gunilog.log"
    logging.info("Analyse: %s" % file_name)
    f = open(file_name, "r")
@@ -64,52 +79,110 @@ def analyse_log_file(file_name, resourses, res_av_keys):
            failes = failes + 1

    for filter in filters:
-        check_filters(filter.get("and_filters"), resourses)
+        check_filters(filter.get("and_filters"), resourses, res_av_keys)
        for or_f in filter.get("or_filters"):
-            check_filters(or_f, resourses)
+            check_filters(or_f, resourses, res_av_keys)
    print("################################################")
    print("Failed: %s" % failes)
    print("################################################")

-def check_filters(conds, resourses):
+def check_filters(conds, resourses, res_av_keys):
    for cond in conds:
        if cond.get("resource") in resourses:
            names_values = resourses[cond.get("resource")]
        else:
            names_values = {}
            resourses[cond.get("resource")] = names_values
        name = cond.get("name")
+        # check if the key exists;
+        # add it only if the key is in the available resource's keys
+        if not name.strip().lower() in res_av_keys[cond.get("resource")]:
+            continue
        value = cond.get("value")
        if name in names_values:
            names_values[name].append(value)
        else:
            names_values[name] = [value]

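An illustrative call; the filter structure is inferred from the parsing code above, and the key lists are made up:

    resourses = {}
    res_av_keys = {"image": ["organism", "gene symbol"]}
    conds = [
        {"resource": "image", "name": "Organism", "value": "Homo sapiens"},
        {"resource": "image", "name": "Unknown Key", "value": "x"},  # skipped
    ]
    check_filters(conds, resourses, res_av_keys)
    print(resourses)  # {'image': {'Organism': ['Homo sapiens']}}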

-def write_reports(resourses, return_file_content, file_name):
-    columns = ["key", "total hits", "unique hits"]
+def write_reports(resourses, return_file_content, file_name, max_top_values=5):
+    columns = [
+        "Keys \n (Shows no. of attempted searches)",
+        "total hits \n(No. of KVP searches)",
+        "unique hits \n(No. of unique KVPs searched for)",
+    ]
+    if max_top_values != 0:
+        if max_top_values == "all":
+            columns.append("All searched values")
+        elif max_top_values > 1:
+            columns.append("Top %s searched values" % max_top_values)
+        else:
+            columns.append("Top searched value")

    import io

    out_io = io.BytesIO()
    writer = pd.ExcelWriter(out_io, engine="xlsxwriter")
+    # writer.book.formats[0].set_text_wrap()
+    containers_lines = {}
+    containers = ["project", "screen"]
    for res, terms in resourses.items():
+        res_ = res
        lines = []
+        if res in containers:
+            if res in containers_lines:
+                lines = containers_lines[res]
+            else:
+                containers_lines[res] = lines
+
        for name, values in terms.items():
            line = [name]
            lines.append(line)
-            vv = []
+            vv_ = {}
            for val in values:
-                if val not in vv:
-                    vv.append(val)
+                if val not in vv_:
+                    vv_[val] = 1
+                else:
+                    vv_[val] = vv_[val] + 1
+            vv = sorted(vv_.items(), key=lambda kv: kv[1])
+            top_searchvalues = ""
+            if max_top_values == "all":
+                max_top_values = len(vv)
+
+            for i in range(max_top_values):
+                index = len(vv) - 1 - i
+                if index < 0:
+                    break
+                if top_searchvalues:
+                    top_searchvalues = top_searchvalues + ", %s:%s" % (
+                        vv[index][0],
+                        vv[index][1],
+                    )
+                else:
+                    top_searchvalues = "%s:%s" % (vv[index][0], vv[index][1])
            line.insert(1, len(values))
            line.insert(2, len(vv))

+            if max_top_values != 0:
+                line.insert(3, "{%s}" % top_searchvalues)
+
+        if res in containers:
+            if len(containers_lines) > 1:
+                lines = adjust_containers(containers_lines)
+                res_ = "container"
+            else:
+                continue
        df = pd.DataFrame(lines, columns=columns)
-        df2 = df.sort_values(by=["total hits", "unique hits"], ascending=[False, False])
-        df2.to_excel(writer, index=False, sheet_name=res)
-        adjust_colunms_width(writer.sheets[res], columns, df2)
-        insert_chart(writer, res, df2, len(lines))
+        df2 = df.sort_values(by=[columns[1], columns[2]], ascending=[False, False])
+        df2.to_excel(
+            writer,
+            index=False,
+            sheet_name=res_,
+        )
+        adjust_colunms_width(writer.sheets[res_], columns, df2)
+        insert_chart(writer, res_, df2, len(lines))

-    writer.save()
+    writer.close()
    if return_file_content:
        return out_io
    with open(file_name, "wb") as out:
@@ -118,6 +191,9 @@ def write_reports(resourses, return_file_content, file_name):

def adjust_colunms_width(worksheet, columns, df2):
    for idx, col in enumerate(df2.columns):
+        if idx == 3:
+            worksheet.set_column(idx, idx, 30 + 1)
+            continue
        series = df2[col]
        max_width = len(columns[idx])
        max_width_ = [len(str(s)) for s in series if len(str(s)) > max_width]
@@ -139,3 +215,14 @@ def insert_chart(writer, sheet, df, no_points):
            }
        )
    sheet_obj.insert_chart(2, 5, chart)


+def adjust_containers(containers_lines):
+    lines = []
+    added_keys = []
+    for res, lines_ in containers_lines.items():
+        for line in lines_:
+            if not line[0].strip().lower() in added_keys:
+                lines.append(line)
+                added_keys.append(line[0].lower().strip())
+    return lines
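
For comparison, the tallying in ``write_reports`` above could be expressed with ``collections.Counter``; a sketch, not part of the commit:

    from collections import Counter

    values = ["GFP", "DAPI", "GFP", "GFP", "DAPI", "mCherry"]
    counts = Counter(values)
    top = counts.most_common(2)  # e.g. max_top_values = 2
    print(", ".join("%s:%s" % (v, c) for v, c in top))  # GFP:3, DAPI:2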
7 changes: 3 additions & 4 deletions tools/utils/util.py
@@ -26,17 +26,16 @@

def copy_tools_subfolder():
    """
-    Copy the maintenance_scripts folder to the searchengine folder
+    Copy the maintenance_scripts folder to the maintenance_scripts
+    inside the searchengine folder
    """
    subfolder = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), "../maintenance_scripts"
    )
    destination_folder = "/etc/searchengine/"
    if not os.path.isdir(destination_folder):
        destination_folder = os.path.expanduser("~")
-    destination_folder = os.path.join(
-        destination_folder, "searchengine/maintenance_scripts"
-    )
+    destination_folder = os.path.join(destination_folder, "maintenance_scripts")

    if not os.path.isdir(destination_folder):
        shutil.copytree(subfolder, destination_folder)
