implement the suggested modifications by Petr
khaledk2 committed Nov 28, 2023
1 parent f4c7ffe commit 1b4d77d
Showing 6 changed files with 142 additions and 37 deletions.
18 changes: 13 additions & 5 deletions omero_search_engine/api/stats/urls.py
@@ -41,14 +41,22 @@ def search_terms():
    each resource has a sheet inside the Excel workbook
    """
    logs_folder = search_omero_app.config.get("SEARCHENGINE_LOGS_FOLDER")
-    content = get_search_terms(logs_folder, return_file_content=True)
+    max_top_values = request.args.get("return_values")
+    if not max_top_values:
+        max_top_values = 5
+    elif max_top_values.isdigit():
+        max_top_values = int(max_top_values)
+    else:
+        if max_top_values.lower() != "all":
+            max_top_values = 5
+        else:
+            max_top_values = "all"
+    content = get_search_terms(logs_folder, max_top_values, return_file_content=True)
    headers = {
        "Content-Disposition": "attachment; filename=searchterms.xlsx",
        "Content-type": "application/vnd.ms-excel",
    }
-    return Response(
-        content.getvalue(), mimetype="application/vnd.ms-excel", headers=headers
-    )
+    return Response(content.getvalue(), headers=headers)
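For context, a client-side sketch of how the new ``return_values`` query parameter could be exercised; the host, port, and mount path are assumptions, not taken from this commit:

    # Hypothetical client call; adjust the URL to the actual deployment.
    import requests

    base = "http://localhost:5577/api/stats"  # assumed mount point
    # "return_values" accepts a positive integer or the literal "all";
    # anything else falls back to the default of five top values.
    resp = requests.get(base + "/search_terms", params={"return_values": 10})
    with open("searchterms.xlsx", "wb") as f:
        f.write(resp.content)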


@stats.route("/metadata", methods=["GET"])
@@ -63,8 +71,8 @@ def get_metadata():
    if not os.path.isdir(base_folder):
        base_folder = os.path.expanduser("~")
    metadata = os.path.join(base_folder, "metadata.xlsx")
-    base_url = request.base_url.replace("stats/metadata", "v1/resources/")
    if not os.path.isfile(metadata):
+        base_url = request.url.replace("stats/metadata", "v1/resources/")
        if "/searchengine/searchengine" in base_url:
            base_url = base_url.replace("/searchengine/searchengine", "/searchengine")
        get_omero_stats(base_url=base_url)
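The replacement above also guards against a doubled path segment when the app is served under a ``/searchengine`` prefix; a small illustrative sketch (the hostname is made up):

    url = "https://example.org/searchengine/searchengine/stats/metadata"
    base_url = url.replace("stats/metadata", "v1/resources/")
    if "/searchengine/searchengine" in base_url:
        base_url = base_url.replace("/searchengine/searchengine", "/searchengine")
    print(base_url)  # https://example.org/searchengine/v1/resources/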
@@ -9,6 +9,13 @@
"Organism"
],
"screen":[
"name",
"Imaging Method",
"Publication Title",
"Publication Authors",
"Study Type",
"License",
"Organism",
"Screen Technology Type",
"Screen Type"
],
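These per-resource key lists gate which search terms are counted; a hypothetical loader for the config shown above (the file name is assumed, as it is not visible in this diff):

    import json

    with open("known_keys.json") as f:  # assumed file name
        known = json.load(f)

    def is_known_key(resource, name):
        # Keys are compared case-insensitively, as elsewhere in this commit.
        return name.strip().lower() in (k.lower() for k in known.get(resource, []))

    print(is_known_key("screen", "Imaging Method"))  # True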
14 changes: 9 additions & 5 deletions omero_search_engine/validation/results_validator.py
@@ -632,7 +632,7 @@ def test_no_images():


def get_omero_stats(base_url=None):
-    values = ["Resource", "Attribute", "No. buckets", "Buckets' URL", "Total number"]
+    columns = ["Resource", "Attribute", "No. of unique values", "Attribute's URL"]
    base_folder = "/etc/searchengine/"
    if not os.path.isdir(base_folder):
        base_folder = os.path.expanduser("~")
@@ -684,14 +684,18 @@ def get_omero_stats(base_url=None):
        # print("Value is empty string", dat["Key"])
    writer = pd.ExcelWriter(metadata_file, engine="xlsxwriter")
    for resource, data_ in sorted(all_data.items(), reverse=False):
-        df = pd.DataFrame(data_, columns=values)
-        df2 = df.sort_values(by=["Resource", "No. buckets"], ascending=[True, False])
+        if len(columns) == 5:
+            del columns[4]
+        columns.insert(4, "Total number of %s" % resource)
+        df = pd.DataFrame(data_, columns=columns)
+        df2 = df.sort_values(
+            by=["No. of unique values", "Attribute"], ascending=[False, False]
+        )
        df2.to_excel(writer, sheet_name=resource, index=False)
        worksheet = writer.sheets[resource]
        from tools.utils.logs_analyser import adjust_colunms_width

-        adjust_colunms_width(worksheet, values, df2)
-
+        adjust_colunms_width(worksheet, columns, df2)
    writer.save()
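
The loop above renames the fifth header to ``Total number of <resource>`` for each sheet by mutating a shared list; an equivalent standalone sketch using a per-resource copy (names and data are illustrative):

    import pandas as pd

    base_cols = ["Resource", "Attribute", "No. of unique values", "Attribute's URL"]

    def stats_frame(resource, rows):
        # Copy per resource instead of del/insert on a shared list.
        cols = base_cols + ["Total number of %s" % resource]
        df = pd.DataFrame(rows, columns=cols)
        return df.sort_values(
            by=["No. of unique values", "Attribute"], ascending=[False, False]
        )

    demo = stats_frame("image", [["image", "Gene Symbol", 12, "<bucket url>", 345]])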


12 changes: 6 additions & 6 deletions tools/instructions.rst
@@ -12,18 +12,18 @@ The cluster itself is up if at least two nodes are running.

* The searchEngine functions can be tested using the ``check_searchengine_health.sh`` script. The script takes about 15 minutes to run. The script output is saved to a text file ``check_report.txt`` in the ``/data/searchengine/searchengine/`` folder.

-* It is possible to stop an elasticsearch cluster node using this script::
+* Stop an elasticsearch cluster node using this script (replace n with an integer which represents the node number, e.g. 1, 2, 3)::

    bash stop_node.sh n
-  where n is an integer, e.g. 1,2, 3.

* The ``backup_elasticsearch_data.sh`` script is used to back up the Elasticsearch data.

-* It is possible to index or re-index the data using this bash ``scrpt index_data.sh``.
+* Index or re-index the data using the ``index_data.sh`` script.

-* It is possible to restore the Elasticsearch data from the backup (snapshot) using the following command::
+* Restore the Elasticsearch data from the backup (snapshot) using the following command::

    bash restore_elasticsearch_data.sh

-* It may take up to 15 minutes to restore the data.
+  It may take up to 15 minutes to restore the data.

-* The ``check_indexing_process.sh`` script is used to check the indexing data progress.
+* Check the progress of the data indexing using the ``check_indexing_process.sh`` script.
121 changes: 104 additions & 17 deletions tools/utils/logs_analyser.py
@@ -4,6 +4,10 @@
import logging
import pandas as pd

+from omero_search_engine.api.v1.resources.resource_analyser import (
+    get_resource_attributes,
+)
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

"""
@@ -13,7 +13,17 @@
"""


-def get_search_terms(folder_name, return_file_content=False):
+def get_search_terms(folder_name, max_top_values, return_file_content=False):
+    logging.info("checking the available keys for each resource")
+    res_av_keys_ = get_resource_attributes("all")
+    logging.info("prepare keys to validate the search terms")
+    res_av_keys = {}
+    for res, values_ in res_av_keys_.items():
+        values = []
+        res_av_keys[res] = values
+        for val in values_:
+            values.append(val.lower().strip())
+
    logging.info("checking files inside: %s" % folder_name)
    resourses = {}
    for root, dirs, files in os.walk(folder_name):
@@ -23,18 +37,19 @@ def get_search_terms(folder_name, return_file_content=False):
            if file_name.endswith("engine_gunilog.log"):
                file_name = os.path.join(root, file_name)
                logging.info("2..... checking %s" % file_name)
-                analyse_log_file(file_name, resourses)
+                analyse_log_file(file_name, resourses, res_av_keys)
    logging.info("Write the reports")
    contents = write_reports(
        resourses,
        return_file_content,
        os.path.join(folder_name, "report.csv"),
+        max_top_values,
    )
    if return_file_content:
        return contents

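A hypothetical direct call to the updated helper (the log folder path is made up):

    content = get_search_terms(
        "/var/log/searchengine", max_top_values=5, return_file_content=True
    )
    if content:
        with open("searchterms.xlsx", "wb") as f:
            f.write(content.getvalue())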

-def analyse_log_file(file_name, resourses):
+def analyse_log_file(file_name, resourses, res_av_keys):
    # file_name="/mnt/d/logs/engine_gunilog.log"
    logging.info("Analyse: %s" % file_name)
    f = open(file_name, "r")
@@ -64,52 +79,110 @@ def analyse_log_file(file_name, resourses, res_av_keys):
            failes = failes + 1

    for filter in filters:
-        check_filters(filter.get("and_filters"), resourses)
+        check_filters(filter.get("and_filters"), resourses, res_av_keys)
        for or_f in filter.get("or_filters"):
-            check_filters(or_f, resourses)
+            check_filters(or_f, resourses, res_av_keys)
    print("################################################")
    print("Failed: %s" % failes)
    print("################################################")

-def check_filters(conds, resourses):
+def check_filters(conds, resourses, res_av_keys):
    for cond in conds:
        if cond.get("resource") in resourses:
            names_values = resourses[cond.get("resource")]
        else:
            names_values = {}
            resourses[cond.get("resource")] = names_values
        name = cond.get("name")
+        # check if the key exists;
+        # add it only if the key is in the available resource's keys
+        if not name.strip().lower() in res_av_keys[cond.get("resource")]:
+            continue
        value = cond.get("value")
        if name in names_values:
            names_values[name].append(value)
        else:
            names_values[name] = [value]

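An illustrative call; the filter structure is inferred from the parsing code above, and the key lists are made up:

    resourses = {}
    res_av_keys = {"image": ["organism", "gene symbol"]}
    conds = [
        {"resource": "image", "name": "Organism", "value": "Homo sapiens"},
        {"resource": "image", "name": "Unknown Key", "value": "x"},  # skipped
    ]
    check_filters(conds, resourses, res_av_keys)
    print(resourses)  # {'image': {'Organism': ['Homo sapiens']}}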

-def write_reports(resourses, return_file_content, file_name):
-    columns = ["key", "total hits", "unique hits"]
+def write_reports(resourses, return_file_content, file_name, max_top_values=5):
+    columns = [
+        "Keys \n (Shows no. of attempted searches)",
+        "total hits \n(No. of KVP searches)",
+        "unique hits \n(No. of unique KVPs searched for)",
+    ]
+    if max_top_values != 0:
+        if max_top_values == "all":
+            columns.append("All searched values")
+        elif max_top_values > 1:
+            columns.append("Top %s searched values" % max_top_values)
+        else:
+            columns.append("Top searched value")

    import io

    out_io = io.BytesIO()
    writer = pd.ExcelWriter(out_io, engine="xlsxwriter")
+    # writer.book.formats[0].set_text_wrap()
+    containers_lines = {}
+    containers = ["project", "screen"]
    for res, terms in resourses.items():
+        res_ = res
        lines = []
+        if res in containers:
+            if res in containers_lines:
+                lines = containers_lines[res]
+            else:
+                containers_lines[res] = lines
+
        for name, values in terms.items():
            line = [name]
            lines.append(line)
-            vv = []
+            vv_ = {}
            for val in values:
-                if val not in vv:
-                    vv.append(val)
+                if val not in vv_:
+                    vv_[val] = 1
+                else:
+                    vv_[val] = vv_[val] + 1
+            vv = sorted(vv_.items(), key=lambda kv: kv[1])
+            top_searchvalues = ""
+            if max_top_values == "all":
+                max_top_values = len(vv)
+
+            for i in range(max_top_values):
+                index = len(vv) - 1 - i
+                if index < 0:
+                    break
+                if top_searchvalues:
+                    top_searchvalues = top_searchvalues + ", %s:%s" % (
+                        vv[index][0],
+                        vv[index][1],
+                    )
+                else:
+                    top_searchvalues = "%s:%s" % (vv[index][0], vv[index][1])
            line.insert(1, len(values))
            line.insert(2, len(vv))

+            if max_top_values != 0:
+                line.insert(3, "{%s}" % top_searchvalues)
+
+        if res in containers:
+            if len(containers_lines) > 1:
+                lines = adjust_containers(containers_lines)
+                res_ = "container"
+            else:
+                continue
        df = pd.DataFrame(lines, columns=columns)
-        df2 = df.sort_values(by=["total hits", "unique hits"], ascending=[False, False])
-        df2.to_excel(writer, index=False, sheet_name=res)
-        adjust_colunms_width(writer.sheets[res], columns, df2)
-        insert_chart(writer, res, df2, len(lines))
+        df2 = df.sort_values(by=[columns[1], columns[2]], ascending=[False, False])
+        df2.to_excel(
+            writer,
+            index=False,
+            sheet_name=res_,
+        )
+        adjust_colunms_width(writer.sheets[res_], columns, df2)
+        insert_chart(writer, res_, df2, len(lines))

-    writer.save()
+    writer.close()
    if return_file_content:
        return out_io
    with open(file_name, "wb") as out:
@@ -118,6 +191,9 @@ def write_reports(resourses, return_file_content, file_name):

def adjust_colunms_width(worksheet, columns, df2):
    for idx, col in enumerate(df2.columns):
+        if idx == 3:
+            worksheet.set_column(idx, idx, 30 + 1)
+            continue
        series = df2[col]
        max_width = len(columns[idx])
        max_width_ = [len(str(s)) for s in series if len(str(s)) > max_width]
@@ -139,3 +215,14 @@ def insert_chart(writer, sheet, df, no_points):
            }
        )
    sheet_obj.insert_chart(2, 5, chart)


+def adjust_containers(containers_lines):
+    lines = []
+    added_keys = []
+    for res, lines_ in containers_lines.items():
+        for line in lines_:
+            if not line[0].strip().lower() in added_keys:
+                lines.append(line)
+                added_keys.append(line[0].lower().strip())
+    return lines
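
For comparison, the tallying in ``write_reports`` above could be expressed with ``collections.Counter``; a sketch, not part of the commit:

    from collections import Counter

    values = ["GFP", "DAPI", "GFP", "GFP", "DAPI", "mCherry"]
    counts = Counter(values)
    top = counts.most_common(2)  # e.g. max_top_values = 2
    print(", ".join("%s:%s" % (v, c) for v, c in top))  # GFP:3, DAPI:2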
7 changes: 3 additions & 4 deletions tools/utils/util.py
@@ -26,17 +26,16 @@

def copy_tools_subfolder():
    """
-    Copy the maintenance_scripts folder to the searchengine folder
+    Copy the maintenance_scripts folder to the maintenance_scripts
+    inside the searchengine folder
    """
    subfolder = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), "../maintenance_scripts"
    )
    destination_folder = "/etc/searchengine/"
    if not os.path.isdir(destination_folder):
        destination_folder = os.path.expanduser("~")
-    destination_folder = os.path.join(
-        destination_folder, "searchengine/maintenance_scripts"
-    )
+    destination_folder = os.path.join(destination_folder, "maintenance_scripts")

    if not os.path.isdir(destination_folder):
        shutil.copytree(subfolder, destination_folder)
