diff --git a/manage.py b/manage.py
index 29fabf23..e6fca65f 100644
--- a/manage.py
+++ b/manage.py
@@ -320,14 +320,14 @@ def test_indexing_search_query(
     from omero_search_engine.validation.results_validator import (
         validate_queries,
         test_no_images,
-        get_omero_stats,
+        # get_omero_stats,
         get_no_images_sql_containers,
     )
 
     validate_queries(json_file, deep_check)
     if check_studies:
         test_no_images()
-        get_omero_stats()
+        # get_omero_stats()
         get_no_images_sql_containers()
 
 
diff --git a/omero_search_engine/api/stats/urls.py b/omero_search_engine/api/stats/urls.py
index 4ded2e77..a3e99728 100644
--- a/omero_search_engine/api/stats/urls.py
+++ b/omero_search_engine/api/stats/urls.py
@@ -19,7 +19,7 @@
 
 from . import stats
 from tools.utils.logs_analyser import get_search_terms
-from flask import Response, send_file
+from flask import Response, send_file, request
 from omero_search_engine import search_omero_app
 import os
 from omero_search_engine.validation.results_validator import (
@@ -32,32 +32,38 @@ def index():
     return "OMERO search engine (stats API)"
 
 
-@stats.route("/<resource>/search_terms", methods=["GET"])
-def search_terms(resource):
+@stats.route("/search_terms", methods=["GET"])
+def search_terms():
+    """
+    Scan the log files to extract the search terms and
+    return them as an Excel file that lists each search
+    term with its total and unique hit counts; each
+    resource has its own sheet inside the workbook.
+    """
     logs_folder = search_omero_app.config.get("SEARCHENGINE_LOGS_FOLDER")
-    content = get_search_terms(logs_folder, resource=resource, return_file_content=True)
+    content = get_search_terms(logs_folder, return_file_content=True)
+    headers = {
+        "Content-Disposition": "attachment; filename=searchterms.xlsx",
+        "Content-type": "application/vnd.ms-excel",
+    }
     return Response(
-        content,
-        mimetype="text/csv",
-        headers={
-            "Content-disposition": "attachment; filename=%s_stats.csv" % (resource)
-        },
+        content.getvalue(), mimetype="application/vnd.ms-excel", headers=headers
    )
 
 
 @stats.route("/metadata", methods=["GET"])
 def get_metadata():
+    """
+    Query the database to extract metadata about each
+    resource for the common search terms.
+    It returns an Excel workbook listing each attribute and
+    its number of buckets, plus a link to the buckets.
+    """
     base_folder = "/etc/searchengine/"
     if not os.path.isdir(base_folder):
         base_folder = os.path.expanduser("~")
-    metadata = os.path.join(base_folder, "metadata.csv")
-
-    if os.path.isfile(metadata):
-        return send_file(metadata, as_attachment=True)
-    else:
-        report = get_omero_stats(return_contents=True)
-        return Response(
-            report,
-            mimetype="text/csv",
-            headers={"Content-disposition": "attachment; filename=metadata.csv"},
-        )
+    metadata = os.path.join(base_folder, "metadata.xlsx")
+    if not os.path.isfile(metadata):
+        base_url = request.url.replace("stats/metadata", "v1/resources/")
+        get_omero_stats(base_url=base_url)
+    return send_file(metadata, as_attachment=True)
diff --git a/omero_search_engine/validation/results_validator.py b/omero_search_engine/validation/results_validator.py
index 27bf0ee5..bef377d6 100644
--- a/omero_search_engine/validation/results_validator.py
+++ b/omero_search_engine/validation/results_validator.py
@@ -35,6 +35,7 @@
     projects_count,
 )
 import os
+import pandas as pd
 
 query_methods = {
     "image": query_images_key_value,
@@ -630,31 +631,39 @@ def test_no_images():
     """
 
 
-def get_omero_stats(return_contents=False):
-    values = ["Resource", "Attribute", "No. buckets", "Total number"]
+def get_omero_stats(base_url=None):
+    values = ["Resource", "Attribute", "No. buckets", "Buckets' URL", "Total number"]
     base_folder = "/etc/searchengine/"
     if not os.path.isdir(base_folder):
         base_folder = os.path.expanduser("~")
-    metadata_file = os.path.join(base_folder, "metadata.csv")
+    metadata_file = os.path.join(base_folder, "metadata.xlsx")
 
     from omero_search_engine.api.v1.resources.resource_analyser import (
         get_restircted_search_terms,
         query_cashed_bucket,
     )
 
-    data = []
+    all_data = {}
     terms = get_restircted_search_terms()
     for resource, names in terms.items():
+        data = []
+        all_data[resource] = data
         for name in names:
             if name == "name":
                 continue
             returned_results = query_cashed_bucket(name, resource)
+            if base_url:
+                url_ = "%s/%s/searchvaluesusingkey/?key=%s" % (base_url, resource, name)
+                url = '=HYPERLINK("%s")' % url_
+            else:
+                url = ""
             if resource == "image":
                 data.append(
                     [
                         resource,
                         name,
                         returned_results.get("total_number_of_buckets"),
+                        url,
                         returned_results.get("total_number_of_image"),
                     ]
                 )
@@ -665,22 +674,25 @@
                         resource,
                         name,
                         returned_results.get("total_number_of_buckets"),
+                        url,
                         returned_results.get(kk),
                     ]
                 )
 
-    for dat in returned_results.get("data"):
-        if not dat["Value"]:
-            print("Value is empty string", dat["Key"])
-    import pandas as pd
+    # for dat in returned_results.get("data"):
+    #     if not dat["Value"]:
+    #         print("Value is empty string", dat["Key"])
+    writer = pd.ExcelWriter(metadata_file, engine="xlsxwriter")
+    for resource, data_ in sorted(all_data.items(), reverse=False):
+        df = pd.DataFrame(data_, columns=values)
+        df2 = df.sort_values(by=["Resource", "No. buckets"], ascending=[True, False])
+        df2.to_excel(writer, sheet_name=resource, index=False)
+        worksheet = writer.sheets[resource]
+        from tools.utils.logs_analyser import adjust_colunms_width
 
-    df = pd.DataFrame(data, columns=values)
-    df2 = df.sort_values(by=["Resource", "No. buckets"], ascending=[True, False])
-    report = df2.to_csv()
-    with open(metadata_file, "w") as f:
-        f.write(report)
-    if return_contents:
-        return report
+        adjust_colunms_width(worksheet, values, df2)
+
+    writer.save()
 
 
 def get_no_images_sql_containers():
diff --git a/requirements.txt b/requirements.txt
index cec4cf61..22681e80 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,3 +31,4 @@ SQLAlchemy==1.4.21
 vine==5.0.0
 wcwidth==0.2.5
 Werkzeug==1.0.1
+XlsxWriter==3.1.9
diff --git a/tools/utils/logs_analyser.py b/tools/utils/logs_analyser.py
index 04b86381..50034233 100644
--- a/tools/utils/logs_analyser.py
+++ b/tools/utils/logs_analyser.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import logging
+import pandas as pd
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
@@ -12,7 +13,7 @@
 """
 
 
-def get_search_terms(folder_name, resource=None, return_file_content=False):
+def get_search_terms(folder_name, return_file_content=False):
     logging.info("checking files inside: %s" % folder_name)
     resourses = {}
     for root, dirs, files in os.walk(folder_name):
@@ -26,7 +27,6 @@ def get_search_terms(folder_name, resource=None, return_file_content=False):
     logging.info("Write the reports")
     contents = write_reports(
         resourses,
-        resource,
         return_file_content,
         os.path.join(folder_name, "report.csv"),
     )
@@ -84,11 +84,15 @@ def check_filters(conds, resourses):
             names_values[name] = [value]
 
 
-def write_reports(resourses, resource, return_file_content, file_name):
-    for res, itms in resourses.items():
-        columns = ["key", "total hits", "unique hits"]
+def write_reports(resourses, return_file_content, file_name):
+    columns = ["key", "total hits", "unique hits"]
+    import io
+
+    out_io = io.BytesIO()
+    writer = pd.ExcelWriter(out_io, engine="xlsxwriter")
+    for res, terms in resourses.items():
         lines = []
-        for name, values in itms.items():
+        for name, values in terms.items():
             line = [name]
             lines.append(line)
             vv = []
@@ -97,18 +101,25 @@ def write_reports(resourses, resource, return_file_content, file_name):
                 vv.append(val)
             line.insert(1, len(values))
             line.insert(2, len(vv))
-            import pandas as pd
 
         df = pd.DataFrame(lines, columns=columns)
         df2 = df.sort_values(by=["total hits", "unique hits"], ascending=[False, False])
-        contents = df2.to_csv()
-        if return_file_content:
-            if res == resource:
-                logging.info("================================")
-                logging.info("%s, %s" % (resource, return_file_content))
-                logging.info("================================")
-                return contents
-        else:
-            f = open(file_name.replace(".csv", "_%s.csv" % res), "w")
-            f.write(contents)
-            f.close()
+        df2.to_excel(writer, index=False, sheet_name=res)
+        adjust_colunms_width(writer.sheets[res], columns, df2)
+
+    # close() saves the workbook, so a separate save() call is redundant
+    writer.close()
+    if return_file_content:
+        return out_io
+    with open(file_name.replace(".csv", ".xlsx"), "wb") as out:
+        out.write(out_io.getvalue())
+
+
+def adjust_colunms_width(worksheet, columns, df2):
+    for idx, col in enumerate(df2.columns):
+        series = df2[col]
+        max_width = len(columns[idx])
+        max_width_ = [len(str(s)) for s in series if len(str(s)) > max_width]
+        if len(max_width_) > 0:
+            max_width = max(max_width_)
+        worksheet.set_column(idx, idx, max_width + 1)
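
Reviewer note, illustration only and not part of the patch: write_reports() and the reworked /search_terms handler share one pattern, namely rendering a multi-sheet workbook into an in-memory BytesIO with pandas plus XlsxWriter and then streaming the bytes as an attachment. Below is a minimal, self-contained sketch of that pattern; the Flask app, the build_workbook() helper, and the sample rows are hypothetical stand-ins, not code from this repository.

import io

import pandas as pd
from flask import Flask, Response

app = Flask(__name__)


def build_workbook(resources):
    # One sheet per resource, mirroring write_reports() in the patch.
    out_io = io.BytesIO()
    with pd.ExcelWriter(out_io, engine="xlsxwriter") as writer:
        for res, rows in sorted(resources.items()):
            df = pd.DataFrame(rows, columns=["key", "total hits", "unique hits"])
            df.to_excel(writer, sheet_name=res, index=False)
    return out_io


@app.route("/search_terms", methods=["GET"])
def search_terms():
    # Made-up sample data; the real endpoint derives it from the log files.
    content = build_workbook({"image": [["organism", 10, 4]]})
    headers = {
        "Content-Disposition": "attachment; filename=searchterms.xlsx",
        "Content-type": "application/vnd.ms-excel",
    }
    return Response(
        content.getvalue(), mimetype="application/vnd.ms-excel", headers=headers
    )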
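
Reviewer note, illustration only and not part of the patch: get_omero_stats() relies on two XlsxWriter behaviours. A cell whose string begins with =HYPERLINK(...) is stored as a clickable formula, and worksheet.set_column(), which adjust_colunms_width() wraps, widens a column to fit its contents. A standalone sketch under those assumptions; the file name, URL, and data row are invented for illustration:

import pandas as pd

values = ["Resource", "Attribute", "No. buckets", "Buckets' URL", "Total number"]
url = '=HYPERLINK("http://example.org/api/v1/resources/image/searchvaluesusingkey/?key=organism")'
rows = [["image", "organism", 42, url, 1000]]

with pd.ExcelWriter("metadata.xlsx", engine="xlsxwriter") as writer:
    df = pd.DataFrame(rows, columns=values)
    df.to_excel(writer, sheet_name="image", index=False)
    worksheet = writer.sheets["image"]
    for idx, col in enumerate(df.columns):
        # Fit each column to its widest cell, or to the header when that is
        # wider, which is what adjust_colunms_width() does in the patch.
        width = max([len(str(s)) for s in df[col]] + [len(values[idx])])
        worksheet.set_column(idx, idx, width + 1)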