Skip to content

Commit

Permalink
Update search_terms endpoints to return all the resources, add links …
Browse files Browse the repository at this point in the history
…to the buckets for metadata and return excel files instead of csv
  • Loading branch information
khaledk2 committed Nov 8, 2023
1 parent e550a93 commit 5ee8a55
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 55 deletions.
4 changes: 2 additions & 2 deletions manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,14 +320,14 @@ def test_indexing_search_query(
from omero_search_engine.validation.results_validator import (
validate_queries,
test_no_images,
get_omero_stats,
# get_omero_stats,
get_no_images_sql_containers,
)

validate_queries(json_file, deep_check)
if check_studies:
test_no_images()
get_omero_stats()
# get_omero_stats()
get_no_images_sql_containers()


Expand Down
46 changes: 26 additions & 20 deletions omero_search_engine/api/stats/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from . import stats
from tools.utils.logs_analyser import get_search_terms
from flask import Response, send_file
from flask import Response, send_file, request
from omero_search_engine import search_omero_app
import os
from omero_search_engine.validation.results_validator import (
Expand All @@ -32,32 +32,38 @@ def index():
return "OMERO search engine (stats API)"


@stats.route("/search_terms", methods=["GET"])
def search_terms():
    """
    Scan the search-engine log files to extract the submitted search terms.

    Returns an Excel workbook sent as an attachment; it contains the search
    terms and their number of hits and unique hits, with one sheet per
    resource inside the workbook.
    """
    logs_folder = search_omero_app.config.get("SEARCHENGINE_LOGS_FOLDER")
    # get_search_terms returns an in-memory buffer holding the workbook
    # when return_file_content is True.
    content = get_search_terms(logs_folder, return_file_content=True)
    headers = {
        "Content-Disposition": "attachment; filename=searchterms.xlsx",
        "Content-type": "application/vnd.ms-excel",
    }
    return Response(
        content.getvalue(), mimetype="application/vnd.ms-excel", headers=headers
    )


@stats.route("/metadata", methods=["GET"])
def get_metadata():
    """
    Search the database to extract metadata about each resource for the
    common terms.

    Returns an Excel workbook (as an attachment) which contains each
    attribute and its number of buckets, in addition to a link to the
    buckets. The workbook is generated on first request and served from
    disk on subsequent requests.
    """
    base_folder = "/etc/searchengine/"
    if not os.path.isdir(base_folder):
        # Fall back to the user's home directory when the default
        # searchengine folder does not exist.
        base_folder = os.path.expanduser("~")
    metadata = os.path.join(base_folder, "metadata.xlsx")
    if not os.path.isfile(metadata):
        # Derive the v1 resources base URL from the current request URL so
        # the generated bucket links point back at this deployment.
        base_url = request.url.replace("stats/metadata", "v1/resources/")
        get_omero_stats(base_url=base_url)
    return send_file(metadata, as_attachment=True)
42 changes: 27 additions & 15 deletions omero_search_engine/validation/results_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
projects_count,
)
import os
import pandas as pd

query_methods = {
"image": query_images_key_value,
Expand Down Expand Up @@ -630,31 +631,39 @@ def test_no_images():
"""


def get_omero_stats(return_contents=False):
values = ["Resource", "Attribute", "No. buckets", "Total number"]
def get_omero_stats(base_url=None):
values = ["Resource", "Attribute", "No. buckets", "Buckets' URL", "Total number"]
base_folder = "/etc/searchengine/"
if not os.path.isdir(base_folder):
base_folder = os.path.expanduser("~")
metadata_file = os.path.join(base_folder, "metadata.csv")
metadata_file = os.path.join(base_folder, "metadata.xlsx")

from omero_search_engine.api.v1.resources.resource_analyser import (
get_restircted_search_terms,
query_cashed_bucket,
)

data = []
all_data = {}
terms = get_restircted_search_terms()
for resource, names in terms.items():
data = []
all_data[resource] = data
for name in names:
if name == "name":
continue
returned_results = query_cashed_bucket(name, resource)
if base_url:
url_ = "%s/%s/searchvaluesusingkey/?key=%s" % (base_url, resource, name)
url = '=HYPERLINK("%s")' % url_
else:
url = ""
if resource == "image":
data.append(
[
resource,
name,
returned_results.get("total_number_of_buckets"),
url,
returned_results.get("total_number_of_image"),
]
)
Expand All @@ -665,22 +674,25 @@ def get_omero_stats(return_contents=False):
resource,
name,
returned_results.get("total_number_of_buckets"),
url,
returned_results.get(kk),
]
)

for dat in returned_results.get("data"):
if not dat["Value"]:
print("Value is empty string", dat["Key"])
import pandas as pd
# for dat in returned_results.get("data"):
# if not dat["Value"]:
# print("Value is empty string", dat["Key"])
writer = pd.ExcelWriter(metadata_file, engine="xlsxwriter")
for resource, data_ in sorted(all_data.items(), reverse=False):
df = pd.DataFrame(data_, columns=values)
df2 = df.sort_values(by=["Resource", "No. buckets"], ascending=[True, False])
df2.to_excel(writer, sheet_name=resource, index=False)
worksheet = writer.sheets[resource]
from tools.utils.logs_analyser import adjust_colunms_width

df = pd.DataFrame(data, columns=values)
df2 = df.sort_values(by=["Resource", "No. buckets"], ascending=[True, False])
report = df2.to_csv()
with open(metadata_file, "w") as f:
f.write(report)
if return_contents:
return report
adjust_colunms_width(worksheet, values, df2)

writer.save()


def get_no_images_sql_containers():
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ SQLAlchemy==1.4.21
vine==5.0.0
wcwidth==0.2.5
Werkzeug==1.0.1
XlsxWriter==3.1.9
47 changes: 29 additions & 18 deletions tools/utils/logs_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import sys
import logging
import pandas as pd

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

Expand All @@ -12,7 +13,7 @@
"""


def get_search_terms(folder_name, resource=None, return_file_content=False):
def get_search_terms(folder_name, return_file_content=False):
logging.info("checking files inside: %s" % folder_name)
resourses = {}
for root, dirs, files in os.walk(folder_name):
Expand All @@ -26,7 +27,6 @@ def get_search_terms(folder_name, resource=None, return_file_content=False):
logging.info("Write the reports")
contents = write_reports(
resourses,
resource,
return_file_content,
os.path.join(folder_name, "report.csv"),
)
Expand Down Expand Up @@ -84,11 +84,15 @@ def check_filters(conds, resourses):
names_values[name] = [value]


def write_reports(resourses, resource, return_file_content, file_name):
for res, itms in resourses.items():
columns = ["key", "total hits", "unique hits"]
def write_reports(resourses, return_file_content, file_name):
columns = ["key", "total hits", "unique hits"]
import io

out_io = io.BytesIO()
writer = pd.ExcelWriter(out_io, engine="xlsxwriter")
for res, terms in resourses.items():
lines = []
for name, values in itms.items():
for name, values in terms.items():
line = [name]
lines.append(line)
vv = []
Expand All @@ -97,18 +101,25 @@ def write_reports(resourses, resource, return_file_content, file_name):
vv.append(val)
line.insert(1, len(values))
line.insert(2, len(vv))
import pandas as pd

df = pd.DataFrame(lines, columns=columns)
df2 = df.sort_values(by=["total hits", "unique hits"], ascending=[False, False])
contents = df2.to_csv()
if return_file_content:
if res == resource:
logging.info("================================")
logging.info("%s, %s" % (resource, return_file_content))
logging.info("================================")
return contents
else:
f = open(file_name.replace(".csv", "_%s.csv" % res), "w")
f.write(contents)
f.close()
df2.to_excel(writer, index=False, sheet_name=res)
adjust_colunms_width(writer.sheets[res], columns, df2)

writer.save()
writer.close()
if return_file_content:
return out_io
with open(file_name, "wb") as out:
out.write(out_io.getvalue())


def adjust_colunms_width(worksheet, columns, df2):
    """Widen each worksheet column to fit its header and its longest cell.

    For every column of *df2*, sets the corresponding worksheet column
    width to one more than the larger of the header label's length (taken
    from *columns*) and the longest stringified cell value.
    """
    for position, column_name in enumerate(df2.columns):
        cell_widths = [len(str(value)) for value in df2[column_name]]
        # The header label itself also constrains the minimum width.
        cell_widths.append(len(columns[position]))
        worksheet.set_column(position, position, max(cell_widths) + 1)

0 comments on commit 5ee8a55

Please sign in to comment.