From df83556c2d8da13652cfd3a8f03f4805673142bf Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Tue, 14 May 2024 15:36:37 +0000 Subject: [PATCH 01/11] Optimize obj_id_bitmask by using its own query Having the column oriented responses from OMERO.tables transposed into rows is quite the performance hit for the bitmask endpoint. The endpoint also works only on one column the batching and various other semantics of perform_table_query() do not apply. Consequently, the implementation has been rewritten to use its own query and the bit packing has been further optimized based on new input. --- omeroweb/webgateway/views.py | 51 ++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index c9d5b94ff8..ebeaabcec0 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3228,42 +3228,35 @@ def obj_id_bitmask(request, fileid, conn=None, query=None, **kwargs): else None ) - rsp_data = perform_table_query( - conn, - fileid, - query, - [col_name], - offset=offset, - limit=limit, - lazy=False, - check_max_rows=False, - ) - if "error" in rsp_data: - return rsp_data + ctx = conn.createServiceOptsDict() + ctx.setOmeroGroup("-1") + sr = conn.getSharedResources() + table = sr.openTable(omero.model.OriginalFileI(fileid, False), ctx) + if not table: + return dict(error="Table %s not found" % fileid) + + column_names = [column.name for column in table.getHeaders()] + if col_name not in column_names: + return dict(error="Unknown column %s" % col_name) try: - data = rowsToByteArray(rsp_data["data"]["rows"]) - return HttpResponse(bytes(data), content_type="application/octet-stream") + row_numbers = table.getWhereList(query, None, 0, 0, 0) + (column,) = table.slice([column_names.index(col_name)], row_numbers).columns + return HttpResponse( + column_to_packed_bits(column), content_type="application/octet-stream" + ) except ValueError: logger.error("ValueError when getting obj_id_bitmask") return {"error": "Specified column has invalid type"} -def rowsToByteArray(rows): - maxval = 0 - if len(rows) > 0 and isinstance(rows[0][0], float): +def column_to_packed_bits(column): + if len(column.values) > 0 and isinstance(column.values[0], float): raise ValueError("Cannot have ID of float") - for obj in rows: - obj_id = int(obj[0]) - maxval = max(obj_id, maxval) - bitArray = numpy.zeros(maxval + 1, dtype="uint8") - for obj in rows: - obj_id = int(obj[0]) - bitArray[obj_id] = 1 - packed = numpy.packbits(bitArray, bitorder="big") - data = bytearray() - for val in packed: - data.append(val) - return data + # Coerce strings to uint64 if required + indexes = numpy.array(column.values, dtype="uint64") + bits = numpy.zeros(int(indexes.max() + 1), dtype="uint8") + bits[indexes] = 1 + return numpy.packbits(bits, bitorder="big").tobytes() @login_required() From 8727fbeb4abf874268ad2a06d6d89261910b01bf Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Tue, 14 May 2024 22:19:12 +0000 Subject: [PATCH 02/11] Update test case --- test/unit/test_webgateway.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/unit/test_webgateway.py b/test/unit/test_webgateway.py index 65ee3b8598..e634074a2a 100644 --- a/test/unit/test_webgateway.py +++ b/test/unit/test_webgateway.py @@ -4,9 +4,11 @@ import time import os +import numpy import pytest from django.http import HttpResponseBadRequest +from omero.columns import LongColumnI from omeroweb.webgateway.webgateway_cache import FileCache, WebGatewayCache from omeroweb.webgateway.webgateway_cache import WebGatewayTempFile from omeroweb.webgateway import views @@ -389,9 +391,10 @@ def testJsonCache(self): class TestViews(object): - def testRowstoByteArray(self): - rows = [[1], [2], [7], [11], [12]] - data = views.rowsToByteArray(rows) + def testColumnToPackedBits(self): + column = LongColumnI("test") + column.values = [1, 2, 7, 11, 12] + data = numpy.frombuffer(views.column_to_packed_bits(column), dtype="uint8") assert data[0] == 97 # 01100001 First, Second and 7th bits assert data[1] == 24 # 00011000 11th and 12th bits From 1d55d5657bbd2c992ac1bfd28bd64ec70a52cea6 Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Wed, 15 May 2024 08:35:38 +0000 Subject: [PATCH 03/11] Fix stepping --- omeroweb/webgateway/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index ebeaabcec0..bbcfde528b 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3239,7 +3239,7 @@ def obj_id_bitmask(request, fileid, conn=None, query=None, **kwargs): if col_name not in column_names: return dict(error="Unknown column %s" % col_name) try: - row_numbers = table.getWhereList(query, None, 0, 0, 0) + row_numbers = table.getWhereList(query, None, 0, 0, 1) (column,) = table.slice([column_names.index(col_name)], row_numbers).columns return HttpResponse( column_to_packed_bits(column), content_type="application/octet-stream" From d14b6e254f625eed8c545db3a3dfdc9019e8f54f Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Wed, 22 May 2024 19:57:04 +0000 Subject: [PATCH 04/11] Close table --- omeroweb/webgateway/views.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index bbcfde528b..8aeabef908 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3235,10 +3235,10 @@ def obj_id_bitmask(request, fileid, conn=None, query=None, **kwargs): if not table: return dict(error="Table %s not found" % fileid) - column_names = [column.name for column in table.getHeaders()] - if col_name not in column_names: - return dict(error="Unknown column %s" % col_name) try: + column_names = [column.name for column in table.getHeaders()] + if col_name not in column_names: + return dict(error="Unknown column %s" % col_name) row_numbers = table.getWhereList(query, None, 0, 0, 1) (column,) = table.slice([column_names.index(col_name)], row_numbers).columns return HttpResponse( @@ -3247,6 +3247,8 @@ def obj_id_bitmask(request, fileid, conn=None, query=None, **kwargs): except ValueError: logger.error("ValueError when getting obj_id_bitmask") return {"error": "Specified column has invalid type"} + finally: + table.close() def column_to_packed_bits(column): From ec99a1d1814e77ac032d55cf8c6767a0c749a24f Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Wed, 22 May 2024 19:57:39 +0000 Subject: [PATCH 05/11] Use max table download rows for batch size --- omeroweb/webgateway/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index 8aeabef908..109b5a1dfa 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3044,7 +3044,7 @@ def perform_table_query( def row_generator(table, h): # hits are all consecutive rows - can load them in batches idx = 0 - batch = 1000 + batch = settings.MAX_TABLE_DOWNLOAD_ROWS while idx < len(h): batch = min(batch, len(h) - idx) res = table.slice(col_indices, h[idx : idx + batch]) From 8c96eda946561bbd0724e79bb5c5570eb29d72ed Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Mon, 17 Jun 2024 10:43:51 +0000 Subject: [PATCH 06/11] Try to pin numpy < 2.0.0 --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index c943243656..648cbd9426 100644 --- a/tox.ini +++ b/tox.ini @@ -11,7 +11,7 @@ requires = pip >= 19.0.0 deps = setuptools pre-commit - numpy>=1.9 + numpy>=1.9,<2.0 pytest PyYAML tables From f4a0849900c8b9b37cf17df8d450982b97b712e8 Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Wed, 19 Jun 2024 09:56:03 +0000 Subject: [PATCH 07/11] Revert "Use max table download rows for batch size" This reverts commit ec99a1d1814e77ac032d55cf8c6767a0c749a24f. --- omeroweb/webgateway/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index 44124592cc..5a8713e564 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3008,7 +3008,7 @@ def perform_table_query( def row_generator(table, h): # hits are all consecutive rows - can load them in batches idx = 0 - batch = settings.MAX_TABLE_DOWNLOAD_ROWS + batch = 1000 while idx < len(h): batch = min(batch, len(h) - idx) res = table.slice(col_indices, h[idx : idx + batch]) From 1a1094c7cb6664bba367554358b5775b0c89429c Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Wed, 19 Jun 2024 10:05:26 +0000 Subject: [PATCH 08/11] Add documentation --- omeroweb/webgateway/views.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index 5a8713e564..e525245008 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3216,6 +3216,10 @@ def obj_id_bitmask(request, fileid, conn=None, query=None, **kwargs): def column_to_packed_bits(column): + """ + Convert a column of integer values (strings will be coerced) to a bit mask + where each value present will be set to 1. + """ if len(column.values) > 0 and isinstance(column.values[0], float): raise ValueError("Cannot have ID of float") # Coerce strings to uint64 if required From b9cd4921b058c60151dcca3fc9ee7b1a91e7bf0b Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Wed, 19 Jun 2024 10:45:29 +0000 Subject: [PATCH 09/11] Be specific with error handling and modernise some formatting --- omeroweb/webgateway/views.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index e525245008..8df7afacc6 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3197,20 +3197,23 @@ def obj_id_bitmask(request, fileid, conn=None, query=None, **kwargs): sr = conn.getSharedResources() table = sr.openTable(omero.model.OriginalFileI(fileid, False), ctx) if not table: - return dict(error="Table %s not found" % fileid) - + return {"error": "Table %s not found" % fileid} try: column_names = [column.name for column in table.getHeaders()] if col_name not in column_names: - return dict(error="Unknown column %s" % col_name) + return {"error": "Unknown column %s" % col_name} row_numbers = table.getWhereList(query, None, 0, 0, 1) (column,) = table.slice([column_names.index(col_name)], row_numbers).columns - return HttpResponse( - column_to_packed_bits(column), content_type="application/octet-stream" - ) - except ValueError: - logger.error("ValueError when getting obj_id_bitmask") - return {"error": "Specified column has invalid type"} + try: + return HttpResponse( + column_to_packed_bits(column), content_type="application/octet-stream" + ) + except ValueError: + logger.error("ValueError when getting obj_id_bitmask") + return {"error": "Specified column has invalid type"} + except Exception: + logger.error("Error when getting obj_id_bitmask", exc_info=True) + return {"error", "Unexpected error getting obj_id_bitmask"} finally: table.close() From 51d8ba745b5f5ec248f29c258a7d14150017e5ce Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Tue, 9 Jul 2024 10:10:49 +0000 Subject: [PATCH 10/11] Use first column if column name is unknown --- omeroweb/webgateway/views.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index 8df7afacc6..b31f0b28b7 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3201,7 +3201,16 @@ def obj_id_bitmask(request, fileid, conn=None, query=None, **kwargs): try: column_names = [column.name for column in table.getHeaders()] if col_name not in column_names: - return {"error": "Unknown column %s" % col_name} + # Previous implementations used perform_table_query() which + # defaults to returning all columns if the requested column name + # is unknown. We would have then packed the first column. We + # mimic that here by only querying for the first column. + # + # FIXME: This is really weird behaviour, especially with this + # endpoint defaulting to using the "object" column, and likely + # deserves to be refactored and deprecated or changed + # accordingly. + col_name = column_names[0] row_numbers = table.getWhereList(query, None, 0, 0, 1) (column,) = table.slice([column_names.index(col_name)], row_numbers).columns try: From 79c23056136bc929e4ebd57d24600780c92921c5 Mon Sep 17 00:00:00 2001 From: Chris Allan Date: Tue, 9 Jul 2024 10:26:29 +0000 Subject: [PATCH 11/11] Don't allow possible negative values to wrap and become massive --- omeroweb/webgateway/views.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/omeroweb/webgateway/views.py b/omeroweb/webgateway/views.py index b31f0b28b7..05d37c2a24 100644 --- a/omeroweb/webgateway/views.py +++ b/omeroweb/webgateway/views.py @@ -3234,8 +3234,10 @@ def column_to_packed_bits(column): """ if len(column.values) > 0 and isinstance(column.values[0], float): raise ValueError("Cannot have ID of float") - # Coerce strings to uint64 if required - indexes = numpy.array(column.values, dtype="uint64") + # Coerce strings to int64 if required. If we have values > 2**63 they + # wouldn't work anyway so signed is okay here. Note that the + # implementation does get weird if the indexes are negative values. + indexes = numpy.array(column.values, dtype="int64") bits = numpy.zeros(int(indexes.max() + 1), dtype="uint8") bits[indexes] = 1 return numpy.packbits(bits, bitorder="big").tobytes()