From ec4b03ca51fe6466cccbe50dda234b6ccac3424f Mon Sep 17 00:00:00 2001 From: Kirk Wang Date: Wed, 12 Jul 2023 13:29:38 -0700 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=90=9B=20Correct=20search=20count?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A funky bug was observed: on the parent work of a PDF, the search count was off by one. OCR hits were registered as both an OCR hit and a metadata hit. This is likely a side effect of adding snippets to the search, since the text of every file set also needs to be indexed on the parent work. The parent work essentially doubles all the text found. This commit is a bit hacky, but it removes that extra hit while keeping functionality for both OCR hits and metadata hits. This is a bit of future-proofing, since the problem only occurs in applications with snippets enabled. --- .../iiif_search_response_decorator.rb | 4 +++ .../annotation_decorator.rb | 26 +++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/app/models/iiif_print/iiif_search_response_decorator.rb b/app/models/iiif_print/iiif_search_response_decorator.rb index 8aa96967..76ad70ba 100644 --- a/app/models/iiif_print/iiif_search_response_decorator.rb +++ b/app/models/iiif_print/iiif_search_response_decorator.rb @@ -6,6 +6,10 @@ def annotation_list json_results = super resources = json_results&.[]('resources') + resources.delete_if do |resource| + resource["on"].include?(IiifPrint::BlacklightIiifSearch::AnnotationDecorator::INVALID_MATCH_TEXT) + end + resources&.each do |result_hit| next if result_hit['resource'].present? 
result_hit['resource'] = { diff --git a/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb b/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb index f939e2db..91b333ff 100644 --- a/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +++ b/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb @@ -2,6 +2,7 @@ module IiifPrint module BlacklightIiifSearch module AnnotationDecorator + INVALID_MATCH_TEXT = "xywh=INVALID,INVALID,INVALID,INVALID".freeze ## # Create a URL for the annotation # use a Hyrax-y URL syntax: @@ -28,16 +29,23 @@ def canvas_uri_for_annotation # @return [String] def coordinates return default_coords if query.blank? - coords_json = fetch_and_parse_coords - return default_coords unless coords_json && coords_json['coords'] + sanitized_query = query.match(additional_query_terms_regex)[1].strip + coords_json = fetch_and_parse_coords + + coords_check_result = check_coords_json_and_properties(coords_json, sanitized_query) + return coords_check_result if coords_check_result + query_terms = sanitized_query.split(' ').map(&:downcase) + matches = coords_json['coords'].select do |k, _v| k.downcase =~ /(#{query_terms.join('|')})/ end return default_coords if matches.blank? + coords_array = matches.values.flatten(1)[hl_index] return default_coords unless coords_array + "#xywh=#{coords_array.join(',')}" end @@ -54,6 +62,20 @@ def fetch_and_parse_coords end end + # This is a bit hacky but it is checking if any of the properties contain the query term + # if there are no coords and there is a metadata property match + # then we return the default coords + # else we insert a invalid match text to be stripped out at a later point + # @see IiifPrint::IiifSearchResponseDecorator#annotation_list + def check_coords_json_and_properties(coords_json, sanitized_query) + return if coords_json && coords_json['coords'] + + properties = @document.keys.select { |key| key.ends_with? 
"_tesim" } + properties.each { |property| return default_coords if @document[property].join.downcase.include?(sanitized_query) } + + INVALID_MATCH_TEXT + end + ## # a default set of coordinates # @return [String] From e175a8318466c29d460072cdb8ddc1df77f4fa8d Mon Sep 17 00:00:00 2001 From: Kirk Wang Date: Wed, 12 Jul 2023 22:07:53 -0700 Subject: [PATCH 2/3] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20and=20fix?= =?UTF-8?q?=20annotation=20count?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit will refactor #annotation_list by breaking it down a bit and also adjusts and compensates the json for removing the invalid hit. --- .../iiif_search_response_decorator.rb | 31 ++++++++++++++++--- .../annotation_decorator.rb | 15 +++++++-- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/app/models/iiif_print/iiif_search_response_decorator.rb b/app/models/iiif_print/iiif_search_response_decorator.rb index 76ad70ba..cf66fc70 100644 --- a/app/models/iiif_print/iiif_search_response_decorator.rb +++ b/app/models/iiif_print/iiif_search_response_decorator.rb @@ -4,20 +4,41 @@ module IiifSearchResponseDecorator # @see https://github.com/scientist-softserv/louisville-hyku/commit/67467e5cf9fdb755f54419f17d3c24c87032d0af def annotation_list json_results = super - resources = json_results&.[]('resources') - resources.delete_if do |resource| - resource["on"].include?(IiifPrint::BlacklightIiifSearch::AnnotationDecorator::INVALID_MATCH_TEXT) + # Break down the json_results for easy access + resources = json_results['resources'] + hits = json_results['hits'] + within = json_results['within'] + + # Check and process invalid hit + if resources + remove_invalid_hit(resources, hits, within) + add_metadata_match(resources) end - resources&.each do |result_hit| + json_results + end + + def remove_invalid_hit(resources, hits, within) + invalid_hit = resources.detect { |resource| 
resource["on"].include?(IiifPrint::BlacklightIiifSearch::AnnotationDecorator::INVALID_MATCH_TEXT) } + return unless invalid_hit + + # Delete invalid hit from resources, remove first hit (which is from the invalid hit), decrement total within + resources.delete(invalid_hit) + hits.shift + within['total'] -= 1 + end + + def add_metadata_match(resources) + resources.each do |result_hit| next if result_hit['resource'].present? + + # Add resource details if not present result_hit['resource'] = { "@type": "cnt:ContentAsText", "chars": "Metadata match, see sidebar for details" } end - json_results end end end diff --git a/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb b/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb index 91b333ff..542475bb 100644 --- a/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +++ b/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb @@ -2,7 +2,7 @@ module IiifPrint module BlacklightIiifSearch module AnnotationDecorator - INVALID_MATCH_TEXT = "xywh=INVALID,INVALID,INVALID,INVALID".freeze + INVALID_MATCH_TEXT = "#xywh=INVALID,INVALID,INVALID,INVALID".freeze ## # Create a URL for the annotation # use a Hyrax-y URL syntax: @@ -30,7 +30,7 @@ def canvas_uri_for_annotation def coordinates return default_coords if query.blank? 
- sanitized_query = query.match(additional_query_terms_regex)[1].strip + sanitized_query = sanitize_query coords_json = fetch_and_parse_coords coords_check_result = check_coords_json_and_properties(coords_json, sanitized_query) @@ -49,6 +49,10 @@ def coordinates "#xywh=#{coords_array.join(',')}" end + def sanitize_query + query.match(additional_query_terms_regex)[1].strip + end + ## # return the JSON word-coordinates file contents # @return [JSON] @@ -119,6 +123,13 @@ def file_set_id def additional_query_terms_regex /(.*)(?= AND (\(.+\)|\w+)$)/ end + + ## + # @return [IIIF::Presentation::Resource] + def text_resource_for_annotation + IIIF::Presentation::Resource.new('@type' => 'cnt:ContentAsText', + 'chars' => sanitize_query) + end end end end From 660d98bb7a3cfe7e02719bc7c41236f51288a2eb Mon Sep 17 00:00:00 2001 From: Kirk Wang Date: Thu, 13 Jul 2023 12:00:24 -0700 Subject: [PATCH 3/3] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20#coordinate?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit will bring in review changes and make the method clearer to read. 
--- .../iiif_search_response_decorator.rb | 22 ++++++++----------- .../annotation_decorator.rb | 17 +++++++------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/app/models/iiif_print/iiif_search_response_decorator.rb b/app/models/iiif_print/iiif_search_response_decorator.rb index cf66fc70..e8b399aa 100644 --- a/app/models/iiif_print/iiif_search_response_decorator.rb +++ b/app/models/iiif_print/iiif_search_response_decorator.rb @@ -5,32 +5,28 @@ module IiifSearchResponseDecorator def annotation_list json_results = super - # Break down the json_results for easy access - resources = json_results['resources'] - hits = json_results['hits'] - within = json_results['within'] - # Check and process invalid hit - if resources - remove_invalid_hit(resources, hits, within) - add_metadata_match(resources) + if json_results&.[]('resources') + remove_invalid_hit(json_results) + add_metadata_match(json_results) end json_results end - def remove_invalid_hit(resources, hits, within) + def remove_invalid_hit(json_results) + resources = json_results['resources'] invalid_hit = resources.detect { |resource| resource["on"].include?(IiifPrint::BlacklightIiifSearch::AnnotationDecorator::INVALID_MATCH_TEXT) } return unless invalid_hit # Delete invalid hit from resources, remove first hit (which is from the invalid hit), decrement total within resources.delete(invalid_hit) - hits.shift - within['total'] -= 1 + json_results['hits'].shift + json_results['within']['total'] -= 1 end - def add_metadata_match(resources) - resources.each do |result_hit| + def add_metadata_match(json_results) + json_results['resources'].each do |result_hit| next if result_hit['resource'].present? 
# Add resource details if not present diff --git a/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb b/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb index 542475bb..b52c43bf 100644 --- a/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +++ b/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb @@ -32,9 +32,7 @@ def coordinates sanitized_query = sanitize_query coords_json = fetch_and_parse_coords - - coords_check_result = check_coords_json_and_properties(coords_json, sanitized_query) - return coords_check_result if coords_check_result + return derived_coords_json_and_properties(sanitized_query) unless coords_json && coords_json['coords'] query_terms = sanitized_query.split(' ').map(&:downcase) @@ -71,13 +69,16 @@ def fetch_and_parse_coords # then we return the default coords # else we insert a invalid match text to be stripped out at a later point # @see IiifPrint::IiifSearchResponseDecorator#annotation_list - def check_coords_json_and_properties(coords_json, sanitized_query) - return if coords_json && coords_json['coords'] + def derived_coords_json_and_properties(sanitized_query) + property = @document.keys.detect do |key| + (key.ends_with?("_tesim") || key.ends_with?("_tsim")) && property_includes_sanitized_query?(key, sanitized_query) + end - properties = @document.keys.select { |key| key.ends_with? "_tesim" } - properties.each { |property| return default_coords if @document[property].join.downcase.include?(sanitized_query) } + property ? default_coords : INVALID_MATCH_TEXT + end - INVALID_MATCH_TEXT + def property_includes_sanitized_query?(property, sanitized_query) + @document[property].join.downcase.include?(sanitized_query) end ##