Skip to content

Commit

Permalink
attempt to solve #10
Browse files Browse the repository at this point in the history
  • Loading branch information
eroux committed Jul 25, 2024
1 parent a158c2c commit 86556dc
Showing 1 changed file with 23 additions and 3 deletions.
26 changes: 23 additions & 3 deletions OpenSearch/indexer-autosuggest.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,35 @@ def get_weight(data):

return int(score * 10000000)

def truncate_label(label, max_length=256):
    """
    Cut the label at a sensible point so it is at most max_length characters.

    The cut is made at the last whitespace boundary that keeps the result
    within the limit, so words are not split. If no usable boundary exists,
    the label is hard-cut at max_length characters.

    Currently only works for whitespace-separated languages (work required
    for Sanskrit and Chinese).

    Args:
        label: the string to shorten.
        max_length: maximum number of characters to keep (default 256).

    Returns:
        The label unchanged when it already fits; otherwise a prefix of the
        label no longer than max_length, without trailing whitespace when
        the cut was made at a word boundary.

    Raises:
        ValueError: if max_length is negative.
    """
    if max_length < 0:
        raise ValueError("max_length must be non-negative")

    # Nothing to do when the label already fits.
    if len(label) <= max_length:
        return label

    # Scan backwards for a whitespace character. Starting at index
    # max_length (one past the last character that may be kept) means a
    # word ending exactly at the limit is kept whole instead of dropped.
    # Index 0 is excluded: cutting there would return an empty label.
    for cutoff in range(max_length, 0, -1):
        if label[cutoff].isspace():
            # Drop the separator(s) so the result does not end in whitespace.
            truncated = label[:cutoff].rstrip()
            if truncated:
                return truncated
            # Only whitespace precedes this point: fall back to a hard cut.
            break

    # No usable word boundary: return the first max_length characters.
    return label[:max_length]

# create variations to begin suggestions at any token
def suggest_me_variations(label_list, weight):
variations = []
for label in label_list:
# remove shads from end and harmonise apostrophes
label = re.sub('/$', '', label.strip())
label = re.sub(r'[:/]+$', '', label.strip())
label = re.sub("[‘’‛′‵ʼʻˈˊˋ`]", "'", label)
label = label[:256]
tokens = re.split('\s+', label)
label = truncate_label(label)
tokens = re.split(r'\s+', label)
length = len(tokens)
if length > 2:
for i in range(0, len(tokens) - 1):
Expand Down

0 comments on commit 86556dc

Please sign in to comment.