diff --git a/bindings/python/Makefile b/bindings/python/Makefile index d55523af..940cd5bb 100644 --- a/bindings/python/Makefile +++ b/bindings/python/Makefile @@ -1,7 +1,7 @@ default: build .PHONY: -edlib: +edlib: $(shell find ../../edlib) # create a clean (maybe updated) copy of edlib src rm -rf edlib && cp -r ../../edlib . diff --git a/bindings/python/edlib.pyx b/bindings/python/edlib.pyx index a4de39b0..440841ba 100644 --- a/bindings/python/edlib.pyx +++ b/bindings/python/edlib.pyx @@ -32,30 +32,27 @@ def _map_to_bytes(query, target, additional_equalities): query_bytes = _map_ascii_string(query) target_bytes = _map_ascii_string(target) except NeedsAlphabetMapping: - # Map non-ascii symbols into an ASCII alphabet so it can be used - # in the C++ code - query_vals = set(query) - target_vals = set(target) - input_mapping = { - c: chr(idx) - for idx, c in enumerate(query_vals.union(target_vals)) - } - if len(input_mapping) > 256: + # Map elements of alphabet to chars from 0 up to 255, so that Edlib can work with them, + # since C++ Edlib needs chars. + alphabet = set(query).union(set(target)) + if len(alphabet) > 256: raise ValueError( "query and target combined have more than 256 unique values, " "this is not supported.") - map_seq = lambda seq: ''.join(input_mapping[x] for x in seq).encode('ascii') + alphabet_to_byte_mapping = { + c: idx.to_bytes(1, byteorder='big') for idx, c in enumerate(alphabet) + } + map_seq = lambda seq: b''.join(alphabet_to_byte_mapping[c] for c in seq) query_bytes = map_seq(query) target_bytes = map_seq(target) if additional_equalities is not None: additional_equalities = [ - (input_mapping[a], input_mapping[b]) + (alphabet_to_byte_mapping[a].decode('utf-8'), alphabet_to_byte_mapping[b].decode('utf-8')) for a, b in additional_equalities - if a in input_mapping and b in input_mapping] + if a in alphabet_to_byte_mapping and b in alphabet_to_byte_mapping] return query_bytes, target_bytes, additional_equalities - def align(query, target, mode="NW", task="distance", k=-1, additionalEqualities=None): """ Align query with target using edit distance. @param {str or bytes or iterable of hashable objects} query, combined with target must have no more diff --git a/bindings/python/test.py b/bindings/python/test.py index 558f8d86..cdba91e9 100644 --- a/bindings/python/test.py +++ b/bindings/python/test.py @@ -68,6 +68,17 @@ result = edlib.align("telephone", "", mode="SHW") testFailed = testFailed or (not (result and result["editDistance"] == 9)) +# Unicode characters +result = edlib.align("ты милая", "ты гений") +testFailed = testFailed or (not (result and result["editDistance"] == 5 and result["alphabetLength"] == 12)) + +# Long alphabet. +long_alphabet = ''.join([chr(idx) for idx in range(1, 257)]) +long_seq1 = long_alphabet * 3 +long_seq2 = long_alphabet + long_alphabet[::-1] + long_alphabet +result = edlib.align(long_seq1, long_seq2) +testFailed = testFailed or (not (result and result["editDistance"] == 256)) + if testFailed: print("Some of the tests failed!") else: diff --git a/edlib/src/edlib.cpp b/edlib/src/edlib.cpp index e843501a..e19f1194 100644 --- a/edlib/src/edlib.cpp +++ b/edlib/src/edlib.cpp @@ -361,7 +361,7 @@ static inline Word* buildPeq(const int alphabetLength, Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks]; // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s - for (unsigned char symbol = 0; symbol <= alphabetLength; symbol++) { + for (int symbol = 0; symbol <= alphabetLength; symbol++) { for (int b = 0; b < maxNumBlocks; b++) { if (symbol < alphabetLength) { Peq[symbol * maxNumBlocks + b] = 0;