Skip to content

Commit

Permalink
Merge branch 'secondary_key_playground' of github.com:hendrikmuhs/key…
Browse files Browse the repository at this point in the history
…vi-1 into hendrik/secondary_key_playground
  • Loading branch information
hendrikmuhs committed Jun 18, 2024
2 parents f5ef48a + 7e8e874 commit c3b90a8
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 47 deletions.
5 changes: 4 additions & 1 deletion keyvi/include/keyvi/dictionary/dictionary_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,10 @@ class DictionaryProperties {

/**
 * Size of the transitions, computed as two times the sparse array size.
 *
 * @return sparse_array_size_ multiplied by 2
 */
size_t GetTransitionsSize() const {
  return 2 * sparse_array_size_;
}

size_t GetEndOffset() const { return GetTransitionsOffset() + GetTransitionsSize(); }
/**
 * End offset of the dictionary data.
 *
 * If the value store properties report a non-zero offset, the end is that offset plus the
 * value store size. Otherwise the end is the transitions offset plus the transitions size.
 *
 * @return the computed end offset
 */
size_t GetEndOffset() const {
  const auto value_store_offset = value_store_properties_.GetOffset();

  // an offset of 0 is treated as "no value store offset available"
  if (!value_store_offset) {
    return GetTransitionsOffset() + GetTransitionsSize();
  }

  return value_store_offset + value_store_properties_.GetSize();
}

/**
 * Read-only access to the value store properties held by this object.
 *
 * @return const reference to value_store_properties_
 */
const fsa::internal::ValueStoreProperties& GetValueStoreProperties() const {
  return value_store_properties_;
}

Expand Down
6 changes: 3 additions & 3 deletions keyvi/include/keyvi/dictionary/secondary_key_dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ class SecondaryKeyDictionary final {
* @param loading_strategy optional: Loading strategy to use.
*/
explicit SecondaryKeyDictionary(const std::string& filename,
loading_strategy_types loading_strategy = loading_strategy_types::lazy)
const loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: SecondaryKeyDictionary(std::make_shared<fsa::Automata>(filename, loading_strategy), loading_strategy) {}

explicit SecondaryKeyDictionary(fsa::automata_t f,
loading_strategy_types loading_strategy = loading_strategy_types::lazy)
explicit SecondaryKeyDictionary(const fsa::automata_t& f,
const loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: dictionary_(std::make_shared<Dictionary>(f)) {
std::string properties = dictionary_->GetFsa()->GetDictionaryProperties()->GetSpecializedDictionaryProperties();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class SecondaryKeyDictionaryCompiler final {
* @param secondary_keys a list of secondary keys
* @param params compiler parameters
*/
explicit SecondaryKeyDictionaryCompiler(const std::vector<std::string> secondary_keys,
explicit SecondaryKeyDictionaryCompiler(const std::vector<std::string>& secondary_keys,
const keyvi::util::parameters_t& params = keyvi::util::parameters_t())
: params_(params), dictionary_compiler_(params), secondary_keys_(secondary_keys) {}

Expand Down Expand Up @@ -147,9 +147,9 @@ class SecondaryKeyDictionaryCompiler final {
}

private:
keyvi::util::parameters_t params_;
const keyvi::util::parameters_t params_;
DictionaryCompiler<ValueStoreType> dictionary_compiler_;
std::vector<std::string> secondary_keys_;
const std::vector<std::string> secondary_keys_;
std::map<std::string, std::string> secondary_key_replacements_;
uint64_t current_index_ = 2; // starting from 2, 1 is reserved for empty string
std::vector<char> replacements_buffer_;
Expand Down
49 changes: 46 additions & 3 deletions keyvi/tests/keyvi/dictionary/secondary_key_dictionary_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* Author: hendrik
*/

#include <filesystem>
#include <map>
#include <tuple>
#include <vector>
Expand All @@ -35,7 +36,7 @@ namespace keyvi {
namespace dictionary {
BOOST_AUTO_TEST_SUITE(SecondaryKeyDictionaryTests)

BOOST_AUTO_TEST_CASE(OneSecondaryKey) {
BOOST_AUTO_TEST_CASE(completions) {
std::vector<std::tuple<std::string, std::map<std::string, std::string>, uint32_t>> test_data = {
{"siegfried", {{"company", "acme"}}, 22},
{"walburga", {{"company", "acma"}}, 10},
Expand All @@ -51,10 +52,11 @@ BOOST_AUTO_TEST_CASE(OneSecondaryKey) {
}
compiler.Compile();

boost::filesystem::path temp_path = boost::filesystem::temp_directory_path();
std::filesystem::path temp_path = std::filesystem::temp_directory_path();

temp_path /=
boost::filesystem::unique_path("secondary-key-dictionary-unit-test-dictionarycompiler-%%%%-%%%%-%%%%-%%%%");
boost::filesystem::unique_path("secondary-key-dictionary-unit-test-dictionarycompiler-%%%%-%%%%-%%%%-%%%%")
.string();
std::string file_name = temp_path.string();

compiler.WriteToFile(file_name);
Expand Down Expand Up @@ -113,6 +115,47 @@ BOOST_AUTO_TEST_CASE(OneSecondaryKey) {
completer_it++;
}
BOOST_CHECK_EQUAL(1, i);

std::filesystem::remove_all(temp_path);
}

/**
 * Round trip for a secondary key dictionary with a JSON value store.
 *
 * Three entries share the primary key "key" and differ only in the "user_id" secondary key
 * (one of them empty). The dictionary is compiled, written to a temporary file, loaded again
 * and each (key, user_id) combination is expected to resolve to its own value.
 */
BOOST_AUTO_TEST_CASE(json) {
  const std::vector<std::tuple<std::string, std::map<std::string, std::string>, std::string>> test_data = {
      {"key", {{"user_id", "a1"}}, "{a:1}"},
      {"key", {{"user_id", "a2"}}, "{a:2}"},
      {"key", {{"user_id", ""}}, "{c:1}"},
  };

  SecondaryKeyDictionaryCompiler<fsa::internal::value_store_t::JSON> compiler(
      {"user_id"}, keyvi::util::parameters_t({{"memory_limit_mb", "10"}}));

  // iterate by const reference: each tuple owns strings and a map, copying them is wasted work
  for (const auto& p : test_data) {
    compiler.Add(std::get<0>(p), std::get<1>(p), std::get<2>(p));
  }
  compiler.Compile();

  std::filesystem::path temp_path = std::filesystem::temp_directory_path();

  temp_path /=
      boost::filesystem::unique_path("secondary-key-dictionary-unit-test-dictionarycompiler-%%%%-%%%%-%%%%-%%%%")
          .string();
  std::string file_name = temp_path.string();

  compiler.WriteToFile(file_name);

  SecondaryKeyDictionary d(file_name);

  // BOOST_REQUIRE instead of BOOST_CHECK: the match is dereferenced right afterwards, so a failed
  // lookup must abort this test case instead of dereferencing an empty match.
  // The expected strings are quoted because that is how the value is reported by GetValueAsString().
  match_t m = d.GetFirst("key", {{"user_id", "a1"}});
  BOOST_REQUIRE(m);
  BOOST_CHECK_EQUAL("\"{a:1}\"", m->GetValueAsString());

  m = d.GetFirst("key", {{"user_id", "a2"}});
  BOOST_REQUIRE(m);
  BOOST_CHECK_EQUAL("\"{a:2}\"", m->GetValueAsString());

  // an empty secondary key value is a valid lookup as well
  m = d.GetFirst("key", {{"user_id", ""}});
  BOOST_REQUIRE(m);
  BOOST_CHECK_EQUAL("\"{c:1}\"", m->GetValueAsString());

  // clean up the temporary dictionary file
  std::filesystem::remove_all(temp_path);
}

BOOST_AUTO_TEST_SUITE_END()
Expand Down
34 changes: 17 additions & 17 deletions python/src/addons/SecondaryKeyDictionary.pyx
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@


def get (self, key, meta, default = None):
if isinstance(key, unicode):
key = key.encode('utf-8')
assert isinstance(key, bytes), 'arg in_0 wrong type'
def get (self, the_key, meta, default = None):
if isinstance(the_key, unicode):
the_key = the_key.encode('utf-8')
assert isinstance(the_key, bytes), 'arg in_0 wrong type'
assert isinstance(meta, dict), 'arg in_1 wrong type'

cdef libcpp_map[libcpp_utf8_string, libcpp_utf8_string] * v1 = new libcpp_map[libcpp_utf8_string, libcpp_utf8_string]()
for key, value in meta.items():
if isinstance(key, unicode):
key = key.encode('utf-8')
if isinstance(value, unicode):
value = value.encode('utf-8')
deref(v1)[ (<libcpp_string>key) ] = (<libcpp_string>value)

cdef shared_ptr[_Match] _r = self.inst.get().GetFirst(<libcpp_string>key, deref(v1))
for _key, _value in meta.items():
if isinstance(_key, unicode):
_key = _key.encode('utf-8')
if isinstance(_value, unicode):
_value = _value.encode('utf-8')
deref(v1)[ (<libcpp_string>_key) ] = (<libcpp_string>_value)

cdef shared_ptr[_Match] _r = self.inst.get().GetFirst(<libcpp_string>the_key, deref(v1))
del(v1)

if _r.get() == nullptr:
Expand All @@ -23,14 +23,14 @@
py_result.inst = _r
return py_result

def contains(self, key, meta):
if isinstance(key, unicode):
key = key.encode('utf-8')
def contains(self, the_key, meta):
if isinstance(the_key, unicode):
the_key = the_key.encode('utf-8')

assert isinstance(key, bytes), 'arg in_0 wrong type'
assert isinstance(the_key, bytes), 'arg in_0 wrong type'
assert isinstance(meta, dict), 'arg in_1 wrong type'

return self.inst.get().Contains(key, meta)
return self.inst.get().Contains(the_key, meta)

# Supports len(): returns whatever GetSize() reports for the object held in self.inst.
def __len__(self):
return self.inst.get().GetSize()
Expand Down
12 changes: 6 additions & 6 deletions python/src/pxds/dictionary_compiler.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ cdef extern from "keyvi/dictionary/dictionary_types.h" namespace "keyvi::diction
cdef cppclass SecondaryKeyCompletionDictionaryCompiler:
SecondaryKeyCompletionDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys) except +
SecondaryKeyCompletionDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] value_store_params) except +
void Add(libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int value) except + # wrap-as:add
void Add(libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int the_value) except + # wrap-as:add
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifest(libcpp_utf8_string) except + # wrap-as:set_manifest
Expand All @@ -90,7 +90,7 @@ cdef extern from "keyvi/dictionary/dictionary_types.h" namespace "keyvi::diction
cdef cppclass SecondaryKeyFloatVectorDictionaryCompiler:
SecondaryKeyFloatVectorDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys) except +
SecondaryKeyFloatVectorDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] value_store_params) except +
void Add(libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, libcpp_vector[float] value) except + # wrap-as:add
void Add(libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, libcpp_vector[float] the_value) except + # wrap-as:add
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifest(libcpp_utf8_string) except + # wrap-as:set_manifest
Expand All @@ -99,7 +99,7 @@ cdef extern from "keyvi/dictionary/dictionary_types.h" namespace "keyvi::diction
cdef cppclass SecondaryKeyIntDictionaryCompiler:
SecondaryKeyIntDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys) except +
SecondaryKeyIntDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] value_store_params) except +
void Add(libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, long value) except + # wrap-as:add
void Add(libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, long the_value) except + # wrap-as:add
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifest(libcpp_utf8_string) except + # wrap-as:set_manifest
Expand All @@ -108,7 +108,7 @@ cdef extern from "keyvi/dictionary/dictionary_types.h" namespace "keyvi::diction
cdef cppclass SecondaryKeyKeyOnlyDictionaryCompiler:
SecondaryKeyKeyOnlyDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys) except +
SecondaryKeyKeyOnlyDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] value_store_params) except +
void Add(libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) except + # wrap-as:add
void Add(libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) except + # wrap-as:add
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifest(libcpp_utf8_string) except + # wrap-as:set_manifest
Expand All @@ -117,7 +117,7 @@ cdef extern from "keyvi/dictionary/dictionary_types.h" namespace "keyvi::diction
cdef cppclass SecondaryKeyJsonDictionaryCompiler:
SecondaryKeyJsonDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys) except +
SecondaryKeyJsonDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] value_store_params) except +
void Add(libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, libcpp_utf8_string value) except + # wrap-as:add
void Add(libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, libcpp_utf8_string the_value) except + # wrap-as:add
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifest(libcpp_utf8_string) except + # wrap-as:set_manifest
Expand All @@ -126,7 +126,7 @@ cdef extern from "keyvi/dictionary/dictionary_types.h" namespace "keyvi::diction
cdef cppclass SecondaryKeyStringDictionaryCompiler:
SecondaryKeyStringDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys) except +
SecondaryKeyStringDictionaryCompiler(libcpp_vector[libcpp_utf8_string] secondary_keys, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] value_store_params) except +
void Add(libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, libcpp_utf8_string value) except + # wrap-as:add
void Add(libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, libcpp_utf8_string the_value) except + # wrap-as:add
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifest(libcpp_utf8_string) except + # wrap-as:set_manifest
Expand Down
26 changes: 13 additions & 13 deletions python/src/pxds/secondary_key_dictionary.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,21 @@ cdef extern from "keyvi/dictionary/secondary_key_dictionary.h" namespace "keyvi:

SecondaryKeyDictionary (libcpp_utf8_string filename) except +
#SecondaryKeyDictionary (libcpp_utf8_string filename, loading_strategy_types) except +
shared_ptr[_Match] GetFirst (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) # wrap-ignore
bool Contains (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) # wrap-ignore
_MatchIteratorPair Get (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) # wrap-as:match
_MatchIteratorPair GetNear (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t minimum_prefix_length) except + # wrap-as:match_near
_MatchIteratorPair GetNear (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t minimum_prefix_length, bool greedy) except + # wrap-as:match_near
_MatchIteratorPair GetFuzzy (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance) except + # wrap-as:match_fuzzy
_MatchIteratorPair GetFuzzy (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance, size_t minimum_exact_prefix) except + # wrap-as:match_fuzzy
_MatchIteratorPair GetPrefixCompletion (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) except + # wrap-as:complete_prefix
shared_ptr[_Match] GetFirst (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) # wrap-ignore
bool Contains (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) # wrap-ignore
_MatchIteratorPair Get (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) # wrap-as:match
_MatchIteratorPair GetNear (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t minimum_prefix_length) except + # wrap-as:match_near
_MatchIteratorPair GetNear (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t minimum_prefix_length, bool greedy) except + # wrap-as:match_near
_MatchIteratorPair GetFuzzy (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance) except + # wrap-as:match_fuzzy
_MatchIteratorPair GetFuzzy (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance, size_t minimum_exact_prefix) except + # wrap-as:match_fuzzy
_MatchIteratorPair GetPrefixCompletion (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) except + # wrap-as:complete_prefix
# wrap-doc:
# Complete the given key to full matches (prefix matching)
# In case the used dictionary supports inner weights, the
# completer traverses the dictionary according to weights,
# otherwise byte-order.

_MatchIteratorPair GetPrefixCompletion (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t top_n) except + # wrap-as:complete_prefix
_MatchIteratorPair GetPrefixCompletion (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t top_n) except + # wrap-as:complete_prefix
# wrap-doc:
# Complete the given key to full matches (prefix matching)
# and return the top n completions.
Expand All @@ -47,14 +47,14 @@ cdef extern from "keyvi/dictionary/secondary_key_dictionary.h" namespace "keyvi:
# and truncate the lists of results.
# Only the number of top completions is guaranteed.

_MatchIteratorPair GetMultiwordCompletion (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) except + # wrap-as:complete_multiword
_MatchIteratorPair GetMultiwordCompletion (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta) except + # wrap-as:complete_multiword
# wrap-doc:
# Complete the given key to full matches after whitespace tokenizing.
# In case the used dictionary supports inner weights, the
# completer traverses the dictionary according to weights,
# otherwise byte-order.

_MatchIteratorPair GetMultiwordCompletion (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t top_n) except + # wrap-as:complete_multiword
_MatchIteratorPair GetMultiwordCompletion (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, size_t top_n) except + # wrap-as:complete_multiword
# wrap-doc:
# Complete the given key to full matches after whitespace tokenizing
# and return the top n completions.
Expand All @@ -68,15 +68,15 @@ cdef extern from "keyvi/dictionary/secondary_key_dictionary.h" namespace "keyvi:
# and truncate the lists of results.
# Only the number of top completions is guaranteed.

_MatchIteratorPair GetFuzzyMultiwordCompletion (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance) except + # wrap-as:complete_fuzzy_multiword
_MatchIteratorPair GetFuzzyMultiwordCompletion (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance) except + # wrap-as:complete_fuzzy_multiword
# wrap-doc:
# Complete the given key to full matches after whitespace tokenizing,
# allowing up to max_edit_distance distance (Levenshtein).
# In case the used dictionary supports inner weights, the
# completer traverses the dictionary according to weights,
# otherwise byte-order.

_MatchIteratorPair GetFuzzyMultiwordCompletion (libcpp_utf8_string key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance, size_t minimum_exact_prefix) except + # wrap-as:complete_fuzzy_multiword
_MatchIteratorPair GetFuzzyMultiwordCompletion (libcpp_utf8_string the_key, libcpp_map[libcpp_utf8_string, libcpp_utf8_string] meta, int32_t max_edit_distance, size_t minimum_exact_prefix) except + # wrap-as:complete_fuzzy_multiword
# wrap-doc:
# Complete the given key to full matches after whitespace tokenizing,
# allowing up to max_edit_distance distance (Levenshtein) except for
Expand Down
2 changes: 2 additions & 0 deletions python/src/py/keyvi/compiler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@
from keyvi._core import KeyOnlyDictionaryCompiler, KeyOnlyDictionaryGenerator, KeyOnlyDictionaryMerger
from keyvi._core import StringDictionaryCompiler, StringDictionaryMerger
from keyvi._core import FloatVectorDictionaryCompiler
from keyvi._core import SecondaryKeyCompletionDictionaryCompiler, SecondaryKeyFloatVectorDictionaryCompiler, SecondaryKeyIntDictionaryCompiler
from keyvi._core import SecondaryKeyKeyOnlyDictionaryCompiler, SecondaryKeyStringDictionaryCompiler, SecondaryKeyJsonDictionaryCompiler
2 changes: 1 addition & 1 deletion python/src/py/keyvi/dictionary/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
limitations under the License.
'''

from keyvi._core import Dictionary
from keyvi._core import Dictionary, SecondaryKeyDictionary
Loading

0 comments on commit c3b90a8

Please sign in to comment.