Skip to content

Commit

Permalink
add secondary key dictionaries (#303)
Browse files Browse the repository at this point in the history
Secondary key dictionaries match a set of keys before the real matching. Those secondary keys can be arbitrary strings, e.g. a user, account, or tenant id.

At compile time, a list of secondary keys must be provided at construction, e.g. ["region", "account_id", "user_id"]. The order defines the matching order. For every entry, as well as for every match operation, an unordered map of keys and values must be provided:

{
    "account_id": "xyz", 
    "region": "eu-west",
    "user_id": "abcd"
}

Example use:

dictionary.complete("sie", {"account_id": "xyz", "region": "eu-west", "user_id": "abcd" })

In other words: the APIs are the same as for ordinary dictionaries, extended by a map of secondary keys that is passed next to the primary key.
  • Loading branch information
hendrikmuhs authored Jul 23, 2024
1 parent 57243dd commit 0f0ffa4
Show file tree
Hide file tree
Showing 30 changed files with 1,516 additions and 331 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${CMAKE_CURRENT_SOURCE_DIR}/cmake_mo
# configure C++17
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.15" CACHE STRING "Minimum OS X deployment version")

# keyvi specific compile options, definitions and flags
set (_KEVYI_COMPILE_OPTIONS "-Wall")
Expand All @@ -38,7 +39,6 @@ set (_OS_LIBRARIES "")
# OSX specifics
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set (_KEYVI_COMPILE_DEFINITIONS "${_KEYVI_COMPILE_DEFINITIONS} OS_MACOSX")
set (_KEYVI_CXX_FLAGS "${_KEYVI_CXX_FLAGS} -mmacosx-version-min=10.9")
endif()

# build type specific settings
Expand Down
316 changes: 185 additions & 131 deletions keyvi/include/keyvi/dictionary/dictionary.h

Large diffs are not rendered by default.

21 changes: 19 additions & 2 deletions keyvi/include/keyvi/dictionary/dictionary_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ template <keyvi::dictionary::fsa::internal::value_store_t ValueStoreType = fsa::
class DictionaryCompiler final {
public:
using ValueStoreT = typename fsa::internal::ValueStoreComponents<ValueStoreType>::value_store_writer_t;
using callback_t = std::function<void(size_t, size_t, void*)>;

private:
using callback_t = std::function<void(size_t, size_t, void*)>;
using GeneratorAdapter = fsa::GeneratorAdapterInterface<typename ValueStoreT::value_t>;

public:
Expand Down Expand Up @@ -138,10 +138,11 @@ class DictionaryCompiler final {
}

generator_->SetManifest(manifest_);
generator_->SetSpecializedDictionaryProperties(specialized_dictionary_properties_);
}

/**
* Set a custom manifest to be embedded into the index file.
* Set a custom manifest to be embedded into the keyvi file.
*
* @param manifest as string
*/
Expand All @@ -155,6 +156,21 @@ class DictionaryCompiler final {
}
}

/**
 * Store the specialized dictionary properties for this compiler.
 *
 * The value is always kept in the compiler; if a generator object already
 * exists it is handed over to it immediately as well.
 *
 * @param specialized_dictionary_properties properties as string
 */
void SetSpecializedDictionaryProperties(const std::string& specialized_dictionary_properties) {
  specialized_dictionary_properties_ = specialized_dictionary_properties;

  // no generator yet: the value stays cached in the member until one is created
  if (!generator_) {
    return;
  }

  generator_->SetSpecializedDictionaryProperties(specialized_dictionary_properties_);
}

void Write(std::ostream& stream) {
if (!generator_) {
throw compiler_exception("not compiled yet");
Expand All @@ -180,6 +196,7 @@ class DictionaryCompiler final {
ValueStoreT* value_store_;
typename GeneratorAdapter::AdapterPtr generator_;
std::string manifest_;
std::string specialized_dictionary_properties_;
size_t memory_limit_;
size_t memory_estimate_ = 0;
size_t chunk_ = 0;
Expand Down
51 changes: 41 additions & 10 deletions keyvi/include/keyvi/dictionary/dictionary_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ namespace dictionary {
static const char START_STATE_PROPERTY[] = "start_state";
static const char VERSION_PROPERTY[] = "version";
static const char MANIFEST_PROPERTY[] = "manifest";
static const char SPECIALIZED_DICTIONARY_PROPERTY[] = "specialized_dict_properties";
static const char NUMBER_OF_KEYS_PROPERTY[] = "number_of_keys";
static const char VALUE_STORE_TYPE_PROPERTY[] = "value_store_type";
static const char NUMBER_OF_STATES_PROPERTY[] = "number_of_states";
Expand All @@ -66,7 +67,8 @@ class DictionaryProperties {
const uint64_t number_of_keys, const uint64_t number_of_states,
const fsa::internal::value_store_t value_store_type, uint64_t sparse_array_version,
const size_t sparse_array_size, const size_t persistence_offset, const size_t transitions_offset,
const fsa::internal::ValueStoreProperties& value_store_properties, const std::string& manifest) {
const fsa::internal::ValueStoreProperties& value_store_properties, const std::string& manifest,
const std::string& specialized_dictionary_properties) {
file_name_ = file_name;
version_ = version;
start_state_ = start_state;
Expand All @@ -79,14 +81,16 @@ class DictionaryProperties {
transitions_offset_ = transitions_offset;
value_store_properties_ = value_store_properties;
manifest_ = manifest;
specialized_dictionary_properties_ = specialized_dictionary_properties;
}

/**
* Simplified constructor for writing properties, e.g. for dictionary compilation.
*/
DictionaryProperties(const uint64_t version, const uint64_t start_state, const uint64_t number_of_keys,
const uint64_t number_of_states, const fsa::internal::value_store_t value_store_type,
uint64_t sparse_array_version, const size_t sparse_array_size, std::string manifest) {
uint64_t sparse_array_version, const size_t sparse_array_size, const std::string& manifest,
const std::string& specialized_dictionary_properties) {
version_ = version;
start_state_ = start_state;
number_of_keys_ = number_of_keys;
Expand All @@ -95,11 +99,13 @@ class DictionaryProperties {
sparse_array_version_ = sparse_array_version;
sparse_array_size_ = sparse_array_size;
manifest_ = manifest;
specialized_dictionary_properties_ = specialized_dictionary_properties;
}

static DictionaryProperties FromFile(const std::string& file_name) {
static DictionaryProperties FromFile(const std::string& file_name, const size_t offset = 0) {
std::ifstream file_stream(file_name, std::ios::binary);

file_stream.seekg(offset);
if (!file_stream.good()) {
throw std::invalid_argument("dictionary file not found");
}
Expand Down Expand Up @@ -130,9 +136,16 @@ class DictionaryProperties {

size_t GetTransitionsSize() const { return sparse_array_size_ * 2; }

/**
 * Get the end offset described by these properties.
 *
 * @return value store offset plus value store size if the value store offset
 *         is non-zero, otherwise transitions offset plus transitions size
 */
size_t GetEndOffset() const {
  const auto value_store_offset = value_store_properties_.GetOffset();

  // a zero offset means there is no value store position to build on
  if (!value_store_offset) {
    return GetTransitionsOffset() + GetTransitionsSize();
  }

  return value_store_offset + value_store_properties_.GetSize();
}

const fsa::internal::ValueStoreProperties& GetValueStoreProperties() const { return value_store_properties_; }

const std::string GetManifest() const { return manifest_; }
const std::string& GetManifest() const { return manifest_; }

const std::string& GetSpecializedDictionaryProperties() const { return specialized_dictionary_properties_; }

std::string GetStatistics() const {
rapidjson::StringBuffer string_buffer;
Expand Down Expand Up @@ -191,8 +204,15 @@ class DictionaryProperties {
writer.Key(NUMBER_OF_STATES_PROPERTY);
writer.String(std::to_string(number_of_states_));
// manifest
writer.Key(MANIFEST_PROPERTY);
writer.String(manifest_);
if (!manifest_.empty()) {
writer.Key(MANIFEST_PROPERTY);
writer.String(manifest_);
}
// special properties
if (!specialized_dictionary_properties_.empty()) {
writer.Key(SPECIALIZED_DICTIONARY_PROPERTY);
writer.String(specialized_dictionary_properties_);
}
writer.EndObject();
}

Expand Down Expand Up @@ -232,6 +252,7 @@ class DictionaryProperties {
size_t transitions_offset_ = 0;
fsa::internal::ValueStoreProperties value_store_properties_;
std::string manifest_;
std::string specialized_dictionary_properties_;

static DictionaryProperties ReadJsonFormat(const std::string& file_name, std::ifstream& file_stream) {
rapidjson::Document automata_properties;
Expand Down Expand Up @@ -265,6 +286,14 @@ class DictionaryProperties {
}
}

std::string specialized_dictionary_properties;
if (automata_properties.HasMember(SPECIALIZED_DICTIONARY_PROPERTY)) {
if (automata_properties[SPECIALIZED_DICTIONARY_PROPERTY].IsString()) {
// specialized dictionary properties should be a string, if not ignore them
specialized_dictionary_properties = automata_properties[SPECIALIZED_DICTIONARY_PROPERTY].GetString();
}
}

rapidjson::Document sparse_array_properties;
keyvi::util::SerializationUtils::ReadLengthPrefixedJsonRecord(file_stream, &sparse_array_properties);

Expand All @@ -283,22 +312,24 @@ class DictionaryProperties {
size_t transitions_offset = persistence_offset + sparse_array_size;

// check for file truncation
file_stream.seekg((size_t)file_stream.tellg() + sparse_array_size + bucket_size * sparse_array_size - 1);
file_stream.seekg(static_cast<size_t>(file_stream.tellg()) + sparse_array_size + bucket_size * sparse_array_size -
1);
if (file_stream.peek() == EOF) {
throw std::invalid_argument("file is corrupt(truncated)");
}

file_stream.get();

fsa::internal::ValueStoreProperties value_store_properties;
// not all value stores have properties
if (file_stream.peek() != EOF) {

// not all value stores have persisted properties
if (fsa::internal::ValueStoreHasPersistedProperties(value_store_type)) {
value_store_properties = fsa::internal::ValueStoreProperties::FromJson(file_stream);
}

return DictionaryProperties(file_name, version, start_state, number_of_keys, number_of_states, value_store_type,
sparse_array_version, sparse_array_size, persistence_offset, transitions_offset,
value_store_properties, manifest);
value_store_properties, manifest, specialized_dictionary_properties);
}
};

Expand Down
18 changes: 18 additions & 0 deletions keyvi/include/keyvi/dictionary/dictionary_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "keyvi/dictionary/fsa/internal/json_value_store.h"
#include "keyvi/dictionary/fsa/internal/sparse_array_persistence.h"
#include "keyvi/dictionary/fsa/internal/string_value_store.h"
#include "keyvi/dictionary/secondary_key_dictionary_compiler.h"

namespace keyvi {
namespace dictionary {
Expand Down Expand Up @@ -69,6 +70,23 @@ using KeyOnlyDictionaryMerger = keyvi::dictionary::DictionaryMerger<dictionary_t

using JsonDictionaryIndexCompiler = keyvi::dictionary::DictionaryIndexCompiler<dictionary_type_t::JSON>;

// secondary key types
using SecondaryKeyCompletionDictionaryCompiler =
keyvi::dictionary::SecondaryKeyDictionaryCompiler<dictionary_type_t::INT_WITH_WEIGHTS>;

using SecondaryKeyFloatVectorDictionaryCompiler =
keyvi::dictionary::SecondaryKeyDictionaryCompiler<dictionary_type_t::FLOAT_VECTOR>;

using SecondaryKeyIntDictionaryCompiler = keyvi::dictionary::SecondaryKeyDictionaryCompiler<dictionary_type_t::INT>;

using SecondaryKeyKeyOnlyDictionaryCompiler =
keyvi::dictionary::SecondaryKeyDictionaryCompiler<dictionary_type_t::KEY_ONLY>;

using SecondaryKeyJsonDictionaryCompiler = keyvi::dictionary::SecondaryKeyDictionaryCompiler<dictionary_type_t::JSON>;

using SecondaryKeyStringDictionaryCompiler =
keyvi::dictionary::SecondaryKeyDictionaryCompiler<dictionary_type_t::STRING>;

#ifndef KEYVI_REMOVE_DEPRECATED
using IntDictionaryCompilerSmallData = IntDictionaryCompiler;

Expand Down
17 changes: 16 additions & 1 deletion keyvi/include/keyvi/dictionary/fsa/automata.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@

namespace keyvi {
namespace dictionary {

// for friending
class SecondaryKeyDictionary;

namespace fsa {

/**
Expand Down Expand Up @@ -77,6 +81,11 @@ class Automata final {
: Automata(std::make_shared<DictionaryProperties>(DictionaryProperties::FromFile(file_name)), loading_strategy,
true) {}

/**
 * Construct an automata from a file, reading the dictionary properties
 * starting at the given offset.
 *
 * The offset is passed through to DictionaryProperties::FromFile; the
 * resulting properties are handed to the private constructor with
 * load_value_store set to true.
 *
 * @param file_name path of the file to load
 * @param offset position in the file passed to DictionaryProperties::FromFile
 * @param loading_strategy loading strategy, defaults to lazy
 */
explicit Automata(const std::string& file_name, const size_t offset,
loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: Automata(std::make_shared<DictionaryProperties>(DictionaryProperties::FromFile(file_name, offset)),
loading_strategy, true) {}

private:
explicit Automata(const dictionary_properties_t& dictionary_properties, loading_strategy_types loading_strategy,
const bool load_value_store)
Expand Down Expand Up @@ -389,7 +398,7 @@ class Automata final {
return dictionary_properties_->GetStatistics();
}

std::string GetManifest() const {
const std::string& GetManifest() const {
return dictionary_properties_->GetManifest();
}

Expand Down Expand Up @@ -450,6 +459,12 @@ class Automata final {
TRACE("Compact Transition after resolve %d", resolved_ptr);
return resolved_ptr;
}

friend class keyvi::dictionary::SecondaryKeyDictionary;

/**
 * Access the dictionary properties held by this automata.
 *
 * NOTE(review): declared next to the SecondaryKeyDictionary friend
 * declaration, presumably for use by that class only - confirm.
 *
 * @return reference to the stored dictionary properties
 */
const dictionary_properties_t& GetDictionaryProperties() const {
return dictionary_properties_;
}
};

// shared pointer
Expand Down
8 changes: 7 additions & 1 deletion keyvi/include/keyvi/dictionary/fsa/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,8 @@ class Generator final {

keyvi::dictionary::DictionaryProperties p(KEYVI_FILE_VERSION_CURRENT, start_state_, number_of_keys_added_,
number_of_states_, value_store_->GetValueStoreType(),
persistence_->GetVersion(), persistence_->GetSize(), manifest_);
persistence_->GetVersion(), persistence_->GetSize(), manifest_,
specialized_dictionary_properties_);
p.WriteAsJsonV2(stream);

// write data from persistence
Expand All @@ -323,6 +324,10 @@ class Generator final {
*/
inline void SetManifest(const std::string& manifest) { manifest_ = manifest; }

/**
 * Set the specialized dictionary properties.
 *
 * The string is stored in the generator and passed to DictionaryProperties
 * together with the manifest when the dictionary is written.
 *
 * @param specialized_dictionary_properties properties as string
 */
inline void SetSpecializedDictionaryProperties(const std::string& specialized_dictionary_properties) {
specialized_dictionary_properties_ = specialized_dictionary_properties;
}

private:
size_t memory_limit_;
keyvi::util::parameters_t params_;
Expand All @@ -337,6 +342,7 @@ class Generator final {
OffsetTypeT start_state_ = 0;
uint64_t number_of_states_ = 0;
std::string manifest_;
std::string specialized_dictionary_properties_;
bool minimize_ = true;

inline void FeedStack(const size_t start, const std::string& key) {
Expand Down
5 changes: 5 additions & 0 deletions keyvi/include/keyvi/dictionary/fsa/generator_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class GeneratorAdapterInterface {
virtual void Write(std::ostream& stream) {}
virtual void WriteToFile(const std::string& filename) {}
virtual void SetManifest(const std::string& manifest) {}
virtual void SetSpecializedDictionaryProperties(const std::string& specialized_dictionary_properties) {}

virtual ~GeneratorAdapterInterface() {}
};
Expand Down Expand Up @@ -85,6 +86,10 @@ class GeneratorAdapter final : public GeneratorAdapterInterface<typename ValueSt

void SetManifest(const std::string& manifest) { generator_.SetManifest(manifest); }

// Forwards the specialized dictionary properties to the wrapped generator.
void SetSpecializedDictionaryProperties(const std::string& specialized_dictionary_properties) {
generator_.SetSpecializedDictionaryProperties(specialized_dictionary_properties);
}

private:
Generator<PersistenceT, ValueStoreT, OffsetTypeT, HashCodeTypeT> generator_;
};
Expand Down
3 changes: 3 additions & 0 deletions keyvi/include/keyvi/dictionary/fsa/internal/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,7 @@ static const char VECTOR_SIZE_KEY[] = "vector_size";
static const char MERGE_MODE[] = "merge_mode";
static const char MERGE_APPEND[] = "append";

// constants for specialized dictionaries
static const char SECONDARY_KEY_DICT_KEYS_PROPERTY[] = "secondary_keys";

#endif // KEYVI_DICTIONARY_FSA_INTERNAL_CONSTANTS_H_
20 changes: 20 additions & 0 deletions keyvi/include/keyvi/dictionary/fsa/internal/value_store_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,26 @@ enum class value_store_t {
FLOAT_VECTOR = 7, //!< FloatVectorValueStore
};

/**
 * Whether the given value store has persisted value store properties.
 *
 * @param type the value store type to check
 * @return true for STRING, JSON and FLOAT_VECTOR, false for KEY_ONLY, INT and
 *         INT_WITH_WEIGHTS
 * @throws std::invalid_argument for JSON_DEPRECATED and for any other value
 */
inline bool ValueStoreHasPersistedProperties(const value_store_t type) {
  // the retired store type is rejected with its own message
  if (type == value_store_t::JSON_DEPRECATED) {
    throw std::invalid_argument("Deprecated Value Storage type");
  }

  const bool with_properties =
      type == value_store_t::STRING || type == value_store_t::JSON || type == value_store_t::FLOAT_VECTOR;
  if (with_properties) {
    return true;
  }

  const bool without_properties =
      type == value_store_t::KEY_ONLY || type == value_store_t::INT || type == value_store_t::INT_WITH_WEIGHTS;
  if (without_properties) {
    return false;
  }

  // anything else is not a value store type this function knows about
  throw std::invalid_argument("Unknown Value Storage type");
}

} /* namespace internal */
} /* namespace fsa */
} /* namespace dictionary */
Expand Down
Loading

0 comments on commit 0f0ffa4

Please sign in to comment.