-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[search] Search in downloader by country names. #14081
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../data/countries_names.txt |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#include "search/countries_names_index.hpp" | ||
|
||
#include "platform/platform.hpp" | ||
|
||
#include "coding/file_reader.hpp" | ||
|
||
#include "base/assert.hpp" | ||
|
||
#include <fstream> | ||
#include <set> | ||
#include <sstream> | ||
|
||
using namespace std; | ||
|
||
namespace search | ||
{ | ||
// Builds a ready-to-query index: first fills |m_countries| from the countries
// names file, then indexes the translations that were read.
CountriesNamesIndex::CountriesNamesIndex()
{
  // The order matters: the index is built from the contents of |m_countries|.
  ReadCountryNamesFromFile(m_countries);
  BuildIndexFromTranslations();
}
|
||
// Fills |results| with the ids of the countries matched by at least one token of |query|.
//
// The query is normalized and split into tokens. Every token is looked up in the index
// with a Levenshtein DFA; the last token is looked up with the prefix modifier of that
// DFA when the query does not end with a delimiter.
// Previous contents of |results| are discarded. Each country is reported at most once.
void CountriesNamesIndex::CollectMatchingCountries(string const & query,
                                                   vector<storage::CountryId> & results)
{
  vector<strings::UniString> tokens;
  search::NormalizeAndTokenizeString(query, tokens);

  // A query that does not end with a delimiter leaves its last token unfinished.
  search::Delimiters delims;
  bool const lastTokenIsPrefix = !query.empty() && !delims(strings::LastUniChar(query));

  // Indices into |m_countries|; the set removes duplicates coming from different tokens.
  set<size_t> matchedIds;
  auto collectId = [&matchedIds](size_t id, bool /* exactMatch */) { matchedIds.insert(id); };

  size_t const numTokens = tokens.size();
  for (size_t tokenIdx = 0; tokenIdx < numTokens; ++tokenIdx)
  {
    bool const isPrefixToken = tokenIdx + 1 == numTokens && lastTokenIsPrefix;
    if (isPrefixToken)
      Retrieve<strings::PrefixDFAModifier<strings::LevenshteinDFA>>(tokens[tokenIdx], collectId);
    else
      Retrieve<strings::LevenshteinDFA>(tokens[tokenIdx], collectId);
  }

  // todo(@m) Do not bother with tf/idf for now.
  results.clear();
  for (auto const countryIdx : matchedIds)
  {
    CHECK_LESS(countryIdx, m_countries.size(), ());
    results.emplace_back(m_countries[countryIdx].m_countryId);
  }
}
|
||
// Reads COUNTRIES_NAMES_FILE through the platform reader and fills |countries| from it.
// Previous contents of |countries| are discarded.
//
// The file is processed line by line after trimming:
//   - a line starting with '[' must end with ']' and opens a new country whose id is
//     the text between the brackets;
//   - a line containing '=' contributes the text after the first '=' as a translation
//     of the most recently opened country;
//   - empty lines and lines without '=' are skipped.
// Translations that appear before the first country are dropped.
void CountriesNamesIndex::ReadCountryNamesFromFile(vector<Country> & countries)
{
  string contents;
  GetPlatform().GetReader(COUNTRIES_NAMES_FILE)->ReadAsString(contents);
  istringstream iss(contents);

  countries.clear();
  string line;
  while (getline(iss, line))
  {
    // Trim before the emptiness check so that whitespace-only lines are skipped
    // here instead of being inspected character by character below.
    strings::Trim(line);
    if (line.empty())
      continue;

    if (line.front() == '[')
    {
      CHECK_EQUAL(line.back(), ']', ());
      countries.push_back({});
      countries.back().m_countryId = line.substr(1, line.size() - 2);
      continue;
    }

    auto const pos = line.find('=');
    if (pos == string::npos)
      continue;

    // Ignore the language code: the language sets differ for StringUtf8Multilang
    // and for the translations used by this class.
    // NOTE(review): a reviewer remarked that after Norwegian and Indonesian were added
    // to StringUtf8Multilang the translation languages became a subset of its languages;
    // the code is still ignored on purpose until a problem is reported.
    auto translation = line.substr(pos + 1);
    strings::Trim(translation);
    // An empty value carries no name to search by, so it is not stored.
    if (translation.empty())
      continue;

    if (!countries.empty())
      countries.back().m_doc.m_translations.push_back(translation);
  }
}
|
||
void CountriesNamesIndex::BuildIndexFromTranslations() | ||
{ | ||
for (size_t i = 0; i < m_countries.size(); ++i) | ||
m_index.Add(i, m_countries[i].m_doc); | ||
} | ||
} // namespace search |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#pragma once | ||
|
||
#include "search/base/mem_search_index.hpp" | ||
#include "search/feature_offset_match.hpp" | ||
|
||
#include "storage/storage_defines.hpp" | ||
|
||
#include "indexer/search_string_utils.hpp" | ||
|
||
#include "base/string_utils.hpp" | ||
|
||
#include <cstddef> | ||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
namespace search | ||
{ | ||
class CountriesNamesIndex | ||
{ | ||
public: | ||
struct Doc | ||
{ | ||
template <typename Fn> | ||
void ForEachToken(Fn && fn) const | ||
{ | ||
for (auto const & s : m_translations) | ||
fn(StringUtf8Multilang::kDefaultCode, NormalizeAndSimplifyString(s)); | ||
} | ||
|
||
std::vector<std::string> m_translations; | ||
}; | ||
|
||
CountriesNamesIndex(); | ||
|
||
void CollectMatchingCountries(std::string const & query, | ||
std::vector<storage::CountryId> & results); | ||
|
||
private: | ||
struct Country | ||
{ | ||
storage::CountryId m_countryId; | ||
Doc m_doc; | ||
}; | ||
|
||
// todo(@m) Almost the same as in bookmarks/processor.hpp. | ||
template <typename DFA, typename Fn> | ||
void Retrieve(strings::UniString const & s, Fn && fn) const | ||
{ | ||
SearchTrieRequest<DFA> request; | ||
request.m_names.emplace_back(BuildLevenshteinDFA(s)); | ||
request.m_langs.insert(StringUtf8Multilang::kDefaultCode); | ||
|
||
MatchFeaturesInTrie( | ||
request, m_index.GetRootIterator(), [](size_t id) { return true; } /* filter */, | ||
std::forward<Fn>(fn)); | ||
} | ||
|
||
void ReadCountryNamesFromFile(std::vector<Country> & countries); | ||
void BuildIndexFromTranslations(); | ||
|
||
std::vector<Country> m_countries; | ||
search_base::MemSearchIndex<size_t> m_index; | ||
}; | ||
} // namespace search |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,26 +2,40 @@ | |
|
||
#include "search/result.hpp" | ||
|
||
#include "storage/country_info_getter.hpp" | ||
#include "storage/storage.hpp" | ||
|
||
#include "editor/editable_data_source.hpp" | ||
|
||
#include "indexer/data_source.hpp" | ||
|
||
#include "storage/country_info_getter.hpp" | ||
#include "storage/storage.hpp" | ||
|
||
#include "base/assert.hpp" | ||
#include "base/logging.hpp" | ||
#include "base/string_utils.hpp" | ||
|
||
#include <set> | ||
#include <string> | ||
#include <utility> | ||
|
||
namespace | ||
{ | ||
// Resolves |name| to an id for which |storage.IsInnerNode()| holds.
//
// Returns true and leaves |name| intact when |name| itself is such an id.
// Otherwise |name| is looked up among the country name synonyms of |storage|:
// if the synonym's target is an inner node, |name| is replaced by it and true
// is returned. In all other cases returns false and |name| is not modified.
bool GetGroupCountryId(storage::Storage const & storage, std::string & name)
{
  auto const & synonyms = storage.GetCountryNameSynonyms();

  if (storage.IsInnerNode(name))
    return true;

  auto const synonymIt = synonyms.find(name);
  bool const resolved = synonymIt != synonyms.end() && storage.IsInnerNode(synonymIt->second);
  if (!resolved)
    return false;

  name = synonymIt->second;
  return true;
}
|
||
bool GetGroupCountryIdFromFeature(storage::Storage const & storage, FeatureType & ft, | ||
std::string & name) | ||
{ | ||
auto const & synonyms = storage.GetCountryNameSynonyms(); | ||
int8_t const langIndices[] = {StringUtf8Multilang::kEnglishCode, | ||
StringUtf8Multilang::kDefaultCode, | ||
StringUtf8Multilang::kInternationalCode}; | ||
|
@@ -30,15 +44,8 @@ bool GetGroupCountryIdFromFeature(storage::Storage const & storage, FeatureType | |
{ | ||
if (!ft.GetName(langIndex, name)) | ||
continue; | ||
if (storage.IsInnerNode(name)) | ||
if (GetGroupCountryId(storage, name)) | ||
return true; | ||
auto const it = synonyms.find(name); | ||
if (it == synonyms.end()) | ||
continue; | ||
if (!storage.IsInnerNode(it->second)) | ||
continue; | ||
name = it->second; | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
@@ -66,6 +73,22 @@ void DownloaderSearchCallback::operator()(search::Results const & results) | |
|
||
for (auto const & result : results) | ||
{ | ||
if (result.GetResultType() == search::Result::Type::DownloaderEntry) | ||
{ | ||
std::string groupFeatureName = result.GetCountryId(); | ||
if (!GetGroupCountryId(m_storage, groupFeatureName)) | ||
continue; | ||
|
||
storage::DownloaderSearchResult downloaderResult(groupFeatureName, | ||
result.GetString() /* m_matchedName */); | ||
if (uniqueResults.find(downloaderResult) == uniqueResults.end()) | ||
{ | ||
uniqueResults.insert(downloaderResult); | ||
downloaderSearchResults.m_results.push_back(downloaderResult); | ||
} | ||
continue; | ||
} | ||
|
||
if (!result.HasPoint()) | ||
continue; | ||
|
||
|
@@ -98,21 +121,25 @@ void DownloaderSearchCallback::operator()(search::Results const & results) | |
} | ||
} | ||
} | ||
auto const & mercator = result.GetFeatureCenter(); | ||
storage::CountryId const & countryId = m_infoGetter.GetRegionCountryId(mercator); | ||
if (countryId == storage::kInvalidCountryId) | ||
continue; | ||
|
||
storage::DownloaderSearchResult downloaderResult(countryId, | ||
result.GetString() /* m_matchedName */); | ||
if (uniqueResults.find(downloaderResult) == uniqueResults.end()) | ||
if (result.GetResultType() == search::Result::Type::LatLon) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. else вместо There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. или наоборот |
||
{ | ||
uniqueResults.insert(downloaderResult); | ||
downloaderSearchResults.m_results.push_back(downloaderResult); | ||
auto const & mercator = result.GetFeatureCenter(); | ||
storage::CountryId const & countryId = m_infoGetter.GetRegionCountryId(mercator); | ||
if (countryId == storage::kInvalidCountryId) | ||
continue; | ||
|
||
storage::DownloaderSearchResult downloaderResult(countryId, | ||
result.GetString() /* m_matchedName */); | ||
if (uniqueResults.find(downloaderResult) == uniqueResults.end()) | ||
{ | ||
uniqueResults.insert(downloaderResult); | ||
downloaderSearchResults.m_results.push_back(downloaderResult); | ||
} | ||
continue; | ||
} | ||
} | ||
|
||
downloaderSearchResults.m_query = m_params.m_query; | ||
downloaderSearchResults.m_endMarker = results.IsEndMarker(); | ||
|
||
if (m_params.m_onResults) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
можно без tf/idf, но хотя бы количество заматченных токенов учитывать,
например:
если я правильно поняла, сейчас можно набрать в поиске "северная осетия", а в результатах ее не увидеть
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
но независимо от реализации я за то, чтобы это было не в этом реквесте