Skip to content

Commit

Permalink
Handle punctuations correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Apr 21, 2024
1 parent 6f44b88 commit c06bd79
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 7 deletions.
38 changes: 33 additions & 5 deletions sherpa-onnx/csrc/jieba-lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ class JiebaLexicon::Impl {
public:
Impl(const std::string &lexicon, const std::string &tokens,
const std::string &dict_dir,
const OfflineTtsVitsModelMetaData &meta_data)
: meta_data_(meta_data) {
const OfflineTtsVitsModelMetaData &meta_data, bool debug)
: meta_data_(meta_data), debug_(debug) {
std::string dict = dict_dir + "/jieba.dict.utf8";
std::string hmm = dict_dir + "/hmm_model.utf8";
std::string user_dict = dict_dir + "/user.dict.utf8";
Expand Down Expand Up @@ -73,6 +73,20 @@ class JiebaLexicon::Impl {
bool is_hmm = true;
jieba_->Cut(text, words, is_hmm);

if (debug_) {
SHERPA_ONNX_LOGE("input text: %s", text.c_str());
SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str());

std::ostringstream os;
std::string sep = "";
for (const auto &w : words) {
os << sep << w;
sep = "_";
}

SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str());
}

std::vector<std::vector<int64_t>> ans;
std::vector<int64_t> this_sentence;

Expand Down Expand Up @@ -122,7 +136,18 @@ class JiebaLexicon::Impl {
return ans;
}

void InitTokens(std::istream &is) { token2id_ = ReadTokens(is); }
// Loads the token table from `is` into token2id_ and then registers
// full-width punctuation aliases.
//
// For each (ASCII, full-width) pair below: if the token table contains the
// ASCII mark but not its full-width counterpart, the full-width mark is mapped
// to the same ID as the ASCII one. Entries already present in the token table
// are never overwritten.
//
// @param is  Stream handed to ReadTokens(); its format is whatever
//            ReadTokens() expects (not visible from here).
void InitTokens(std::istream &is) {
  token2id_ = ReadTokens(is);

  // {ASCII punctuation, full-width counterpart}.
  // The second element must be the full-width character, NOT an empty string:
  // with "" the loop below would alias the empty token to "," once and then
  // skip every remaining pair because "" would already be present.
  // NOTE(review): these literals assume this source file is compiled as
  // UTF-8 -- confirm the toolchain's source charset.
  const std::vector<std::pair<std::string, std::string>> puncts = {
      {",", "，"}, {".", "。"}, {"!", "！"}, {"?", "？"}};

  for (const auto &p : puncts) {
    // Alias only when the ASCII mark has an ID and the full-width form does
    // not already have its own.
    if (token2id_.count(p.first) && !token2id_.count(p.second)) {
      token2id_[p.second] = token2id_[p.first];
    }
  }
}

void InitLexicon(std::istream &is) {
std::string word;
Expand Down Expand Up @@ -170,15 +195,18 @@ class JiebaLexicon::Impl {
OfflineTtsVitsModelMetaData meta_data_;

std::unique_ptr<cppjieba::Jieba> jieba_;
bool debug_ = false;
};

JiebaLexicon::~JiebaLexicon() = default;

JiebaLexicon::JiebaLexicon(const std::string &lexicon,
const std::string &tokens,
const std::string &dict_dir,
const OfflineTtsVitsModelMetaData &meta_data)
: impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, meta_data)) {}
const OfflineTtsVitsModelMetaData &meta_data,
bool debug)
: impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, meta_data,
debug)) {}

std::vector<std::vector<int64_t>> JiebaLexicon::ConvertTextToTokenIds(
const std::string &text, const std::string &unused_voice /*= ""*/) const {
Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/jieba-lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class JiebaLexicon : public OfflineTtsFrontend {
~JiebaLexicon() override;
JiebaLexicon(const std::string &lexicon, const std::string &tokens,
const std::string &dict_dir,
const OfflineTtsVitsModelMetaData &meta_data);
const OfflineTtsVitsModelMetaData &meta_data, bool debug);

#if __ANDROID_API__ >= 9
JiebaLexicon(AAssetManager *mgr, const std::string &lexicon,
Expand Down
3 changes: 2 additions & 1 deletion sherpa-onnx/csrc/offline-tts-vits-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
} else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) {
frontend_ = std::make_unique<JiebaLexicon>(
config_.model.vits.lexicon, config_.model.vits.tokens,
config_.model.vits.dict_dir, model_->GetMetaData());
config_.model.vits.dict_dir, model_->GetMetaData(),
config_.model.debug);
} else if ((meta_data.is_piper || meta_data.is_coqui ||
meta_data.is_icefall) &&
!config_.model.vits.data_dir.empty()) {
Expand Down

0 comments on commit c06bd79

Please sign in to comment.