From 264f0f8072ccfd952cf090ebc2777a0195fb9e96 Mon Sep 17 00:00:00 2001 From: frawamudi Date: Mon, 16 Jan 2023 20:43:15 +0100 Subject: [PATCH 1/5] added test case for array of string and decimal paramaters --- src/darts.h | 280 +++++++++++++++++++++++------------------------- src/viterbi.cpp | 118 +++++++++----------- 2 files changed, 184 insertions(+), 214 deletions(-) diff --git a/src/darts.h b/src/darts.h index 91b2eae..4efb94c 100644 --- a/src/darts.h +++ b/src/darts.h @@ -8,72 +8,82 @@ #define DARTS_H_ #define DARTS_VERSION "0.31" -#include -#include #include +#include +#include #ifdef HAVE_ZLIB_H namespace zlib { #include } -#define SH(p)((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8)) -#define LG(p)((unsigned long)(SH(p)) |((unsigned long)(SH((p)+2)) << 16)) +#define SH(p) \ + ((unsigned short)(unsigned char)((p)[0]) | \ + ((unsigned short)(unsigned char)((p)[1]) << 8)) +#define LG(p) ((unsigned long)(SH(p)) | ((unsigned long)(SH((p) + 2)) << 16)) #endif namespace MeCab { namespace Darts { -template inline T _max(T x, T y) { return(x > y) ? x : y; } -template inline T* _resize(T* ptr, size_t n, size_t l, T v) { +template +inline T _max(T x, T y) { + return (x > y) ? x : y; +} +template +inline T *_resize(T *ptr, size_t n, size_t l, T v) { T *tmp = new T[l]; for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i]; for (size_t i = n; i < l; ++i) tmp[i] = v; - delete [] ptr; + delete[] ptr; return tmp; } template class Length { - public: size_t operator()(const T *key) const - { size_t i; for (i = 0; key[i] != (T)0; ++i) {} return i; } + public: + size_t operator()(const T *key) const { + size_t i; + for (i = 0; key[i] != (T)0; ++i) { + } + return i; + } }; -template <> class Length { - public: size_t operator()(const char *key) const - { return std::strlen(key); } +template <> +class Length { + public: + size_t operator()(const char *key) const { return std::strlen(key); } }; -template > +template > class DoubleArrayImpl { private: - struct node_t { array_u_type_ code; - size_t depth; - size_t left; - size_t right; + size_t depth; + size_t left; + size_t right; }; struct unit_t { - array_type_ base; + array_type_ base; array_u_type_ check; }; - unit_t *array_; + unit_t *array_; unsigned char *used_; - size_t size_; - size_t alloc_size_; - node_type_ **key_; - size_t key_size_; - size_t *length_; - array_type_ *value_; - size_t progress_; - size_t next_check_pos_; - bool no_delete_; - int error_; + size_t size_; + size_t alloc_size_; + node_type_ **key_; + size_t key_size_; + size_t *length_; + array_type_ *value_; + size_t progress_; + size_t next_check_pos_; + bool no_delete_; + int error_; int (*progress_func_)(size_t, size_t); size_t resize(const size_t new_size) { @@ -81,13 +91,13 @@ class DoubleArrayImpl { tmp.base = 0; tmp.check = 0; array_ = _resize(array_, alloc_size_, new_size, tmp); - used_ = _resize(used_, alloc_size_, new_size, - static_cast(0)); + used_ = + _resize(used_, alloc_size_, new_size, static_cast(0)); alloc_size_ = new_size; return new_size; } - size_t fetch(const node_t &parent, std::vector &siblings) { + size_t fetch(const node_t &parent, std::vector &siblings) { if (error_ < 0) return 0; array_u_type_ prev = 0; @@ -110,9 +120,9 @@ class DoubleArrayImpl { if (cur != prev || siblings.empty()) { node_t tmp_node; tmp_node.depth = parent.depth + 1; - tmp_node.code = cur; - tmp_node.left = i; - if (!siblings.empty()) siblings[siblings.size()-1].right = i; + tmp_node.code = cur; + tmp_node.left = i; + if (!siblings.empty()) siblings[siblings.size() - 1].right = i; siblings.push_back(tmp_node); } @@ -120,24 +130,23 @@ class DoubleArrayImpl { prev = cur; } - if (!siblings.empty()) - siblings[siblings.size()-1].right = parent.right; + if (!siblings.empty()) siblings[siblings.size() - 1].right = parent.right; return siblings.size(); } - size_t insert(const std::vector &siblings) { + size_t insert(const std::vector &siblings) { if (error_ < 0) return 0; size_t begin = 0; - size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1; + size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1; size_t nonzero_num = 0; - int first = 0; + int first = 0; if (alloc_size_ <= pos) resize(pos + 1); while (true) { - next: + next: ++pos; if (alloc_size_ <= pos) resize(pos + 1); @@ -151,7 +160,7 @@ class DoubleArrayImpl { } begin = pos - siblings[0].code; - if (alloc_size_ <= (begin + siblings[siblings.size()-1].code)) + if (alloc_size_ <= (begin + siblings[siblings.size() - 1].code)) resize(static_cast(alloc_size_ * _max(1.05, 1.0 * key_size_ / progress_))); @@ -168,33 +177,31 @@ class DoubleArrayImpl { // 'next_check_pos' and 'check' is greater than some constant // value(e.g. 0.9), // new 'next_check_pos' index is written by 'check'. - if (1.0 * nonzero_num/(pos - next_check_pos_ + 1) >= 0.95) + if (1.0 * nonzero_num / (pos - next_check_pos_ + 1) >= 0.95) next_check_pos_ = pos; used_[begin] = 1; - size_ = _max(size_, - begin + - static_cast(siblings[siblings.size() - 1].code + 1)); + size_ = _max(size_, begin + static_cast( + siblings[siblings.size() - 1].code + 1)); for (size_t i = 0; i < siblings.size(); ++i) array_[begin + siblings[i].code].check = begin; for (size_t i = 0; i < siblings.size(); ++i) { - std::vector new_siblings; + std::vector new_siblings; if (!fetch(siblings[i], new_siblings)) { array_[begin + siblings[i].code].base = - value_ ? - static_cast(-value_[siblings[i].left]-1) : - static_cast(-siblings[i].left-1); + value_ ? static_cast(-value_[siblings[i].left] - 1) + : static_cast(-siblings[i].left - 1); - if (value_ && (array_type_)(-value_[siblings[i].left]-1) >= 0) { + if (value_ && (array_type_)(-value_[siblings[i].left] - 1) >= 0) { error_ = -2; return 0; } ++progress_; - if (progress_func_)(*progress_func_)(progress_, key_size_); + if (progress_func_) (*progress_func_)(progress_, key_size_); } else { size_t h = insert(new_siblings); @@ -206,26 +213,27 @@ class DoubleArrayImpl { } public: - - typedef array_type_ value_type; - typedef node_type_ key_type; - typedef array_type_ result_type; // for compatibility + typedef array_type_ value_type; + typedef node_type_ key_type; + typedef array_type_ result_type; // for compatibility struct result_pair_type { value_type value; - size_t length; + size_t length; }; - explicit DoubleArrayImpl(): array_(0), used_(0), - size_(0), alloc_size_(0), - no_delete_(0), error_(0) {} + explicit DoubleArrayImpl() + : array_(0), + used_(0), + size_(0), + alloc_size_(0), + no_delete_(0), + error_(0) {} ~DoubleArrayImpl() { clear(); } - void set_result(value_type& x, value_type r, size_t) const { - x = r; - } + void set_result(value_type &x, value_type r, size_t) const { x = r; } - void set_result(result_pair_type& x, value_type r, size_t l) const { + void set_result(result_pair_type &x, value_type r, size_t l) const { x.value = r; x.length = l; } @@ -242,9 +250,8 @@ class DoubleArrayImpl { } void clear() { - if (!no_delete_) - delete [] array_; - delete [] used_; + if (!no_delete_) delete[] array_; + delete[] used_; array_ = 0; used_ = 0; alloc_size_ = 0; @@ -252,8 +259,8 @@ class DoubleArrayImpl { no_delete_ = false; } - size_t unit_size() const { return sizeof(unit_t); } - size_t size() const { return size_; } + size_t unit_size() const { return sizeof(unit_t); } + size_t size() const { return size_; } size_t total_size() const { return size_ * sizeof(unit_t); } size_t nonzero_size() const { @@ -263,19 +270,16 @@ class DoubleArrayImpl { return result; } - int build(size_t key_size, - key_type **key, - size_t *length = 0, - value_type *value = 0, - int (*progress_func)(size_t, size_t) = 0) { + int build(size_t key_size, key_type **key, size_t *length = 0, + value_type *value = 0, int (*progress_func)(size_t, size_t) = 0) { if (!key_size || !key) return 0; progress_func_ = progress_func; - key_ = key; - length_ = length; - key_size_ = key_size; - value_ = value; - progress_ = 0; + key_ = key; + length_ = length; + key_size_ = key_size; + value_ = value; + progress_ = 0; resize(8192); @@ -283,33 +287,31 @@ class DoubleArrayImpl { next_check_pos_ = 0; node_t root_node; - root_node.left = 0; + root_node.left = 0; root_node.right = key_size; root_node.depth = 0; - std::vector siblings; + std::vector siblings; fetch(root_node, siblings); insert(siblings); size_ += (1 << 8 * sizeof(key_type)) + 1; if (size_ >= alloc_size_) resize(size_); - delete [] used_; + delete[] used_; used_ = 0; return error_; } - int open(const char *file, - const char *mode = "rb", - size_t offset = 0, + int open(const char *file, const char *mode = "rb", size_t offset = 0, size_t size = 0) { std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; if (std::fseek(fp, offset, SEEK_SET) != 0) return -1; if (!size) { - if (std::fseek(fp, 0L, SEEK_END) != 0) return -1; + if (std::fseek(fp, 0L, SEEK_END) != 0) return -1; size = std::ftell(fp); if (std::fseek(fp, offset, SEEK_SET) != 0) return -1; } @@ -319,32 +321,29 @@ class DoubleArrayImpl { size_ = size; size_ /= sizeof(unit_t); array_ = new unit_t[size_]; - if (size_ != std::fread(reinterpret_cast(array_), - sizeof(unit_t), size_, fp)) return -1; + if (size_ != std::fread(reinterpret_cast(array_), sizeof(unit_t), + size_, fp)) + return -1; std::fclose(fp); return 0; } - int save(const char *file, - const char *mode = "wb", - size_t offset = 0) { + int save(const char *file, const char *mode = "wb", size_t offset = 0) { if (!size_) return -1; std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; - if (size_ != std::fwrite(reinterpret_cast(array_), - sizeof(unit_t), size_, fp)) + if (size_ != std::fwrite(reinterpret_cast(array_), sizeof(unit_t), + size_, fp)) return -1; std::fclose(fp); return 0; } #ifdef HAVE_ZLIB_H - int gzopen(const char *file, - const char *mode = "rb", - size_t offset = 0, + int gzopen(const char *file, const char *mode = "rb", size_t offset = 0, size_t size = 0) { - std::FILE *fp = std::fopen(file, mode); + std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; clear(); @@ -352,12 +351,11 @@ class DoubleArrayImpl { if (!size_) { if (-1L != static_cast(std::fseek(fp, -8, SEEK_END))) { char buf[8]; - if (std::fread(static_cast(buf), - 1, 8, fp) != sizeof(buf)) { + if (std::fread(static_cast(buf), 1, 8, fp) != sizeof(buf)) { std::fclose(fp); return -1; } - size_ = LG(buf+4); + size_ = LG(buf + 4); size_ /= sizeof(unit_t); } } @@ -375,8 +373,7 @@ class DoubleArrayImpl { return 0; } - int gzsave(const char *file, const char *mode = "wb", - size_t offset = 0) { + int gzsave(const char *file, const char *mode = "wb", size_t offset = 0) { zlib::gzFile gzfp = zlib::gzopen(file, mode); if (!gzfp) return -1; zlib::gzwrite(gzfp, reinterpret_cast(array_), @@ -387,28 +384,25 @@ class DoubleArrayImpl { #endif template - inline void exactMatchSearch(const key_type *key, - T & result, - size_t len = 0, + inline void exactMatchSearch(const key_type *key, T &result, size_t len = 0, size_t node_pos = 0) const { result = exactMatchSearch(key, len, node_pos); return; } template - inline T exactMatchSearch(const key_type *key, - size_t len = 0, + inline T exactMatchSearch(const key_type *key, size_t len = 0, size_t node_pos = 0) const { if (!len) len = length_func_()(key); T result; set_result(result, -1, 0); - register array_type_ b = array_[node_pos].base; - register array_u_type_ p; + array_type_ b = array_[node_pos].base; + array_u_type_ p; - for (register size_t i = 0; i < len; ++i) { - p = b +(node_u_type_)(key[i]) + 1; + for (size_t i = 0; i < len; ++i) { + p = b + (node_u_type_)(key[i]) + 1; if (static_cast(b) == array_[p].check) b = array_[p].base; else @@ -418,35 +412,32 @@ class DoubleArrayImpl { p = b; array_type_ n = array_[p].base; if (static_cast(b) == array_[p].check && n < 0) - set_result(result, -n-1, len); + set_result(result, -n - 1, len); return result; } template - size_t commonPrefixSearch(const key_type *key, - T* result, - size_t result_len, - size_t len = 0, - size_t node_pos = 0) const { + size_t commonPrefixSearch(const key_type *key, T *result, size_t result_len, + size_t len = 0, size_t node_pos = 0) const { if (!len) len = length_func_()(key); - register array_type_ b = array_[node_pos].base; - register size_t num = 0; - register array_type_ n; - register array_u_type_ p; + array_type_ b = array_[node_pos].base; + size_t num = 0; + array_type_ n; + array_u_type_ p; - for (register size_t i = 0; i < len; ++i) { + for (size_t i = 0; i < len; ++i) { p = b; // + 0; n = array_[p].base; - if ((array_u_type_) b == array_[p].check && n < 0) { + if ((array_u_type_)b == array_[p].check && n < 0) { // result[num] = -n-1; - if (num < result_len) set_result(result[num], -n-1, i); + if (num < result_len) set_result(result[num], -n - 1, i); ++num; } - p = b +(node_u_type_)(key[i]) + 1; - if ((array_u_type_) b == array_[p].check) + p = b + (node_u_type_)(key[i]) + 1; + if ((array_u_type_)b == array_[p].check) b = array_[p].base; else return num; @@ -456,24 +447,22 @@ class DoubleArrayImpl { n = array_[p].base; if ((array_u_type_)b == array_[p].check && n < 0) { - if (num < result_len) set_result(result[num], -n-1, len); + if (num < result_len) set_result(result[num], -n - 1, len); ++num; } return num; } - value_type traverse(const key_type *key, - size_t &node_pos, - size_t &key_pos, + value_type traverse(const key_type *key, size_t &node_pos, size_t &key_pos, size_t len = 0) const { if (!len) len = length_func_()(key); - register array_type_ b = array_[node_pos].base; - register array_u_type_ p; + array_type_ b = array_[node_pos].base; + array_u_type_ p; for (; key_pos < len; ++key_pos) { - p = b +(node_u_type_)(key[key_pos]) + 1; + p = b + (node_u_type_)(key[key_pos]) + 1; if (static_cast(b) == array_[p].check) { node_pos = p; b = array_[p].base; @@ -485,34 +474,35 @@ class DoubleArrayImpl { p = b; array_type_ n = array_[p].base; if (static_cast(b) == array_[p].check && n < 0) - return -n-1; + return -n - 1; return -1; // found, but no value } }; #if 4 == 2 -typedef Darts::DoubleArrayImpl DoubleArray; +typedef Darts::DoubleArrayImpl + DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) -typedef Darts::DoubleArrayImpl DoubleArray; +typedef Darts::DoubleArrayImpl + DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) -typedef Darts::DoubleArrayImpl DoubleArray; +typedef Darts::DoubleArrayImpl + DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if 4 == 8 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl DoubleArray; + unsigned long long> + DoubleArray; #endif -} -} +} // namespace Darts +} // namespace MeCab #endif diff --git a/src/viterbi.cpp b/src/viterbi.cpp index a4b46dc..9ae785c 100644 --- a/src/viterbi.cpp +++ b/src/viterbi.cpp @@ -3,19 +3,19 @@ // // Copyright(C) 2001-2011 Taku Kudo // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation -#include -#include -#include -#include +#include "viterbi.h" #include "common.h" #include "connector.h" #include "mecab.h" #include "nbest_generator.h" #include "param.h" -#include "viterbi.h" #include "scoped_ptr.h" #include "string_buffer.h" #include "tokenizer.h" +#include +#include +#include +#include namespace MeCab { @@ -23,8 +23,7 @@ namespace { void calc_alpha(Node *n, double beta) { n->alpha = 0.0; for (Path *path = n->lpath; path; path = path->lnext) { - n->alpha = logsumexp(n->alpha, - -beta * path->cost + path->lnode->alpha, + n->alpha = logsumexp(n->alpha, -beta * path->cost + path->lnode->alpha, path == n->lpath); } } @@ -32,16 +31,13 @@ void calc_alpha(Node *n, double beta) { void calc_beta(Node *n, double beta) { n->beta = 0.0; for (Path *path = n->rpath; path; path = path->rnext) { - n->beta = logsumexp(n->beta, - -beta * path->cost + path->rnode->beta, + n->beta = logsumexp(n->beta, -beta * path->cost + path->rnode->beta, path == n->rpath); } } } // namespace -Viterbi::Viterbi() - : tokenizer_(0), connector_(0), - cost_factor_(0) {} +Viterbi::Viterbi() : tokenizer_(0), connector_(0), cost_factor_(0) {} Viterbi::~Viterbi() {} @@ -53,10 +49,8 @@ bool Viterbi::open(const Param ¶m) { connector_.reset(new Connector); CHECK_FALSE(connector_->open(param)) << connector_->what(); - CHECK_FALSE(tokenizer_->dictionary_info()->lsize == - connector_->left_size() && - tokenizer_->dictionary_info()->rsize == - connector_->right_size()) + CHECK_FALSE(tokenizer_->dictionary_info()->lsize == connector_->left_size() && + tokenizer_->dictionary_info()->rsize == connector_->right_size()) << "Transition table and dictionary are not compatible"; cost_factor_ = param.get("cost-factor"); @@ -121,9 +115,7 @@ const Tokenizer *Viterbi::tokenizer() const { return tokenizer_.get(); } -const Connector *Viterbi::connector() const { - return connector_.get(); -} +const Connector *Viterbi::connector() const { return connector_.get(); } // static bool Viterbi::forwardbackward(Lattice *lattice) { @@ -131,7 +123,7 @@ bool Viterbi::forwardbackward(Lattice *lattice) { return true; } - Node **end_node_list = lattice->end_nodes(); + Node **end_node_list = lattice->end_nodes(); Node **begin_node_list = lattice->begin_nodes(); const size_t len = lattice->size(); @@ -158,9 +150,8 @@ bool Viterbi::forwardbackward(Lattice *lattice) { for (Node *node = begin_node_list[pos]; node; node = node->bnext) { node->prob = std::exp(node->alpha + node->beta - Z); for (Path *path = node->lpath; path; path = path->lnext) { - path->prob = std::exp(path->lnode->alpha - - theta * path->cost - + path->rnode->beta - Z); + path->prob = std::exp(path->lnode->alpha - theta * path->cost + + path->rnode->beta - Z); } } } @@ -203,14 +194,13 @@ bool Viterbi::buildAlternative(Lattice *lattice) { if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE) { continue; } - const size_t pos = node->surface - lattice->sentence() - - node->rlength + node->length; + const size_t pos = + node->surface - lattice->sentence() - node->rlength + node->length; std::cout.write(node->surface, node->length); std::cout << "\t" << node->feature << std::endl; - for (const Node *anode = begin_node_list[pos]; - anode; anode = anode->bnext) { - if (anode->rlength == node->rlength && - anode->length == node->length) { + for (const Node *anode = begin_node_list[pos]; anode; + anode = anode->bnext) { + if (anode->rlength == node->rlength && anode->length == node->length) { std::cout << "@ "; std::cout.write(anode->surface, anode->length); std::cout << "\t" << anode->feature << std::endl; @@ -250,8 +240,7 @@ bool Viterbi::initPartial(Lattice *lattice) { if (!lattice->has_request_type(MECAB_PARTIAL)) { if (lattice->has_constraint()) { lattice->set_boundary_constraint(0, MECAB_TOKEN_BOUNDARY); - lattice->set_boundary_constraint(lattice->size(), - MECAB_TOKEN_BOUNDARY); + lattice->set_boundary_constraint(lattice->size(), MECAB_TOKEN_BOUNDARY); } return true; } @@ -261,10 +250,9 @@ bool Viterbi::initPartial(Lattice *lattice) { strncpy(str, lattice->sentence(), lattice->size() + 1); std::vector lines; - const size_t lsize = tokenize(str, "\n", - std::back_inserter(lines), - lattice->size() + 1); - char* column[2]; + const size_t lsize = + tokenize(str, "\n", std::back_inserter(lines), lattice->size() + 1); + char *column[2]; scoped_array buf(new char[lattice->size() + 1]); StringBuffer os(buf.get(), lattice->size() + 1); @@ -301,8 +289,7 @@ bool Viterbi::initPartial(Lattice *lattice) { if (feature) { lattice->set_feature_constraint(pos, pos + len, feature); for (size_t n = 1; n < len; ++n) { - lattice->set_boundary_constraint(pos + n, - MECAB_INSIDE_TOKEN); + lattice->set_boundary_constraint(pos + n, MECAB_INSIDE_TOKEN); } } pos += len; @@ -312,31 +299,30 @@ bool Viterbi::initPartial(Lattice *lattice) { } namespace { -template bool connect(size_t pos, Node *rnode, - Node **begin_node_list, - Node **end_node_list, - const Connector *connector, - Allocator *allocator) { - for (;rnode; rnode = rnode->bnext) { - register long best_cost = 2147483647; - Node* best_node = 0; +template +bool connect(size_t pos, Node *rnode, Node **begin_node_list, + Node **end_node_list, const Connector *connector, + Allocator *allocator) { + for (; rnode; rnode = rnode->bnext) { + long best_cost = 2147483647; + Node *best_node = 0; for (Node *lnode = end_node_list[pos]; lnode; lnode = lnode->enext) { - register int lcost = connector->cost(lnode, rnode); // local cost - register long cost = lnode->cost + lcost; + int lcost = connector->cost(lnode, rnode); // local cost + long cost = lnode->cost + lcost; if (cost < best_cost) { - best_node = lnode; - best_cost = cost; + best_node = lnode; + best_cost = cost; } if (IsAllPath) { - Path *path = allocator->newPath(); - path->cost = lcost; - path->rnode = rnode; - path->lnode = lnode; - path->lnext = rnode->lpath; + Path *path = allocator->newPath(); + path->cost = lcost; + path->rnode = rnode; + path->lnode = lnode; + path->lnext = rnode->lpath; rnode->lpath = path; - path->rnext = lnode->rpath; + path->rnext = lnode->rpath; lnode->rpath = path; } } @@ -360,7 +346,7 @@ template bool connect(size_t pos, Node *rnode, template bool Viterbi::viterbi(Lattice *lattice) const { - Node **end_node_list = lattice->end_nodes(); + Node **end_node_list = lattice->end_nodes(); Node **begin_node_list = lattice->begin_nodes(); Allocator *allocator = lattice->allocator(); const size_t len = lattice->size(); @@ -373,14 +359,11 @@ bool Viterbi::viterbi(Lattice *lattice) const { for (size_t pos = 0; pos < len; ++pos) { if (end_node_list[pos]) { - Node *right_node = tokenizer_->lookup(begin + pos, end, - allocator, lattice); + Node *right_node = + tokenizer_->lookup(begin + pos, end, allocator, lattice); begin_node_list[pos] = right_node; - if (!connect(pos, right_node, - begin_node_list, - end_node_list, - connector_.get(), - allocator)) { + if (!connect(pos, right_node, begin_node_list, end_node_list, + connector_.get(), allocator)) { lattice->set_what("too long sentence."); return false; } @@ -393,11 +376,8 @@ bool Viterbi::viterbi(Lattice *lattice) const { for (long pos = len; static_cast(pos) >= 0; --pos) { if (end_node_list[pos]) { - if (!connect(pos, eos_node, - begin_node_list, - end_node_list, - connector_.get(), - allocator)) { + if (!connect(pos, eos_node, begin_node_list, end_node_list, + connector_.get(), allocator)) { lattice->set_what("too long sentence."); return false; } @@ -410,4 +390,4 @@ bool Viterbi::viterbi(Lattice *lattice) const { return true; } -} // Mecab +} // namespace MeCab From 083722c1a0b0448cc3d2ee42e4e980dd524354db Mon Sep 17 00:00:00 2001 From: frawamudi Date: Mon, 16 Jan 2023 22:27:20 +0100 Subject: [PATCH 2/5] remove register keyword --- src/char_property.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/char_property.h b/src/char_property.h index 35f4b05..9c904ba 100644 --- a/src/char_property.h +++ b/src/char_property.h @@ -37,7 +37,7 @@ class CharProperty { inline const char *seekToOtherType(const char *begin, const char *end, CharInfo c, CharInfo *fail, size_t *mblen, size_t *clen) const { - register const char *p = begin; + const char *p = begin; *clen = 0; while (p != end && c.isKindOf(*fail = getCharInfo(p, end, mblen))) { p += *mblen; From bf5a1b5c181413b3ffc94304b792a8c9179f2e82 Mon Sep 17 00:00:00 2001 From: frawamudi Date: Tue, 17 Jan 2023 00:10:48 +0100 Subject: [PATCH 3/5] added headers common.h and scoped_ptr.h intoviterbi.h --- src/viterbi.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/viterbi.h b/src/viterbi.h index 54bfdf9..e8c92c5 100644 --- a/src/viterbi.h +++ b/src/viterbi.h @@ -9,6 +9,8 @@ #include #include "mecab.h" #include "thread.h" +#include "scoped_ptr.h" +#include "common.h" namespace MeCab { From 8dbdbba8e7041a293b2c8c621caead48ce594ea8 Mon Sep 17 00:00:00 2001 From: frawamudi Date: Wed, 23 Aug 2023 00:49:25 +0100 Subject: [PATCH 4/5] reverted clang-tidy format --- src/char_property.h | 4 +- src/darts.h | 274 +++++++++--------- src/viterbi.cpp | 687 +++++++++++++++++++++++--------------------- src/viterbi.h | 2 +- 4 files changed, 498 insertions(+), 469 deletions(-) diff --git a/src/char_property.h b/src/char_property.h index 35f4b05..7be47c5 100644 --- a/src/char_property.h +++ b/src/char_property.h @@ -37,7 +37,7 @@ class CharProperty { inline const char *seekToOtherType(const char *begin, const char *end, CharInfo c, CharInfo *fail, size_t *mblen, size_t *clen) const { - register const char *p = begin; + const char *p = begin; *clen = 0; while (p != end && c.isKindOf(*fail = getCharInfo(p, end, mblen))) { p += *mblen; @@ -89,4 +89,4 @@ class CharProperty { whatlog what_; }; } -#endif // MECAB_CHARACTER_CATEGORY_H_ +#endif // MECAB_CHARACTER_CATEGORY_H_ \ No newline at end of file diff --git a/src/darts.h b/src/darts.h index 4efb94c..875b64f 100644 --- a/src/darts.h +++ b/src/darts.h @@ -8,82 +8,72 @@ #define DARTS_H_ #define DARTS_VERSION "0.31" -#include -#include #include +#include +#include #ifdef HAVE_ZLIB_H namespace zlib { #include } -#define SH(p) \ - ((unsigned short)(unsigned char)((p)[0]) | \ - ((unsigned short)(unsigned char)((p)[1]) << 8)) -#define LG(p) ((unsigned long)(SH(p)) | ((unsigned long)(SH((p) + 2)) << 16)) +#define SH(p)((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8)) +#define LG(p)((unsigned long)(SH(p)) |((unsigned long)(SH((p)+2)) << 16)) #endif namespace MeCab { namespace Darts { -template -inline T _max(T x, T y) { - return (x > y) ? x : y; -} -template -inline T *_resize(T *ptr, size_t n, size_t l, T v) { +template inline T _max(T x, T y) { return(x > y) ? x : y; } +template inline T* _resize(T* ptr, size_t n, size_t l, T v) { T *tmp = new T[l]; for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i]; for (size_t i = n; i < l; ++i) tmp[i] = v; - delete[] ptr; + delete [] ptr; return tmp; } template class Length { - public: - size_t operator()(const T *key) const { - size_t i; - for (i = 0; key[i] != (T)0; ++i) { - } - return i; - } + public: size_t operator()(const T *key) const + { size_t i; for (i = 0; key[i] != (T)0; ++i) {} return i; } }; -template <> -class Length { - public: - size_t operator()(const char *key) const { return std::strlen(key); } +template <> class Length { + public: size_t operator()(const char *key) const + { return std::strlen(key); } }; -template > +template > class DoubleArrayImpl { private: + struct node_t { array_u_type_ code; - size_t depth; - size_t left; - size_t right; + size_t depth; + size_t left; + size_t right; }; struct unit_t { - array_type_ base; + array_type_ base; array_u_type_ check; }; - unit_t *array_; + unit_t *array_; unsigned char *used_; - size_t size_; - size_t alloc_size_; - node_type_ **key_; - size_t key_size_; - size_t *length_; - array_type_ *value_; - size_t progress_; - size_t next_check_pos_; - bool no_delete_; - int error_; + size_t size_; + size_t alloc_size_; + node_type_ **key_; + size_t key_size_; + size_t *length_; + array_type_ *value_; + size_t progress_; + size_t next_check_pos_; + bool no_delete_; + int error_; int (*progress_func_)(size_t, size_t); size_t resize(const size_t new_size) { @@ -91,13 +81,13 @@ class DoubleArrayImpl { tmp.base = 0; tmp.check = 0; array_ = _resize(array_, alloc_size_, new_size, tmp); - used_ = - _resize(used_, alloc_size_, new_size, static_cast(0)); + used_ = _resize(used_, alloc_size_, new_size, + static_cast(0)); alloc_size_ = new_size; return new_size; } - size_t fetch(const node_t &parent, std::vector &siblings) { + size_t fetch(const node_t &parent, std::vector &siblings) { if (error_ < 0) return 0; array_u_type_ prev = 0; @@ -120,9 +110,9 @@ class DoubleArrayImpl { if (cur != prev || siblings.empty()) { node_t tmp_node; tmp_node.depth = parent.depth + 1; - tmp_node.code = cur; - tmp_node.left = i; - if (!siblings.empty()) siblings[siblings.size() - 1].right = i; + tmp_node.code = cur; + tmp_node.left = i; + if (!siblings.empty()) siblings[siblings.size()-1].right = i; siblings.push_back(tmp_node); } @@ -130,23 +120,24 @@ class DoubleArrayImpl { prev = cur; } - if (!siblings.empty()) siblings[siblings.size() - 1].right = parent.right; + if (!siblings.empty()) + siblings[siblings.size()-1].right = parent.right; return siblings.size(); } - size_t insert(const std::vector &siblings) { + size_t insert(const std::vector &siblings) { if (error_ < 0) return 0; size_t begin = 0; - size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1; + size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1; size_t nonzero_num = 0; - int first = 0; + int first = 0; if (alloc_size_ <= pos) resize(pos + 1); while (true) { - next: + next: ++pos; if (alloc_size_ <= pos) resize(pos + 1); @@ -160,7 +151,7 @@ class DoubleArrayImpl { } begin = pos - siblings[0].code; - if (alloc_size_ <= (begin + siblings[siblings.size() - 1].code)) + if (alloc_size_ <= (begin + siblings[siblings.size()-1].code)) resize(static_cast(alloc_size_ * _max(1.05, 1.0 * key_size_ / progress_))); @@ -177,31 +168,33 @@ class DoubleArrayImpl { // 'next_check_pos' and 'check' is greater than some constant // value(e.g. 0.9), // new 'next_check_pos' index is written by 'check'. - if (1.0 * nonzero_num / (pos - next_check_pos_ + 1) >= 0.95) + if (1.0 * nonzero_num/(pos - next_check_pos_ + 1) >= 0.95) next_check_pos_ = pos; used_[begin] = 1; - size_ = _max(size_, begin + static_cast( - siblings[siblings.size() - 1].code + 1)); + size_ = _max(size_, + begin + + static_cast(siblings[siblings.size() - 1].code + 1)); for (size_t i = 0; i < siblings.size(); ++i) array_[begin + siblings[i].code].check = begin; for (size_t i = 0; i < siblings.size(); ++i) { - std::vector new_siblings; + std::vector new_siblings; if (!fetch(siblings[i], new_siblings)) { array_[begin + siblings[i].code].base = - value_ ? static_cast(-value_[siblings[i].left] - 1) - : static_cast(-siblings[i].left - 1); + value_ ? + static_cast(-value_[siblings[i].left]-1) : + static_cast(-siblings[i].left-1); - if (value_ && (array_type_)(-value_[siblings[i].left] - 1) >= 0) { + if (value_ && (array_type_)(-value_[siblings[i].left]-1) >= 0) { error_ = -2; return 0; } ++progress_; - if (progress_func_) (*progress_func_)(progress_, key_size_); + if (progress_func_)(*progress_func_)(progress_, key_size_); } else { size_t h = insert(new_siblings); @@ -213,27 +206,26 @@ class DoubleArrayImpl { } public: - typedef array_type_ value_type; - typedef node_type_ key_type; - typedef array_type_ result_type; // for compatibility + + typedef array_type_ value_type; + typedef node_type_ key_type; + typedef array_type_ result_type; // for compatibility struct result_pair_type { value_type value; - size_t length; + size_t length; }; - explicit DoubleArrayImpl() - : array_(0), - used_(0), - size_(0), - alloc_size_(0), - no_delete_(0), - error_(0) {} + explicit DoubleArrayImpl(): array_(0), used_(0), + size_(0), alloc_size_(0), + no_delete_(0), error_(0) {} ~DoubleArrayImpl() { clear(); } - void set_result(value_type &x, value_type r, size_t) const { x = r; } + void set_result(value_type& x, value_type r, size_t) const { + x = r; + } - void set_result(result_pair_type &x, value_type r, size_t l) const { + void set_result(result_pair_type& x, value_type r, size_t l) const { x.value = r; x.length = l; } @@ -250,8 +242,9 @@ class DoubleArrayImpl { } void clear() { - if (!no_delete_) delete[] array_; - delete[] used_; + if (!no_delete_) + delete [] array_; + delete [] used_; array_ = 0; used_ = 0; alloc_size_ = 0; @@ -259,8 +252,8 @@ class DoubleArrayImpl { no_delete_ = false; } - size_t unit_size() const { return sizeof(unit_t); } - size_t size() const { return size_; } + size_t unit_size() const { return sizeof(unit_t); } + size_t size() const { return size_; } size_t total_size() const { return size_ * sizeof(unit_t); } size_t nonzero_size() const { @@ -270,16 +263,19 @@ class DoubleArrayImpl { return result; } - int build(size_t key_size, key_type **key, size_t *length = 0, - value_type *value = 0, int (*progress_func)(size_t, size_t) = 0) { + int build(size_t key_size, + key_type **key, + size_t *length = 0, + value_type *value = 0, + int (*progress_func)(size_t, size_t) = 0) { if (!key_size || !key) return 0; progress_func_ = progress_func; - key_ = key; - length_ = length; - key_size_ = key_size; - value_ = value; - progress_ = 0; + key_ = key; + length_ = length; + key_size_ = key_size; + value_ = value; + progress_ = 0; resize(8192); @@ -287,31 +283,33 @@ class DoubleArrayImpl { next_check_pos_ = 0; node_t root_node; - root_node.left = 0; + root_node.left = 0; root_node.right = key_size; root_node.depth = 0; - std::vector siblings; + std::vector siblings; fetch(root_node, siblings); insert(siblings); size_ += (1 << 8 * sizeof(key_type)) + 1; if (size_ >= alloc_size_) resize(size_); - delete[] used_; + delete [] used_; used_ = 0; return error_; } - int open(const char *file, const char *mode = "rb", size_t offset = 0, + int open(const char *file, + const char *mode = "rb", + size_t offset = 0, size_t size = 0) { std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; if (std::fseek(fp, offset, SEEK_SET) != 0) return -1; if (!size) { - if (std::fseek(fp, 0L, SEEK_END) != 0) return -1; + if (std::fseek(fp, 0L, SEEK_END) != 0) return -1; size = std::ftell(fp); if (std::fseek(fp, offset, SEEK_SET) != 0) return -1; } @@ -321,29 +319,32 @@ class DoubleArrayImpl { size_ = size; size_ /= sizeof(unit_t); array_ = new unit_t[size_]; - if (size_ != std::fread(reinterpret_cast(array_), sizeof(unit_t), - size_, fp)) - return -1; + if (size_ != std::fread(reinterpret_cast(array_), + sizeof(unit_t), size_, fp)) return -1; std::fclose(fp); return 0; } - int save(const char *file, const char *mode = "wb", size_t offset = 0) { + int save(const char *file, + const char *mode = "wb", + size_t offset = 0) { if (!size_) return -1; std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; - if (size_ != std::fwrite(reinterpret_cast(array_), sizeof(unit_t), - size_, fp)) + if (size_ != std::fwrite(reinterpret_cast(array_), + sizeof(unit_t), size_, fp)) return -1; std::fclose(fp); return 0; } #ifdef HAVE_ZLIB_H - int gzopen(const char *file, const char *mode = "rb", size_t offset = 0, + int gzopen(const char *file, + const char *mode = "rb", + size_t offset = 0, size_t size = 0) { - std::FILE *fp = std::fopen(file, mode); + std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; clear(); @@ -351,11 +352,12 @@ class DoubleArrayImpl { if (!size_) { if (-1L != static_cast(std::fseek(fp, -8, SEEK_END))) { char buf[8]; - if (std::fread(static_cast(buf), 1, 8, fp) != sizeof(buf)) { + if (std::fread(static_cast(buf), + 1, 8, fp) != sizeof(buf)) { std::fclose(fp); return -1; } - size_ = LG(buf + 4); + size_ = LG(buf+4); size_ /= sizeof(unit_t); } } @@ -373,7 +375,8 @@ class DoubleArrayImpl { return 0; } - int gzsave(const char *file, const char *mode = "wb", size_t offset = 0) { + int gzsave(const char *file, const char *mode = "wb", + size_t offset = 0) { zlib::gzFile gzfp = zlib::gzopen(file, mode); if (!gzfp) return -1; zlib::gzwrite(gzfp, reinterpret_cast(array_), @@ -384,25 +387,28 @@ class DoubleArrayImpl { #endif template - inline void exactMatchSearch(const key_type *key, T &result, size_t len = 0, + inline void exactMatchSearch(const key_type *key, + T & result, + size_t len = 0, size_t node_pos = 0) const { result = exactMatchSearch(key, len, node_pos); return; } template - inline T exactMatchSearch(const key_type *key, size_t len = 0, + inline T exactMatchSearch(const key_type *key, + size_t len = 0, size_t node_pos = 0) const { if (!len) len = length_func_()(key); T result; set_result(result, -1, 0); - array_type_ b = array_[node_pos].base; + array_type_ b = array_[node_pos].base; array_u_type_ p; for (size_t i = 0; i < len; ++i) { - p = b + (node_u_type_)(key[i]) + 1; + p = b +(node_u_type_)(key[i]) + 1; if (static_cast(b) == array_[p].check) b = array_[p].base; else @@ -412,32 +418,35 @@ class DoubleArrayImpl { p = b; array_type_ n = array_[p].base; if (static_cast(b) == array_[p].check && n < 0) - set_result(result, -n - 1, len); + set_result(result, -n-1, len); return result; } template - size_t commonPrefixSearch(const key_type *key, T *result, size_t result_len, - size_t len = 0, size_t node_pos = 0) const { + size_t commonPrefixSearch(const key_type *key, + T* result, + size_t result_len, + size_t len = 0, + size_t node_pos = 0) const { if (!len) len = length_func_()(key); - array_type_ b = array_[node_pos].base; - size_t num = 0; - array_type_ n; + array_type_ b = array_[node_pos].base; + size_t num = 0; + array_type_ n; array_u_type_ p; - for (size_t i = 0; i < len; ++i) { + for ( size_t i = 0; i < len; ++i) { p = b; // + 0; n = array_[p].base; - if ((array_u_type_)b == array_[p].check && n < 0) { + if ((array_u_type_) b == array_[p].check && n < 0) { // result[num] = -n-1; - if (num < result_len) set_result(result[num], -n - 1, i); + if (num < result_len) set_result(result[num], -n-1, i); ++num; } - p = b + (node_u_type_)(key[i]) + 1; - if ((array_u_type_)b == array_[p].check) + p = b +(node_u_type_)(key[i]) + 1; + if ((array_u_type_) b == array_[p].check) b = array_[p].base; else return num; @@ -447,22 +456,24 @@ class DoubleArrayImpl { n = array_[p].base; if ((array_u_type_)b == array_[p].check && n < 0) { - if (num < result_len) set_result(result[num], -n - 1, len); + if (num < result_len) set_result(result[num], -n-1, len); ++num; } return num; } - value_type traverse(const key_type *key, size_t &node_pos, size_t &key_pos, + value_type traverse(const key_type *key, + size_t &node_pos, + size_t &key_pos, size_t len = 0) const { if (!len) len = length_func_()(key); - array_type_ b = array_[node_pos].base; + array_type_ b = array_[node_pos].base; array_u_type_ p; for (; key_pos < len; ++key_pos) { - p = b + (node_u_type_)(key[key_pos]) + 1; + p = b +(node_u_type_)(key[key_pos]) + 1; if (static_cast(b) == array_[p].check) { node_pos = p; b = array_[p].base; @@ -474,35 +485,34 @@ class DoubleArrayImpl { p = b; array_type_ n = array_[p].base; if (static_cast(b) == array_[p].check && n < 0) - return -n - 1; + return -n-1; return -1; // found, but no value } }; #if 4 == 2 -typedef Darts::DoubleArrayImpl - DoubleArray; +typedef Darts::DoubleArrayImpl DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) -typedef Darts::DoubleArrayImpl - DoubleArray; +typedef Darts::DoubleArrayImpl DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) -typedef Darts::DoubleArrayImpl - DoubleArray; +typedef Darts::DoubleArrayImpl DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if 4 == 8 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl - DoubleArray; -#endif -} // namespace Darts -} // namespace MeCab + unsigned long long> DoubleArray; #endif +} +} +#endif \ No newline at end of file diff --git a/src/viterbi.cpp b/src/viterbi.cpp index 9ae785c..ba9ce92 100644 --- a/src/viterbi.cpp +++ b/src/viterbi.cpp @@ -1,393 +1,412 @@ -// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer +// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer // // // Copyright(C) 2001-2011 Taku Kudo // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation -#include "viterbi.h" +#include +#include +#include +#include #include "common.h" -#include "connector.h" -#include "mecab.h" -#include "nbest_generator.h" #include "param.h" -#include "scoped_ptr.h" #include "string_buffer.h" -#include "tokenizer.h" -#include -#include -#include -#include +#include "utils.h" +#include "writer.h" namespace MeCab { -namespace { -void calc_alpha(Node *n, double beta) { - n->alpha = 0.0; - for (Path *path = n->lpath; path; path = path->lnext) { - n->alpha = logsumexp(n->alpha, -beta * path->cost + path->lnode->alpha, - path == n->lpath); - } -} - -void calc_beta(Node *n, double beta) { - n->beta = 0.0; - for (Path *path = n->rpath; path; path = path->rnext) { - n->beta = logsumexp(n->beta, -beta * path->cost + path->rnode->beta, - path == n->rpath); - } -} -} // namespace - -Viterbi::Viterbi() : tokenizer_(0), connector_(0), cost_factor_(0) {} +Writer::Writer() : write_(&Writer::writeLattice) {} +Writer::~Writer() {} -Viterbi::~Viterbi() {} - -bool Viterbi::open(const Param ¶m) { - tokenizer_.reset(new Tokenizer); - CHECK_FALSE(tokenizer_->open(param)) << tokenizer_->what(); - CHECK_FALSE(tokenizer_->dictionary_info()) << "Dictionary is empty"; - - connector_.reset(new Connector); - CHECK_FALSE(connector_->open(param)) << connector_->what(); - - CHECK_FALSE(tokenizer_->dictionary_info()->lsize == connector_->left_size() && - tokenizer_->dictionary_info()->rsize == connector_->right_size()) - << "Transition table and dictionary are not compatible"; - - cost_factor_ = param.get("cost-factor"); - if (cost_factor_ == 0) { - cost_factor_ = 800; - } - - return true; +void Writer::close() { + write_ = &Writer::writeLattice; } -bool Viterbi::analyze(Lattice *lattice) const { - if (!lattice || !lattice->sentence()) { - return false; - } - - if (!initPartial(lattice)) { - return false; - } - - bool result = false; - if (lattice->has_request_type(MECAB_NBEST) || - lattice->has_request_type(MECAB_MARGINAL_PROB)) { - // IsAllPath=true - if (lattice->has_constraint()) { - result = viterbi(lattice); - } else { - result = viterbi(lattice); - } +bool Writer::open(const Param ¶m) { + const std::string ostyle = param.get("output-format-type"); + write_ = &Writer::writeLattice; + + if (ostyle == "wakati") { + write_ = &Writer::writeWakati; + } else if (ostyle == "none") { + write_ = &Writer::writeNone; + } else if (ostyle == "dump") { + write_ = &Writer::writeDump; + } else if (ostyle == "em") { + write_ = &Writer::writeEM; } else { - // IsAllPath=false - if (lattice->has_constraint()) { - result = viterbi(lattice); - } else { - result = viterbi(lattice); - } - } - - if (!result) { - return false; - } - - if (!forwardbackward(lattice)) { - return false; - } - - if (!buildBestLattice(lattice)) { - return false; - } - - if (!buildAllLattice(lattice)) { - return false; - } - - if (!initNBest(lattice)) { - return false; - } - - return true; -} - -const Tokenizer *Viterbi::tokenizer() const { - return tokenizer_.get(); -} - -const Connector *Viterbi::connector() const { return connector_.get(); } - -// static -bool Viterbi::forwardbackward(Lattice *lattice) { - if (!lattice->has_request_type(MECAB_MARGINAL_PROB)) { - return true; - } - - Node **end_node_list = lattice->end_nodes(); - Node **begin_node_list = lattice->begin_nodes(); - - const size_t len = lattice->size(); - const double theta = lattice->theta(); - - end_node_list[0]->alpha = 0.0; - for (int pos = 0; pos <= static_cast(len); ++pos) { - for (Node *node = begin_node_list[pos]; node; node = node->bnext) { - calc_alpha(node, theta); + // default values + std::string node_format = "%m\\t%H\\n"; + std::string unk_format = "%m\\t%H\\n"; + std::string bos_format = ""; + std::string eos_format = "EOS\\n"; + std::string eon_format = ""; + + std::string node_format_key = "node-format"; + std::string bos_format_key = "bos-format"; + std::string eos_format_key = "eos-format"; + std::string unk_format_key = "unk-format"; + std::string eon_format_key = "eon-format"; + + if (!ostyle.empty()) { + node_format_key += "-"; + node_format_key += ostyle; + bos_format_key += "-"; + bos_format_key += ostyle; + eos_format_key += "-"; + eos_format_key += ostyle; + unk_format_key += "-"; + unk_format_key += ostyle; + eon_format_key += "-"; + eon_format_key += ostyle; + const std::string tmp = param.get(node_format_key.c_str()); + CHECK_FALSE(!tmp.empty()) << "unkown format type [" << ostyle << "]"; } - } - begin_node_list[len]->beta = 0.0; - for (int pos = static_cast(len); pos >= 0; --pos) { - for (Node *node = end_node_list[pos]; node; node = node->enext) { - calc_beta(node, theta); - } - } - - const double Z = begin_node_list[len]->alpha; - lattice->set_Z(Z); // alpha of EOS - - for (int pos = 0; pos <= static_cast(len); ++pos) { - for (Node *node = begin_node_list[pos]; node; node = node->bnext) { - node->prob = std::exp(node->alpha + node->beta - Z); - for (Path *path = node->lpath; path; path = path->lnext) { - path->prob = std::exp(path->lnode->alpha - theta * path->cost + - path->rnode->beta - Z); + const std::string node_format2 = + param.get(node_format_key.c_str()); + const std::string bos_format2 = + param.get(bos_format_key.c_str()); + const std::string eos_format2 = + param.get(eos_format_key.c_str()); + const std::string unk_format2 = + param.get(unk_format_key.c_str()); + const std::string eon_format2 = + param.get(eon_format_key.c_str()); + + if (node_format != node_format2 || bos_format != bos_format2 || + eos_format != eos_format2 || unk_format != unk_format2) { + write_ = &Writer::writeUser; + if (node_format != node_format2) { + node_format = node_format2; + } + if (bos_format != bos_format2) { + bos_format = bos_format2; } + if (eos_format != eos_format2) { + eos_format = eos_format2; + } + if (unk_format != unk_format2) { + unk_format = unk_format2; + } else if (node_format != node_format2) { + unk_format = node_format2; + } else { + unk_format = node_format; + } + if (eon_format != eon_format2) { + eon_format = eon_format2; + } + node_format_.reset_string(node_format.c_str()); + bos_format_.reset_string(bos_format.c_str()); + eos_format_.reset_string(eos_format.c_str()); + unk_format_.reset_string(unk_format.c_str()); + eon_format_.reset_string(eon_format.c_str()); } } return true; } -// static -bool Viterbi::buildResultForNBest(Lattice *lattice) { - return buildAllLattice(lattice); -} - -// static -bool Viterbi::buildAllLattice(Lattice *lattice) { - if (!lattice->has_request_type(MECAB_ALL_MORPHS)) { - return true; - } - - Node *prev = lattice->bos_node(); - const size_t len = lattice->size(); - Node **begin_node_list = lattice->begin_nodes(); - - for (long pos = 0; pos <= static_cast(len); ++pos) { - for (Node *node = begin_node_list[pos]; node; node = node->bnext) { - prev->next = node; - node->prev = prev; - prev = node; - } +bool Writer::write(Lattice *lattice, StringBuffer *os) const { + if (!lattice || !lattice->is_available()) { + return false; } - - return true; + return (this->*write_)(lattice, os); } -// static -bool Viterbi::buildAlternative(Lattice *lattice) { - Node **begin_node_list = lattice->begin_nodes(); - - const Node *bos_node = lattice->bos_node(); - for (const Node *node = bos_node; node; node = node->next) { - if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE) { - continue; - } - const size_t pos = - node->surface - lattice->sentence() - node->rlength + node->length; - std::cout.write(node->surface, node->length); - std::cout << "\t" << node->feature << std::endl; - for (const Node *anode = begin_node_list[pos]; anode; - anode = anode->bnext) { - if (anode->rlength == node->rlength && anode->length == node->length) { - std::cout << "@ "; - std::cout.write(anode->surface, anode->length); - std::cout << "\t" << anode->feature << std::endl; - } - } +bool Writer::writeLattice(Lattice *lattice, StringBuffer *os) const { + for (const Node *node = lattice->bos_node()->next; + node->next; node = node->next) { + os->write(node->surface, node->length); + *os << '\t' << node->feature; // << '\t'; + *os << '\n'; } - - std::cout << "EOS" << std::endl; - + *os << "EOS\n"; return true; } -// static -bool Viterbi::buildBestLattice(Lattice *lattice) { - Node *node = lattice->eos_node(); - for (Node *prev_node; node->prev;) { - node->isbest = 1; - prev_node = node->prev; - prev_node->next = node; - node = prev_node; +bool Writer::writeWakati(Lattice *lattice, StringBuffer *os) const { + for (const Node *node = lattice->bos_node()->next; + node->next; node = node->next) { + os->write(node->surface, node->length); + *os << ' '; } - + *os << '\n'; return true; } -// static -bool Viterbi::initNBest(Lattice *lattice) { - if (!lattice->has_request_type(MECAB_NBEST)) { - return true; - } - lattice->allocator()->nbest_generator()->set(lattice); - return true; +bool Writer::writeNone(Lattice *lattice, StringBuffer *os) const { + return true; // do nothing } -// static -bool Viterbi::initPartial(Lattice *lattice) { - if (!lattice->has_request_type(MECAB_PARTIAL)) { - if (lattice->has_constraint()) { - lattice->set_boundary_constraint(0, MECAB_TOKEN_BOUNDARY); - lattice->set_boundary_constraint(lattice->size(), MECAB_TOKEN_BOUNDARY); +bool Writer::writeEM(Lattice *lattice, StringBuffer *os) const { + static const float min_prob = 0.0001; + for (const Node *node = lattice->bos_node(); node; node = node->next) { + if (node->prob >= min_prob) { + *os << "U\t"; + if (node->stat == MECAB_BOS_NODE) { + *os << "BOS"; + } else if (node->stat == MECAB_EOS_NODE) { + *os << "EOS"; + } else { + os->write(node->surface, node->length); + } + *os << '\t' << node->feature << '\t' << node->prob << '\n'; + } + for (const Path *path = node->lpath; path; path = path->lnext) { + if (path->prob >= min_prob) { + *os << "B\t" << path->lnode->feature << '\t' + << node->feature << '\t' << path->prob << '\n'; + } } - return true; } + *os << "EOS\n"; + return true; +} - Allocator *allocator = lattice->allocator(); - char *str = allocator->partial_buffer(lattice->size() + 1); - strncpy(str, lattice->sentence(), lattice->size() + 1); - - std::vector lines; - const size_t lsize = - tokenize(str, "\n", std::back_inserter(lines), lattice->size() + 1); - char *column[2]; - scoped_array buf(new char[lattice->size() + 1]); - StringBuffer os(buf.get(), lattice->size() + 1); - - std::vector > tokens; - tokens.reserve(lsize); - - size_t pos = 0; - for (size_t i = 0; i < lsize; ++i) { - const size_t size = tokenize(lines[i], "\t", column, 2); - if (size == 1 && std::strcmp(column[0], "EOS") == 0) { - break; - } - const size_t len = std::strlen(column[0]); - if (size == 2) { - tokens.push_back(std::make_pair(column[0], column[1])); +bool Writer::writeDump(Lattice *lattice, StringBuffer *os) const { + const char *str = lattice->sentence(); + for (const Node *node = lattice->bos_node(); node; node = node->next) { + *os << node->id << ' '; + if (node->stat == MECAB_BOS_NODE) { + *os << "BOS"; + } else if (node->stat == MECAB_EOS_NODE) { + *os << "EOS"; } else { - tokens.push_back(std::make_pair(column[0], reinterpret_cast(0))); + os->write(node->surface, node->length); } - os << column[0]; - pos += len; - } - - os << '\0'; - - lattice->set_sentence(os.str()); - pos = 0; - for (size_t i = 0; i < tokens.size(); ++i) { - const char *surface = tokens[i].first; - const char *feature = tokens[i].second; - const size_t len = std::strlen(surface); - lattice->set_boundary_constraint(pos, MECAB_TOKEN_BOUNDARY); - lattice->set_boundary_constraint(pos + len, MECAB_TOKEN_BOUNDARY); - if (feature) { - lattice->set_feature_constraint(pos, pos + len, feature); - for (size_t n = 1; n < len; ++n) { - lattice->set_boundary_constraint(pos + n, MECAB_INSIDE_TOKEN); - } + *os << ' ' << node->feature + << ' ' << static_cast(node->surface - str) + << ' ' << static_cast(node->surface - str + node->length) + << ' ' << node->rcAttr + << ' ' << node->lcAttr + << ' ' << node->posid + << ' ' << static_cast(node->char_type) + << ' ' << static_cast(node->stat) + << ' ' << static_cast(node->isbest) + << ' ' << node->alpha + << ' ' << node->beta + << ' ' << node->prob + << ' ' << node->cost; + + for (const Path *path = node->lpath; path; path = path->lnext) { + *os << ' ' << path->lnode->id << ':' << path->cost << ':' << path->prob; } - pos += len; + *os << '\n'; } - return true; } -namespace { -template -bool connect(size_t pos, Node *rnode, Node **begin_node_list, - Node **end_node_list, const Connector *connector, - Allocator *allocator) { - for (; rnode; rnode = rnode->bnext) { - long best_cost = 2147483647; - Node *best_node = 0; - for (Node *lnode = end_node_list[pos]; lnode; lnode = lnode->enext) { - int lcost = connector->cost(lnode, rnode); // local cost - long cost = lnode->cost + lcost; - - if (cost < best_cost) { - best_node = lnode; - best_cost = cost; - } - - if (IsAllPath) { - Path *path = allocator->newPath(); - path->cost = lcost; - path->rnode = rnode; - path->lnode = lnode; - path->lnext = rnode->lpath; - rnode->lpath = path; - path->rnext = lnode->rpath; - lnode->rpath = path; - } - } - - // overflow check 2003/03/09 - if (!best_node) { +bool Writer::writeUser(Lattice *lattice, StringBuffer *os) const { + if (!writeNode(lattice, bos_format_.get(), lattice->bos_node(), os)) { + return false; + } + const Node *node = 0; + for (node = lattice->bos_node()->next; node->next; node = node->next) { + const char *fmt = (node->stat == MECAB_UNK_NODE ? unk_format_.get() : + node_format_.get()); + if (!writeNode(lattice, fmt, node, os)) { return false; } - - rnode->prev = best_node; - rnode->next = 0; - rnode->cost = best_cost; - const size_t x = rnode->rlength + pos; - rnode->enext = end_node_list[x]; - end_node_list[x] = rnode; } - + if (!writeNode(lattice, eos_format_.get(), node, os)) { + return false; + } return true; } -} // namespace -template -bool Viterbi::viterbi(Lattice *lattice) const { - Node **end_node_list = lattice->end_nodes(); - Node **begin_node_list = lattice->begin_nodes(); - Allocator *allocator = lattice->allocator(); - const size_t len = lattice->size(); - const char *begin = lattice->sentence(); - const char *end = begin + len; - - Node *bos_node = tokenizer_->getBOSNode(lattice->allocator()); - bos_node->surface = lattice->sentence(); - end_node_list[0] = bos_node; - - for (size_t pos = 0; pos < len; ++pos) { - if (end_node_list[pos]) { - Node *right_node = - tokenizer_->lookup(begin + pos, end, allocator, lattice); - begin_node_list[pos] = right_node; - if (!connect(pos, right_node, begin_node_list, end_node_list, - connector_.get(), allocator)) { - lattice->set_what("too long sentence."); - return false; - } - } +bool Writer::writeNode(Lattice *lattice, const Node *node, + StringBuffer *os) const { + switch (node->stat) { + case MECAB_BOS_NODE: + return writeNode(lattice, bos_format_.get(), node, os); + case MECAB_EOS_NODE: + return writeNode(lattice, eos_format_.get(), node, os); + case MECAB_UNK_NODE: + return writeNode(lattice, unk_format_.get(), node, os); + case MECAB_NOR_NODE: + return writeNode(lattice, node_format_.get(), node, os); + case MECAB_EON_NODE: + return writeNode(lattice, eon_format_.get(), node, os); } + return true; +} - Node *eos_node = tokenizer_->getEOSNode(lattice->allocator()); - eos_node->surface = lattice->sentence() + lattice->size(); - begin_node_list[lattice->size()] = eos_node; - - for (long pos = len; static_cast(pos) >= 0; --pos) { - if (end_node_list[pos]) { - if (!connect(pos, eos_node, begin_node_list, end_node_list, - connector_.get(), allocator)) { - lattice->set_what("too long sentence."); - return false; - } - break; - } +bool Writer::writeNode(Lattice *lattice, + const char *p, + const Node *node, + StringBuffer *os) const { + scoped_fixed_array buf; + scoped_fixed_array ptr; + size_t psize = 0; + + for (; *p; p++) { + switch (*p) { + default: *os << *p; break; + + case '\\': *os << getEscapedChar(*++p); break; + + case '%': { // macros + switch (*++p) { + default: { + const std::string error = "unknown meta char: " + *p; + lattice->set_what(error.c_str()); + return false; + } + // input sentence + case 'S': os->write(lattice->sentence(), lattice->size()); break; + // sentence length + case 'L': *os << lattice->size(); break; + // morph + case 'm': os->write(node->surface, node->length); break; + case 'M': os->write(reinterpret_cast + (node->surface - node->rlength + node->length), + node->rlength); + break; + case 'h': *os << node->posid; break; // Part-Of-Speech ID + case '%': *os << '%'; break; // % + case 'c': *os << static_cast(node->wcost); break; // word cost + case 'H': *os << node->feature; break; + case 't': *os << static_cast(node->char_type); break; + case 's': *os << static_cast(node->stat); break; + case 'P': *os << node->prob; break; + case 'p': { + switch (*++p) { + default: + lattice->set_what("[iseSCwcnblLh] is required after %p"); + return false; + case 'i': *os << node->id; break; // node id + case 'S': os->write(reinterpret_cast + (node->surface - + node->rlength + node->length), + node->rlength - node->length); + break; // space + // start position + case 's': *os << static_cast( + node->surface - lattice->sentence()); + break; + // end position + case 'e': *os << static_cast + (node->surface - lattice->sentence() + node->length); + break; + // connection cost + case 'C': *os << node->cost - + node->prev->cost - node->wcost; + break; + case 'w': *os << node->wcost; break; // word cost + case 'c': *os << node->cost; break; // best cost + case 'n': *os << (node->cost - node->prev->cost); break; + // node cost + // * if best path, otherwise ' ' + case 'b': *os << (node->isbest ? '*' : ' '); break; + case 'P': *os << node->prob; break; + case 'A': *os << node->alpha; break; + case 'B': *os << node->beta; break; + case 'l': *os << node->length; break; // length of morph + // length of morph including the spaces + case 'L': *os << node->rlength; break; + case 'h': { // Hidden Layer ID + switch (*++p) { + default: + lattice->set_what("lr is required after %ph"); + return false; + case 'l': *os << node->lcAttr; break; // current + case 'r': *os << node->rcAttr; break; // prev + } + } break; + + case 'p': { + char mode = *++p; + char sep = *++p; + if (sep == '\\') { + sep = getEscapedChar(*++p); + } + if (!node->lpath) { + lattice->set_what("no path information is available"); + return false; + } + for (Path *path = node->lpath; path; path = path->lnext) { + if (path != node->lpath) *os << sep; + switch (mode) { + case 'i': *os << path->lnode->id; break; + case 'c': *os << path->cost; break; + case 'P': *os << path->prob; break; + default: + lattice->set_what("[icP] is required after %pp"); + return false; + } + } + } break; + + } + } break; + + case 'F': + case 'f': { + if (node->feature[0] == '\0') { + lattice->set_what("no feature information available"); + return false; + } + if (!psize) { + std::strncpy(buf.get(), node->feature, buf.size()); + psize = tokenizeCSV(buf.get(), ptr.get(), ptr.size()); + } + + // separator + char separator = '\t'; // default separator + if (*p == 'F') { // change separator + if (*++p == '\\') { + separator = getEscapedChar(*++p); + } else { + separator = *p; + } + } + + if (*++p !='[') { + lattice->set_what("cannot find '['"); + return false; + } + size_t n = 0; + bool sep = false; + bool isfil = false; + p++; + + for (;; ++p) { + switch (*p) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = 10 * n +(*p - '0'); + break; + case ',': case ']': + if (n >= psize) { + lattice->set_what("given index is out of range"); + return false; + } + isfil = (ptr[n][0] != '*'); + if (isfil) { + if (sep) { + *os << separator; + } + *os << ptr[n]; + } + if (*p == ']') { + goto last; + } + sep = isfil; + n = 0; + break; + default: + lattice->set_what("cannot find ']'"); + return false; + } + } + } last: break; + } // end switch + } break; // end case '%' + } // end switch } - end_node_list[0] = bos_node; - begin_node_list[lattice->size()] = eos_node; - return true; } -} // namespace MeCab +} \ No newline at end of file diff --git a/src/viterbi.h b/src/viterbi.h index 54bfdf9..81ee86e 100644 --- a/src/viterbi.h +++ b/src/viterbi.h @@ -50,4 +50,4 @@ class Viterbi { whatlog what_; }; } -#endif // MECAB_VITERBI_H_ +#endif // MECAB_VITERBI_H_ \ No newline at end of file From e47fa18685109a717d7599f0951b7884ea3bff88 Mon Sep 17 00:00:00 2001 From: frawamudi Date: Wed, 23 Aug 2023 10:32:24 +0100 Subject: [PATCH 5/5] fixe mix up in viterbi.cpp file --- src/viterbi.cpp | 705 ++++++++++++++++++++++++------------------------ 1 file changed, 353 insertions(+), 352 deletions(-) diff --git a/src/viterbi.cpp b/src/viterbi.cpp index ba9ce92..e2273b9 100644 --- a/src/viterbi.cpp +++ b/src/viterbi.cpp @@ -1,412 +1,413 @@ -// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer +// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer // // // Copyright(C) 2001-2011 Taku Kudo // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation -#include +#include +#include +#include #include -#include -#include #include "common.h" +#include "connector.h" +#include "mecab.h" +#include "nbest_generator.h" #include "param.h" +#include "viterbi.h" +#include "scoped_ptr.h" #include "string_buffer.h" -#include "utils.h" -#include "writer.h" +#include "tokenizer.h" namespace MeCab { -Writer::Writer() : write_(&Writer::writeLattice) {} -Writer::~Writer() {} +namespace { +void calc_alpha(Node *n, double beta) { + n->alpha = 0.0; + for (Path *path = n->lpath; path; path = path->lnext) { + n->alpha = logsumexp(n->alpha, + -beta * path->cost + path->lnode->alpha, + path == n->lpath); + } +} -void Writer::close() { - write_ = &Writer::writeLattice; +void calc_beta(Node *n, double beta) { + n->beta = 0.0; + for (Path *path = n->rpath; path; path = path->rnext) { + n->beta = logsumexp(n->beta, + -beta * path->cost + path->rnode->beta, + path == n->rpath); + } } +} // namespace -bool Writer::open(const Param ¶m) { - const std::string ostyle = param.get("output-format-type"); - write_ = &Writer::writeLattice; - - if (ostyle == "wakati") { - write_ = &Writer::writeWakati; - } else if (ostyle == "none") { - write_ = &Writer::writeNone; - } else if (ostyle == "dump") { - write_ = &Writer::writeDump; - } else if (ostyle == "em") { - write_ = &Writer::writeEM; - } else { - // default values - std::string node_format = "%m\\t%H\\n"; - std::string unk_format = "%m\\t%H\\n"; - std::string bos_format = ""; - std::string eos_format = "EOS\\n"; - std::string eon_format = ""; - - std::string node_format_key = "node-format"; - std::string bos_format_key = "bos-format"; - std::string eos_format_key = "eos-format"; - std::string unk_format_key = "unk-format"; - std::string eon_format_key = "eon-format"; - - if (!ostyle.empty()) { - node_format_key += "-"; - node_format_key += ostyle; - bos_format_key += "-"; - bos_format_key += ostyle; - eos_format_key += "-"; - eos_format_key += ostyle; - unk_format_key += "-"; - unk_format_key += ostyle; - eon_format_key += "-"; - eon_format_key += ostyle; - const std::string tmp = param.get(node_format_key.c_str()); - CHECK_FALSE(!tmp.empty()) << "unkown format type [" << ostyle << "]"; - } +Viterbi::Viterbi() + : tokenizer_(0), connector_(0), + cost_factor_(0) {} - const std::string node_format2 = - param.get(node_format_key.c_str()); - const std::string bos_format2 = - param.get(bos_format_key.c_str()); - const std::string eos_format2 = - param.get(eos_format_key.c_str()); - const std::string unk_format2 = - param.get(unk_format_key.c_str()); - const std::string eon_format2 = - param.get(eon_format_key.c_str()); - - if (node_format != node_format2 || bos_format != bos_format2 || - eos_format != eos_format2 || unk_format != unk_format2) { - write_ = &Writer::writeUser; - if (node_format != node_format2) { - node_format = node_format2; - } - if (bos_format != bos_format2) { - bos_format = bos_format2; - } - if (eos_format != eos_format2) { - eos_format = eos_format2; - } - if (unk_format != unk_format2) { - unk_format = unk_format2; - } else if (node_format != node_format2) { - unk_format = node_format2; - } else { - unk_format = node_format; - } - if (eon_format != eon_format2) { - eon_format = eon_format2; - } - node_format_.reset_string(node_format.c_str()); - bos_format_.reset_string(bos_format.c_str()); - eos_format_.reset_string(eos_format.c_str()); - unk_format_.reset_string(unk_format.c_str()); - eon_format_.reset_string(eon_format.c_str()); - } +Viterbi::~Viterbi() {} + +bool Viterbi::open(const Param ¶m) { + tokenizer_.reset(new Tokenizer); + CHECK_FALSE(tokenizer_->open(param)) << tokenizer_->what(); + CHECK_FALSE(tokenizer_->dictionary_info()) << "Dictionary is empty"; + + connector_.reset(new Connector); + CHECK_FALSE(connector_->open(param)) << connector_->what(); + + CHECK_FALSE(tokenizer_->dictionary_info()->lsize == + connector_->left_size() && + tokenizer_->dictionary_info()->rsize == + connector_->right_size()) + << "Transition table and dictionary are not compatible"; + + cost_factor_ = param.get("cost-factor"); + if (cost_factor_ == 0) { + cost_factor_ = 800; } return true; } -bool Writer::write(Lattice *lattice, StringBuffer *os) const { - if (!lattice || !lattice->is_available()) { +bool Viterbi::analyze(Lattice *lattice) const { + if (!lattice || !lattice->sentence()) { return false; } - return (this->*write_)(lattice, os); -} -bool Writer::writeLattice(Lattice *lattice, StringBuffer *os) const { - for (const Node *node = lattice->bos_node()->next; - node->next; node = node->next) { - os->write(node->surface, node->length); - *os << '\t' << node->feature; // << '\t'; - *os << '\n'; + if (!initPartial(lattice)) { + return false; + } + + bool result = false; + if (lattice->has_request_type(MECAB_NBEST) || + lattice->has_request_type(MECAB_MARGINAL_PROB)) { + // IsAllPath=true + if (lattice->has_constraint()) { + result = viterbi(lattice); + } else { + result = viterbi(lattice); + } + } else { + // IsAllPath=false + if (lattice->has_constraint()) { + result = viterbi(lattice); + } else { + result = viterbi(lattice); + } } - *os << "EOS\n"; - return true; -} -bool Writer::writeWakati(Lattice *lattice, StringBuffer *os) const { - for (const Node *node = lattice->bos_node()->next; - node->next; node = node->next) { - os->write(node->surface, node->length); - *os << ' '; + if (!result) { + return false; } - *os << '\n'; + + if (!forwardbackward(lattice)) { + return false; + } + + if (!buildBestLattice(lattice)) { + return false; + } + + if (!buildAllLattice(lattice)) { + return false; + } + + if (!initNBest(lattice)) { + return false; + } + return true; } -bool Writer::writeNone(Lattice *lattice, StringBuffer *os) const { - return true; // do nothing +const Tokenizer *Viterbi::tokenizer() const { + return tokenizer_.get(); } -bool Writer::writeEM(Lattice *lattice, StringBuffer *os) const { - static const float min_prob = 0.0001; - for (const Node *node = lattice->bos_node(); node; node = node->next) { - if (node->prob >= min_prob) { - *os << "U\t"; - if (node->stat == MECAB_BOS_NODE) { - *os << "BOS"; - } else if (node->stat == MECAB_EOS_NODE) { - *os << "EOS"; - } else { - os->write(node->surface, node->length); - } - *os << '\t' << node->feature << '\t' << node->prob << '\n'; +const Connector *Viterbi::connector() const { + return connector_.get(); +} + +// static +bool Viterbi::forwardbackward(Lattice *lattice) { + if (!lattice->has_request_type(MECAB_MARGINAL_PROB)) { + return true; + } + + Node **end_node_list = lattice->end_nodes(); + Node **begin_node_list = lattice->begin_nodes(); + + const size_t len = lattice->size(); + const double theta = lattice->theta(); + + end_node_list[0]->alpha = 0.0; + for (int pos = 0; pos <= static_cast(len); ++pos) { + for (Node *node = begin_node_list[pos]; node; node = node->bnext) { + calc_alpha(node, theta); + } + } + + begin_node_list[len]->beta = 0.0; + for (int pos = static_cast(len); pos >= 0; --pos) { + for (Node *node = end_node_list[pos]; node; node = node->enext) { + calc_beta(node, theta); } - for (const Path *path = node->lpath; path; path = path->lnext) { - if (path->prob >= min_prob) { - *os << "B\t" << path->lnode->feature << '\t' - << node->feature << '\t' << path->prob << '\n'; + } + + const double Z = begin_node_list[len]->alpha; + lattice->set_Z(Z); // alpha of EOS + + for (int pos = 0; pos <= static_cast(len); ++pos) { + for (Node *node = begin_node_list[pos]; node; node = node->bnext) { + node->prob = std::exp(node->alpha + node->beta - Z); + for (Path *path = node->lpath; path; path = path->lnext) { + path->prob = std::exp(path->lnode->alpha + - theta * path->cost + + path->rnode->beta - Z); } } } - *os << "EOS\n"; + return true; } -bool Writer::writeDump(Lattice *lattice, StringBuffer *os) const { - const char *str = lattice->sentence(); - for (const Node *node = lattice->bos_node(); node; node = node->next) { - *os << node->id << ' '; - if (node->stat == MECAB_BOS_NODE) { - *os << "BOS"; - } else if (node->stat == MECAB_EOS_NODE) { - *os << "EOS"; - } else { - os->write(node->surface, node->length); +// static +bool Viterbi::buildResultForNBest(Lattice *lattice) { + return buildAllLattice(lattice); +} + +// static +bool Viterbi::buildAllLattice(Lattice *lattice) { + if (!lattice->has_request_type(MECAB_ALL_MORPHS)) { + return true; + } + + Node *prev = lattice->bos_node(); + const size_t len = lattice->size(); + Node **begin_node_list = lattice->begin_nodes(); + + for (long pos = 0; pos <= static_cast(len); ++pos) { + for (Node *node = begin_node_list[pos]; node; node = node->bnext) { + prev->next = node; + node->prev = prev; + prev = node; } + } + + return true; +} + +// static +bool Viterbi::buildAlternative(Lattice *lattice) { + Node **begin_node_list = lattice->begin_nodes(); - *os << ' ' << node->feature - << ' ' << static_cast(node->surface - str) - << ' ' << static_cast(node->surface - str + node->length) - << ' ' << node->rcAttr - << ' ' << node->lcAttr - << ' ' << node->posid - << ' ' << static_cast(node->char_type) - << ' ' << static_cast(node->stat) - << ' ' << static_cast(node->isbest) - << ' ' << node->alpha - << ' ' << node->beta - << ' ' << node->prob - << ' ' << node->cost; - - for (const Path *path = node->lpath; path; path = path->lnext) { - *os << ' ' << path->lnode->id << ':' << path->cost << ':' << path->prob; + const Node *bos_node = lattice->bos_node(); + for (const Node *node = bos_node; node; node = node->next) { + if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE) { + continue; + } + const size_t pos = node->surface - lattice->sentence() - + node->rlength + node->length; + std::cout.write(node->surface, node->length); + std::cout << "\t" << node->feature << std::endl; + for (const Node *anode = begin_node_list[pos]; + anode; anode = anode->bnext) { + if (anode->rlength == node->rlength && + anode->length == node->length) { + std::cout << "@ "; + std::cout.write(anode->surface, anode->length); + std::cout << "\t" << anode->feature << std::endl; + } } - *os << '\n'; } + + std::cout << "EOS" << std::endl; + return true; } -bool Writer::writeUser(Lattice *lattice, StringBuffer *os) const { - if (!writeNode(lattice, bos_format_.get(), lattice->bos_node(), os)) { - return false; +// static +bool Viterbi::buildBestLattice(Lattice *lattice) { + Node *node = lattice->eos_node(); + for (Node *prev_node; node->prev;) { + node->isbest = 1; + prev_node = node->prev; + prev_node->next = node; + node = prev_node; } - const Node *node = 0; - for (node = lattice->bos_node()->next; node->next; node = node->next) { - const char *fmt = (node->stat == MECAB_UNK_NODE ? unk_format_.get() : - node_format_.get()); - if (!writeNode(lattice, fmt, node, os)) { - return false; + + return true; +} + +// static +bool Viterbi::initNBest(Lattice *lattice) { + if (!lattice->has_request_type(MECAB_NBEST)) { + return true; + } + lattice->allocator()->nbest_generator()->set(lattice); + return true; +} + +// static +bool Viterbi::initPartial(Lattice *lattice) { + if (!lattice->has_request_type(MECAB_PARTIAL)) { + if (lattice->has_constraint()) { + lattice->set_boundary_constraint(0, MECAB_TOKEN_BOUNDARY); + lattice->set_boundary_constraint(lattice->size(), + MECAB_TOKEN_BOUNDARY); } + return true; } - if (!writeNode(lattice, eos_format_.get(), node, os)) { - return false; + + Allocator *allocator = lattice->allocator(); + char *str = allocator->partial_buffer(lattice->size() + 1); + strncpy(str, lattice->sentence(), lattice->size() + 1); + + std::vector lines; + const size_t lsize = tokenize(str, "\n", + std::back_inserter(lines), + lattice->size() + 1); + char* column[2]; + scoped_array buf(new char[lattice->size() + 1]); + StringBuffer os(buf.get(), lattice->size() + 1); + + std::vector > tokens; + tokens.reserve(lsize); + + size_t pos = 0; + for (size_t i = 0; i < lsize; ++i) { + const size_t size = tokenize(lines[i], "\t", column, 2); + if (size == 1 && std::strcmp(column[0], "EOS") == 0) { + break; + } + const size_t len = std::strlen(column[0]); + if (size == 2) { + tokens.push_back(std::make_pair(column[0], column[1])); + } else { + tokens.push_back(std::make_pair(column[0], reinterpret_cast(0))); + } + os << column[0]; + pos += len; + } + + os << '\0'; + + lattice->set_sentence(os.str()); + + pos = 0; + for (size_t i = 0; i < tokens.size(); ++i) { + const char *surface = tokens[i].first; + const char *feature = tokens[i].second; + const size_t len = std::strlen(surface); + lattice->set_boundary_constraint(pos, MECAB_TOKEN_BOUNDARY); + lattice->set_boundary_constraint(pos + len, MECAB_TOKEN_BOUNDARY); + if (feature) { + lattice->set_feature_constraint(pos, pos + len, feature); + for (size_t n = 1; n < len; ++n) { + lattice->set_boundary_constraint(pos + n, + MECAB_INSIDE_TOKEN); + } + } + pos += len; } + return true; } -bool Writer::writeNode(Lattice *lattice, const Node *node, - StringBuffer *os) const { - switch (node->stat) { - case MECAB_BOS_NODE: - return writeNode(lattice, bos_format_.get(), node, os); - case MECAB_EOS_NODE: - return writeNode(lattice, eos_format_.get(), node, os); - case MECAB_UNK_NODE: - return writeNode(lattice, unk_format_.get(), node, os); - case MECAB_NOR_NODE: - return writeNode(lattice, node_format_.get(), node, os); - case MECAB_EON_NODE: - return writeNode(lattice, eon_format_.get(), node, os); +namespace { +template bool connect(size_t pos, Node *rnode, + Node **begin_node_list, + Node **end_node_list, + const Connector *connector, + Allocator *allocator) { + for (;rnode; rnode = rnode->bnext) { + long best_cost = 2147483647; + Node* best_node = 0; + for (Node *lnode = end_node_list[pos]; lnode; lnode = lnode->enext) { + int lcost = connector->cost(lnode, rnode); // local cost + long cost = lnode->cost + lcost; + + if (cost < best_cost) { + best_node = lnode; + best_cost = cost; + } + + if (IsAllPath) { + Path *path = allocator->newPath(); + path->cost = lcost; + path->rnode = rnode; + path->lnode = lnode; + path->lnext = rnode->lpath; + rnode->lpath = path; + path->rnext = lnode->rpath; + lnode->rpath = path; + } + } + + // overflow check 2003/03/09 + if (!best_node) { + return false; + } + + rnode->prev = best_node; + rnode->next = 0; + rnode->cost = best_cost; + const size_t x = rnode->rlength + pos; + rnode->enext = end_node_list[x]; + end_node_list[x] = rnode; } + return true; } +} // namespace + +template +bool Viterbi::viterbi(Lattice *lattice) const { + Node **end_node_list = lattice->end_nodes(); + Node **begin_node_list = lattice->begin_nodes(); + Allocator *allocator = lattice->allocator(); + const size_t len = lattice->size(); + const char *begin = lattice->sentence(); + const char *end = begin + len; + + Node *bos_node = tokenizer_->getBOSNode(lattice->allocator()); + bos_node->surface = lattice->sentence(); + end_node_list[0] = bos_node; + + for (size_t pos = 0; pos < len; ++pos) { + if (end_node_list[pos]) { + Node *right_node = tokenizer_->lookup(begin + pos, end, + allocator, lattice); + begin_node_list[pos] = right_node; + if (!connect(pos, right_node, + begin_node_list, + end_node_list, + connector_.get(), + allocator)) { + lattice->set_what("too long sentence."); + return false; + } + } + } + + Node *eos_node = tokenizer_->getEOSNode(lattice->allocator()); + eos_node->surface = lattice->sentence() + lattice->size(); + begin_node_list[lattice->size()] = eos_node; -bool Writer::writeNode(Lattice *lattice, - const char *p, - const Node *node, - StringBuffer *os) const { - scoped_fixed_array buf; - scoped_fixed_array ptr; - size_t psize = 0; - - for (; *p; p++) { - switch (*p) { - default: *os << *p; break; - - case '\\': *os << getEscapedChar(*++p); break; - - case '%': { // macros - switch (*++p) { - default: { - const std::string error = "unknown meta char: " + *p; - lattice->set_what(error.c_str()); - return false; - } - // input sentence - case 'S': os->write(lattice->sentence(), lattice->size()); break; - // sentence length - case 'L': *os << lattice->size(); break; - // morph - case 'm': os->write(node->surface, node->length); break; - case 'M': os->write(reinterpret_cast - (node->surface - node->rlength + node->length), - node->rlength); - break; - case 'h': *os << node->posid; break; // Part-Of-Speech ID - case '%': *os << '%'; break; // % - case 'c': *os << static_cast(node->wcost); break; // word cost - case 'H': *os << node->feature; break; - case 't': *os << static_cast(node->char_type); break; - case 's': *os << static_cast(node->stat); break; - case 'P': *os << node->prob; break; - case 'p': { - switch (*++p) { - default: - lattice->set_what("[iseSCwcnblLh] is required after %p"); - return false; - case 'i': *os << node->id; break; // node id - case 'S': os->write(reinterpret_cast - (node->surface - - node->rlength + node->length), - node->rlength - node->length); - break; // space - // start position - case 's': *os << static_cast( - node->surface - lattice->sentence()); - break; - // end position - case 'e': *os << static_cast - (node->surface - lattice->sentence() + node->length); - break; - // connection cost - case 'C': *os << node->cost - - node->prev->cost - node->wcost; - break; - case 'w': *os << node->wcost; break; // word cost - case 'c': *os << node->cost; break; // best cost - case 'n': *os << (node->cost - node->prev->cost); break; - // node cost - // * if best path, otherwise ' ' - case 'b': *os << (node->isbest ? '*' : ' '); break; - case 'P': *os << node->prob; break; - case 'A': *os << node->alpha; break; - case 'B': *os << node->beta; break; - case 'l': *os << node->length; break; // length of morph - // length of morph including the spaces - case 'L': *os << node->rlength; break; - case 'h': { // Hidden Layer ID - switch (*++p) { - default: - lattice->set_what("lr is required after %ph"); - return false; - case 'l': *os << node->lcAttr; break; // current - case 'r': *os << node->rcAttr; break; // prev - } - } break; - - case 'p': { - char mode = *++p; - char sep = *++p; - if (sep == '\\') { - sep = getEscapedChar(*++p); - } - if (!node->lpath) { - lattice->set_what("no path information is available"); - return false; - } - for (Path *path = node->lpath; path; path = path->lnext) { - if (path != node->lpath) *os << sep; - switch (mode) { - case 'i': *os << path->lnode->id; break; - case 'c': *os << path->cost; break; - case 'P': *os << path->prob; break; - default: - lattice->set_what("[icP] is required after %pp"); - return false; - } - } - } break; - - } - } break; - - case 'F': - case 'f': { - if (node->feature[0] == '\0') { - lattice->set_what("no feature information available"); - return false; - } - if (!psize) { - std::strncpy(buf.get(), node->feature, buf.size()); - psize = tokenizeCSV(buf.get(), ptr.get(), ptr.size()); - } - - // separator - char separator = '\t'; // default separator - if (*p == 'F') { // change separator - if (*++p == '\\') { - separator = getEscapedChar(*++p); - } else { - separator = *p; - } - } - - if (*++p !='[') { - lattice->set_what("cannot find '['"); - return false; - } - size_t n = 0; - bool sep = false; - bool isfil = false; - p++; - - for (;; ++p) { - switch (*p) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - n = 10 * n +(*p - '0'); - break; - case ',': case ']': - if (n >= psize) { - lattice->set_what("given index is out of range"); - return false; - } - isfil = (ptr[n][0] != '*'); - if (isfil) { - if (sep) { - *os << separator; - } - *os << ptr[n]; - } - if (*p == ']') { - goto last; - } - sep = isfil; - n = 0; - break; - default: - lattice->set_what("cannot find ']'"); - return false; - } - } - } last: break; - } // end switch - } break; // end case '%' - } // end switch + for (long pos = len; static_cast(pos) >= 0; --pos) { + if (end_node_list[pos]) { + if (!connect(pos, eos_node, + begin_node_list, + end_node_list, + connector_.get(), + allocator)) { + lattice->set_what("too long sentence."); + return false; + } + break; + } } + end_node_list[0] = bos_node; + begin_node_list[lattice->size()] = eos_node; + return true; } -} \ No newline at end of file +} // Mecab \ No newline at end of file