#ifndef CPPJIEBA_MPSEGMENT_H #define CPPJIEBA_MPSEGMENT_H #include #include #include #include "limonp/Logging.hpp" #include "DictTrie.hpp" #include "SegmentTagged.hpp" #include "PosTagger.hpp" namespace cppjieba { class MPSegment: public SegmentTagged { public: MPSegment(const string& dictPath, const string& userDictPath = "") : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { } MPSegment(const DictTrie* dictTrie) : dictTrie_(dictTrie), isNeedDestroy_(false) { assert(dictTrie_); } ~MPSegment() { if (isNeedDestroy_) { delete dictTrie_; } } void Cut(const string& sentence, vector& words) const { Cut(sentence, words, MAX_WORD_LENGTH); } void Cut(const string& sentence, vector& words, size_t max_word_len) const { vector tmp; Cut(sentence, tmp, max_word_len); GetStringsFromWords(tmp, words); } void Cut(const string& sentence, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); Cut(range.begin, range.end, wrs, max_word_len); } words.clear(); words.reserve(wrs.size()); GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { vector dags; dictTrie_->Find(begin, end, dags, max_word_len); CalcDP(dags); CutByDag(begin, end, dags, words); } const DictTrie* GetDictTrie() const { return dictTrie_; } bool Tag(const string& src, vector >& res) const { return tagger_.Tag(src, res, *this); } bool IsUserDictSingleChineseWord(const Rune& value) const { return dictTrie_->IsUserDictSingleChineseWord(value); } private: void CalcDP(vector& dags) const { size_t nextPos; const DictUnit* p; double val; for (vector::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) { rit->pInfo = NULL; rit->weight = MIN_DOUBLE; assert(!rit->nexts.empty()); for (LocalVector >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { nextPos = it->first; p = it->second; val = 0.0; if (nextPos + 1 < dags.size()) { val += dags[nextPos + 1].weight; } if (p) { val += p->weight; } else { val += dictTrie_->GetMinWeight(); } if (val > rit->weight) { rit->pInfo = p; rit->weight = val; } } } } void CutByDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, const vector& dags, vector& words) const { size_t i = 0; while (i < dags.size()) { const DictUnit* p = dags[i].pInfo; if (p) { assert(p->word.size() >= 1); WordRange wr(begin + i, begin + i + p->word.size() - 1); words.push_back(wr); i += p->word.size(); } else { //single chinese word WordRange wr(begin + i, begin + i); words.push_back(wr); i++; } } } const DictTrie* dictTrie_; bool isNeedDestroy_; PosTagger tagger_; }; // class MPSegment } // namespace cppjieba #endif