commit b9de6787036c935a33125b866ea2f608f5d4f639 Author: hsp <835110287@qq.com> Date: Fri Jul 31 14:50:46 2020 +0800 first commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..a5a2bfe --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# qt虚拟输入法,可输入中文,可用于ARM-LINUX + +1. 先编译googlepinyin(注意选择release),生成libgooglepinyin.a,将该文件放到plugin/googlepinyin文件夹下 +2. 编译plugin,生成libtgtsmlInputContextPlugin.so,(注意不要复制链接文件,终端输入ls -la查看链接文件),将该文件复制到开发板qt的plugins/platforminputcontexts文件夹下,若无platforminputcontexts文件夹,则手动建立 +3. 将plugin下的dict文件夹放到工程编译生成的APP同一目录下 +4. 在工程的main.cpp的主函数中添加qputenv("QT_IM_MODULE", QByteArray("tgtsml"));,注意放到QApplication a(argc, argv);之前 +5. 运行APP,点击文本框,即可弹出输入法 + +教程链接: + + https://blog.csdn.net/qq_32605451/article/details/107705710 + +原链接: + + https://gitee.com/smartwell/QtInputMethod_GooglePinyin?_from=gitee_search diff --git a/googlepinyin/atomdictbase.h b/googlepinyin/atomdictbase.h new file mode 100644 index 0000000..0a70a51 --- /dev/null +++ b/googlepinyin/atomdictbase.h @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This file defines the AtomDictBase class, which is the base class for all atom + * dictionaries. Atom dictionaries are managed by the decoder class + * MatrixSearch. 
+ * + * When the user appends a new character to the Pinyin string, all enabled atom + * dictionaries' extend_dict() will be called at least once to get candidates + * ended in this step (the information of starting step is also given in the + * parameter). Usually, when extend_dict() is called, a MileStoneHandle object + * returned by a previous calling for a earlier step is given to speed up the + * look-up process, and a new MileStoneHandle object will be returned if + * the extension is successful. + * + * A returned MileStoneHandle object should keep alive until Function + * reset_milestones() is called and this object is noticed to be reset. + * + * Usually, the atom dictionary can use step information to manage its + * MileStoneHandle objects, or it can make the objects in ascendant order to + * make the reset easier. + * + * When the decoder loads the dictionary, it will give a starting lemma id for + * this atom dictionary to map a inner id to a global id. Global ids should be + * used when an atom dictionary talks to any component outside. + */ +#ifndef PINYINIME_INCLUDE_ATOMDICTBASE_H__ +#define PINYINIME_INCLUDE_ATOMDICTBASE_H__ + +#include +#include "./dictdef.h" +#include "./searchutility.h" + +namespace ime_pinyin { +class AtomDictBase { + public: + virtual ~AtomDictBase() {} + + /** + * Load an atom dictionary from a file. + * + * @param file_name The file name to load dictionary. + * @param start_id The starting id used for this atom dictionary. + * @param end_id The end id (included) which can be used for this atom + * dictionary. User dictionary will always use the last id space, so it can + * ignore this paramter. All other atom dictionaries should check this + * parameter. + * @return True if succeed. + */ + virtual bool load_dict(const char *file_name, LemmaIdType start_id, + LemmaIdType end_id) = 0; + + /** + * Close this atom dictionary. + * + * @return True if succeed. 
+ */ + virtual bool close_dict() = 0; + + /** + * Get the total number of lemmas in this atom dictionary. + * + * @return The total number of lemmas. + */ + virtual size_t number_of_lemmas() = 0; + + /** + * This function is called by the decoder when user deletes a character from + * the input string, or begins a new input string. + * + * Different atom dictionaries may implement this function in different way. + * an atom dictionary can use one of these two parameters (or both) to reset + * its corresponding MileStoneHandle objects according its detailed + * implementation. + * + * For example, if an atom dictionary uses step information to manage its + * MileStoneHandle objects, parameter from_step can be used to identify which + * objects should be reset; otherwise, if another atom dictionary does not + * use the detailed step information, it only uses ascendant handles + * (according to step. For the same step, earlier call, smaller handle), it + * can easily reset those MileStoneHandle which are larger than from_handle. + * + * The decoder always reset the decoding state by step. So when it begins + * resetting, it will call reset_milestones() of its atom dictionaries with + * the step information, and the MileStoneHandle objects returned by the + * earliest calling of extend_dict() for that step. + * + * If an atom dictionary does not implement incremental search, this function + * can be totally ignored. + * + * @param from_step From which step(included) the MileStoneHandle + * objects should be reset. + * @param from_handle The ealiest MileStoneHandle object for step from_step + */ + virtual void reset_milestones(uint16 from_step, + MileStoneHandle from_handle) = 0; + + /** + * Used to extend in this dictionary. The handle returned should keep valid + * until reset_milestones() is called. + * + * @param from_handle Its previous returned extended handle without the new + * spelling id, it can be used to speed up the extending. 
+ * @param dep The paramter used for extending. + * @param lpi_items Used to fill in the lemmas matched. + * @param lpi_max The length of the buffer + * @param lpi_num Used to return the newly added items. + * @return The new mile stone for this extending. 0 if fail. + */ + virtual MileStoneHandle extend_dict(MileStoneHandle from_handle, + const DictExtPara *dep, + LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num) = 0; + + /** + * Get lemma items with scores according to a spelling id stream. + * This atom dictionary does not need to sort the returned items. + * + * @param splid_str The spelling id stream buffer. + * @param splid_str_len The length of the spelling id stream buffer. + * @param lpi_items Used to return matched lemma items with scores. + * @param lpi_max The maximum size of the buffer to return result. + * @return The number of matched items which have been filled in to lpi_items. + */ + virtual size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, + LmaPsbItem *lpi_items, size_t lpi_max) = 0; + + /** + * Get a lemma string (The Chinese string) by the given lemma id. + * + * @param id_lemma The lemma id to get the string. + * @param str_buf The buffer to return the Chinese string. + * @param str_max The maximum size of the buffer. + * @return The length of the string, 0 if fail. + */ + virtual uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, + uint16 str_max) = 0; + + /** + * Get the full spelling ids for the given lemma id. + * If the given buffer is too short, return 0. + * + * @param splids Used to return the spelling ids. + * @param splids_max The maximum buffer length of splids. + * @param arg_valid Used to indicate if the incoming parameters have been + * initialized are valid. If it is true, the splids and splids_max are valid + * and there may be half ids in splids to be updated to full ids. In this + * case, splids_max is the number of valid ids in splids. + * @return The number of ids in the buffer. 
+ */ + virtual uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, + uint16 splids_max, bool arg_valid) = 0; + + /** + * Function used for prediction. + * No need to sort the newly added items. + * + * @param last_hzs The last n Chinese chracters(called Hanzi), its length + * should be less than or equal to kMaxPredictSize. + * @param hzs_len specifies the length(<= kMaxPredictSize) of the history. + * @param npre_items Used used to return the result. + * @param npre_max The length of the buffer to return result + * @param b4_used Number of prediction result (from npre_items[-b4_used]) + * from other atom dictionaries. A atom ditionary can just ignore it. + * @return The number of prediction result from this atom dictionary. + */ + virtual size_t predict(const char16 last_hzs[], uint16 hzs_len, + NPredictItem *npre_items, size_t npre_max, + size_t b4_used) = 0; + + /** + * Add a lemma to the dictionary. If the dictionary allows to add new + * items and this item does not exist, add it. + * + * @param lemma_str The Chinese string of the lemma. + * @param splids The spelling ids of the lemma. + * @param lemma_len The length of the Chinese lemma. + * @param count The frequency count for this lemma. + */ + virtual LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], + uint16 lemma_len, uint16 count) = 0; + + /** + * Update a lemma's occuring count. + * + * @param lemma_id The lemma id to update. + * @param delta_count The frequnecy count to ajust. + * @param selected Indicate whether this lemma is selected by user and + * submitted to target edit box. + * @return The id if succeed, 0 if fail. + */ + virtual LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, + bool selected) = 0; + + /** + * Get the lemma id for the given lemma. + * + * @param lemma_str The Chinese string of the lemma. + * @param splids The spelling ids of the lemma. + * @param lemma_len The length of the lemma. + * @return The matched lemma id, or 0 if fail. 
+ */ + virtual LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], + uint16 lemma_len) = 0; + + /** + * Get the lemma score. + * + * @param lemma_id The lemma id to get score. + * @return The score of the lemma, or 0 if fail. + */ + virtual LmaScoreType get_lemma_score(LemmaIdType lemma_id) = 0; + + /** + * Get the lemma score. + * + * @param lemma_str The Chinese string of the lemma. + * @param splids The spelling ids of the lemma. + * @param lemma_len The length of the lemma. + * @return The score of the lamm, or 0 if fail. + */ + virtual LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], + uint16 lemma_len) = 0; + + /** + * If the dictionary allowed, remove a lemma from it. + * + * @param lemma_id The id of the lemma to remove. + * @return True if succeed. + */ + virtual bool remove_lemma(LemmaIdType lemma_id) = 0; + + /** + * Get the total occuring count of this atom dictionary. + * + * @return The total occuring count of this atom dictionary. + */ + virtual size_t get_total_lemma_count() = 0; + + /** + * Set the total occuring count of other atom dictionaries. + * + * @param count The total occuring count of other atom dictionaies. + */ + virtual void set_total_lemma_count_of_others(size_t count) = 0; + + /** + * Notify this atom dictionary to flush the cached data to persistent storage + * if necessary. + */ + virtual void flush_cache() = 0; +}; +} + +#endif // PINYINIME_INCLUDE_ATOMDICTBASE_H__ diff --git a/googlepinyin/dictbuilder.cpp b/googlepinyin/dictbuilder.cpp new file mode 100644 index 0000000..0f790b1 --- /dev/null +++ b/googlepinyin/dictbuilder.cpp @@ -0,0 +1,1070 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "dictbuilder.h" +#include "dicttrie.h" +#include "mystdlib.h" +#include "ngram.h" +#include "searchutility.h" +#include "spellingtable.h" +#include "spellingtrie.h" +#include "splparser.h" +#include "utf16reader.h" + +namespace ime_pinyin { + +#ifdef ___BUILD_MODEL___ + +static const size_t kReadBufLen = 512; +static const size_t kSplTableHashLen = 2000; + +// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by +// frequencies. +int cmp_scis_hz_splid_freq(const void* p1, const void* p2) { + const SingleCharItem *s1, *s2; + s1 = static_cast(p1); + s2 = static_cast(p2); + + if (s1->hz < s2->hz) + return -1; + if (s1->hz > s2->hz) + return 1; + + if (s1->splid.half_splid < s2->splid.half_splid) + return -1; + if (s1->splid.half_splid > s2->splid.half_splid) + return 1; + + if (s1->splid.full_splid < s2->splid.full_splid) + return -1; + if (s1->splid.full_splid > s2->splid.full_splid) + return 1; + + if (s1->freq > s2->freq) + return -1; + if (s1->freq < s2->freq) + return 1; + return 0; +} + +int cmp_scis_hz_splid(const void* p1, const void* p2) { + const SingleCharItem *s1, *s2; + s1 = static_cast(p1); + s2 = static_cast(p2); + + if (s1->hz < s2->hz) + return -1; + if (s1->hz > s2->hz) + return 1; + + if (s1->splid.half_splid < s2->splid.half_splid) + return -1; + if (s1->splid.half_splid > s2->splid.half_splid) + return 1; + + if (s1->splid.full_splid < s2->splid.full_splid) + return -1; + if (s1->splid.full_splid > s2->splid.full_splid) + return 1; + + return 0; +} + 
+int cmp_lemma_entry_hzs(const void* p1, const void* p2) { + size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); + size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); + if (size1 < size2) + return -1; + else if (size1 > size2) + return 1; + + return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, + ((const LemmaEntry*)p2)->hanzi_str); +} + +int compare_char16(const void* p1, const void* p2) { + if (*((const char16*)p1) < *((const char16*)p2)) + return -1; + if (*((const char16*)p1) > *((const char16*)p2)) + return 1; + return 0; +} + +int compare_py(const void* p1, const void* p2) { + int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, + ((const LemmaEntry*)p2)->spl_idx_arr); + + if (0 != ret) + return ret; + + return static_cast(((const LemmaEntry*)p2)->freq) - + static_cast(((const LemmaEntry*)p1)->freq); +} + +// First hanzi, if the same, then Pinyin +int cmp_lemma_entry_hzspys(const void* p1, const void* p2) { + size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); + size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); + if (size1 < size2) + return -1; + else if (size1 > size2) + return 1; + int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, + ((const LemmaEntry*)p2)->hanzi_str); + + if (0 != ret) + return ret; + + ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, + ((const LemmaEntry*)p2)->spl_idx_arr); + return ret; +} + +int compare_splid2(const void* p1, const void* p2) { + int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, + ((const LemmaEntry*)p2)->spl_idx_arr); + return ret; +} + +DictBuilder::DictBuilder() { + lemma_arr_ = NULL; + lemma_num_ = 0; + + scis_ = NULL; + scis_num_ = 0; + + lma_nodes_le0_ = NULL; + lma_nodes_ge1_ = NULL; + + lma_nds_used_num_le0_ = 0; + lma_nds_used_num_ge1_ = 0; + + homo_idx_buf_ = NULL; + homo_idx_num_eq1_ = 0; + homo_idx_num_gt1_ = 0; + + top_lmas_ = NULL; + top_lmas_num_ = 0; + + spl_table_ = NULL; + spl_parser_ = NULL; +} + 
+DictBuilder::~DictBuilder() { + free_resource(); +} + +bool DictBuilder::alloc_resource(size_t lma_num) { + if (0 == lma_num) + return false; + + free_resource(); + + lemma_num_ = lma_num; + lemma_arr_ = new LemmaEntry[lemma_num_]; + + top_lmas_num_ = 0; + top_lmas_ = new LemmaEntry[kTopScoreLemmaNum]; + + // New the scis_ buffer to the possible maximum size. + scis_num_ = lemma_num_ * kMaxLemmaSize; + scis_ = new SingleCharItem[scis_num_]; + + // The root and first level nodes is less than kMaxSpellingNum + 1 + lma_nds_used_num_le0_ = 0; + lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1]; + + // Other nodes is less than lemma_num + lma_nds_used_num_ge1_ = 0; + lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_]; + + homo_idx_buf_ = new LemmaIdType[lemma_num_]; + spl_table_ = new SpellingTable(); + spl_parser_ = new SpellingParser(); + + if (NULL == lemma_arr_ || NULL == top_lmas_ || + NULL == scis_ || NULL == spl_table_ || + NULL == spl_parser_ || NULL == lma_nodes_le0_ || + NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) { + free_resource(); + return false; + } + + memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_); + memset(scis_, 0, sizeof(SingleCharItem) * scis_num_); + memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1)); + memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_); + memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_); + spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true); + + return true; +} + +char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) { + if (NULL == fn_validhzs || NULL == num) + return NULL; + + *num = 0; + FILE *fp = fopen(fn_validhzs, "rb"); + if (NULL == fp) + return NULL; + + char16 utf16header; + if (fread(&utf16header, sizeof(char16), 1, fp) != 1 || + 0xfeff != utf16header) { + fclose(fp); + return NULL; + } + + fseek(fp, 0, SEEK_END); + *num = ftell(fp) / sizeof(char16); + assert(*num >= 1); + *num -= 1; + + char16 *hzs = new char16[*num]; + if (NULL == hzs) { 
+ fclose(fp); + return NULL; + } + + fseek(fp, 2, SEEK_SET); + + if (fread(hzs, sizeof(char16), *num, fp) != *num) { + fclose(fp); + delete [] hzs; + return NULL; + } + fclose(fp); + + myqsort(hzs, *num, sizeof(char16), compare_char16); + return hzs; +} + +bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, + char16 hz) { + if (NULL == hzs) + return false; + + char16 *found; + found = static_cast( + mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16)); + if (NULL == found) + return false; + + assert(*found == hz); + return true; +} + +// The caller makes sure that the parameters are valid. +bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len, + const char16 *str, size_t str_len) { + if (NULL == hzs || NULL == str) + return false; + + for (size_t pos = 0; pos < str_len; pos++) { + if (!hz_in_hanzis_list(hzs, hzs_len, str[pos])) + return false; + } + return true; +} + +void DictBuilder::get_top_lemmas() { + top_lmas_num_ = 0; + if (NULL == lemma_arr_) + return; + + for (size_t pos = 0; pos < lemma_num_; pos++) { + if (0 == top_lmas_num_) { + top_lmas_[0] = lemma_arr_[pos]; + top_lmas_num_ = 1; + continue; + } + + if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) { + if (kTopScoreLemmaNum > top_lmas_num_) + top_lmas_num_ += 1; + + size_t move_pos; + for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) { + top_lmas_[move_pos] = top_lmas_[move_pos - 1]; + if (0 == move_pos - 1 || + (move_pos - 1 > 0 && + top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) { + break; + } + } + assert(move_pos > 0); + top_lmas_[move_pos - 1] = lemma_arr_[pos]; + } else if (kTopScoreLemmaNum > top_lmas_num_) { + top_lmas_[top_lmas_num_] = lemma_arr_[pos]; + top_lmas_num_ += 1; + } + } + + if (kPrintDebug0) { + printf("\n------Top Lemmas------------------\n"); + for (size_t pos = 0; pos < top_lmas_num_; pos++) { + printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz, + top_lmas_[pos].freq); + } + } +} + 
+void DictBuilder::free_resource() { + if (NULL != lemma_arr_) + delete [] lemma_arr_; + + if (NULL != scis_) + delete [] scis_; + + if (NULL != lma_nodes_le0_) + delete [] lma_nodes_le0_; + + if (NULL != lma_nodes_ge1_) + delete [] lma_nodes_ge1_; + + if (NULL != homo_idx_buf_) + delete [] homo_idx_buf_; + + if (NULL != spl_table_) + delete spl_table_; + + if (NULL != spl_parser_) + delete spl_parser_; + + lemma_arr_ = NULL; + scis_ = NULL; + lma_nodes_le0_ = NULL; + lma_nodes_ge1_ = NULL; + homo_idx_buf_ = NULL; + spl_table_ = NULL; + spl_parser_ = NULL; + + lemma_num_ = 0; + lma_nds_used_num_le0_ = 0; + lma_nds_used_num_ge1_ = 0; + homo_idx_num_eq1_ = 0; + homo_idx_num_gt1_ = 0; +} + +size_t DictBuilder::read_raw_dict(const char* fn_raw, + const char *fn_validhzs, + size_t max_item) { + if (NULL == fn_raw) return 0; + + Utf16Reader utf16_reader; + if (!utf16_reader.open(fn_raw, kReadBufLen * 10)) + return false; + + char16 read_buf[kReadBufLen]; + + // Read the number of lemmas in the file + size_t lemma_num = 240000; + + // allocate resource required + if (!alloc_resource(lemma_num)) { + utf16_reader.close(); + } + + // Read the valid Hanzi list. 
+ char16 *valid_hzs = NULL; + size_t valid_hzs_num = 0; + valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num); + + // Begin reading the lemma entries + for (size_t i = 0; i < max_item; i++) { + // read next entry + if (!utf16_reader.readline(read_buf, kReadBufLen)) { + lemma_num = i; + break; + } + + size_t token_size; + char16 *token; + char16 *to_tokenize = read_buf; + + // Get the Hanzi string + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (NULL == token) { + free_resource(); + utf16_reader.close(); + return false; + } + + size_t lemma_size = utf16_strlen(token); + + if (lemma_size > kMaxLemmaSize) { + i--; + continue; + } + + if (lemma_size > 4) { + i--; + continue; + } + + // Copy to the lemma entry + utf16_strcpy(lemma_arr_[i].hanzi_str, token); + + lemma_arr_[i].hz_str_len = token_size; + + // Get the freq string + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (NULL == token) { + free_resource(); + utf16_reader.close(); + return false; + } + lemma_arr_[i].freq = utf16_atof(token); + + if (lemma_size > 1 && lemma_arr_[i].freq < 60) { + i--; + continue; + } + + // Get GBK mark, if no valid Hanzi list available, all items which contains + // GBK characters will be discarded. Otherwise, all items which contains + // characters outside of the valid Hanzi list will be discarded. 
+ token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + assert(NULL != token); + int gbk_flag = utf16_atoi(token); + if (NULL == valid_hzs || 0 == valid_hzs_num) { + if (0 != gbk_flag) { + i--; + continue; + } + } else { + if (!str_in_hanzis_list(valid_hzs, valid_hzs_num, + lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) { + i--; + continue; + } + } + + // Get spelling String + bool spelling_not_support = false; + for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; + hz_pos++) { + // Get a Pinyin + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (NULL == token) { + free_resource(); + utf16_reader.close(); + return false; + } + + assert(utf16_strlen(token) <= kMaxPinyinSize); + + utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token); + + format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]); + + // Put the pinyin to the spelling table + if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos], + lemma_arr_[i].freq)) { + spelling_not_support = true; + break; + } + } + + // The whole line must have been parsed fully, otherwise discard this one. + token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); + if (spelling_not_support || NULL != token) { + i--; + continue; + } + } + + delete [] valid_hzs; + utf16_reader.close(); + + printf("read successfully, lemma num: %zd\n", lemma_num); + + return lemma_num; +} + +bool DictBuilder::build_dict(const char *fn_raw, + const char *fn_validhzs, + DictTrie *dict_trie) { + if (NULL == fn_raw || NULL == dict_trie) + return false; + + lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000); + if (0 == lemma_num_) + return false; + + // Arrange the spelling table, and build a spelling tree + // The size of an spelling. '\0' is included. If the spelling table is + // initialized to calculate the spelling scores, the last char in the + // spelling string will be score, and it is also included in spl_item_size. 
+ size_t spl_item_size; + size_t spl_num; + const char* spl_buf; + spl_buf = spl_table_->arrange(&spl_item_size, &spl_num); + if (NULL == spl_buf) { + free_resource(); + return false; + } + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + + if (!spl_trie.construct(spl_buf, spl_item_size, spl_num, + spl_table_->get_score_amplifier(), + spl_table_->get_average_score())) { + free_resource(); + return false; + } + + printf("spelling tree construct successfully.\n"); + + // Convert the spelling string to idxs + for (size_t i = 0; i < lemma_num_; i++) { + for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; + hz_pos++) { + uint16 spl_idxs[2]; + uint16 spl_start_pos[3]; + bool is_pre = true; + int spl_idx_num = + spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos], + strlen(lemma_arr_[i].pinyin_str[hz_pos]), + spl_idxs, spl_start_pos, 2, is_pre); + assert(1 == spl_idx_num); + + if (spl_trie.is_half_id(spl_idxs[0])) { + uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs); + assert(0 != num); + } + lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0]; + } + } + + // Sort the lemma items according to the hanzi, and give each unique item a + // id + sort_lemmas_by_hz(); + + scis_num_ = build_scis(); + + // Construct the dict list + dict_trie->dict_list_ = new DictList(); + bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_, + lemma_arr_, lemma_num_); + assert(dl_success); + + // Construct the NGram information + NGram& ngram = NGram::get_instance(); + ngram.build_unigram(lemma_arr_, lemma_num_, + lemma_arr_[lemma_num_ - 1].idx_by_hz + 1); + + // sort the lemma items according to the spelling idx string + myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py); + + get_top_lemmas(); + +#ifdef ___DO_STATISTICS___ + stat_init(); +#endif + + lma_nds_used_num_le0_ = 1; // The root node + bool dt_success = construct_subset(static_cast(lma_nodes_le0_), + lemma_arr_, 0, lemma_num_, 0); + if (!dt_success) { + free_resource(); 
+ return false; + } + +#ifdef ___DO_STATISTICS___ + stat_print(); +#endif + + // Move the node data and homo data to the DictTrie + dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_]; + dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_]; + size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_; + dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize]; + assert(NULL != dict_trie->root_); + assert(NULL != dict_trie->lma_idx_buf_); + dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_; + dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_; + dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize; + dict_trie->top_lmas_num_ = top_lmas_num_; + + memcpy(dict_trie->root_, lma_nodes_le0_, + sizeof(LmaNodeLE0) * lma_nds_used_num_le0_); + memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_, + sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_); + + for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) { + id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, + homo_idx_buf_[pos]); + } + + for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_; + pos < lma_idx_num; pos++) { + LemmaIdType idx = + top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz; + id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx); + } + + if (kPrintDebug0) { + printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_); + printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_); + printf("top_lmas_num_: %d\n", top_lmas_num_); + } + + free_resource(); + + if (kPrintDebug0) { + printf("Building dict succeds\n"); + } + return dt_success; +} + +void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) { + if (NULL == buf) return; + for (size_t pos = 0; pos < kLemmaIdSize; pos++) { + (buf)[pos] = (unsigned char)(id >> (pos * 8)); + } +} + +void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) { + node->son_1st_off_l = static_cast(offset); + node->son_1st_off_h = static_cast(offset >> 16); +} + +void 
DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) { + node->homo_idx_buf_off_l = static_cast(offset); + node->homo_idx_buf_off_h = static_cast(offset >> 16); + +} + +// All spelling strings will be converted to upper case, except that +// spellings started with "ZH"/"CH"/"SH" will be converted to +// "Zh"/"Ch"/"Sh" +void DictBuilder::format_spelling_str(char *spl_str) { + if (NULL == spl_str) + return; + + uint16 pos = 0; + while ('\0' != spl_str[pos]) { + if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z') + spl_str[pos] = spl_str[pos] - 'a' + 'A'; + + if (1 == pos && 'H' == spl_str[pos]) { + if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) { + spl_str[pos] = 'h'; + } + } + pos++; + } +} + +LemmaIdType DictBuilder::sort_lemmas_by_hz() { + if (NULL == lemma_arr_ || 0 == lemma_num_) + return 0; + + myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs); + + lemma_arr_[0].idx_by_hz = 1; + LemmaIdType idx_max = 1; + for (size_t i = 1; i < lemma_num_; i++) { + if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) { + idx_max++; + lemma_arr_[i].idx_by_hz = idx_max; + } else { + idx_max++; + lemma_arr_[i].idx_by_hz = idx_max; + } + } + return idx_max + 1; +} + +size_t DictBuilder::build_scis() { + if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_) + return 0; + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + + // This first one is blank, because id 0 is invalid. 
+ scis_[0].freq = 0; + scis_[0].hz = 0; + scis_[0].splid.full_splid = 0; + scis_[0].splid.half_splid = 0; + scis_num_ = 1; + + // Copy the hanzis to the buffer + for (size_t pos = 0; pos < lemma_num_; pos++) { + size_t hz_num = lemma_arr_[pos].hz_str_len; + for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { + scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos]; + scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; + scis_[scis_num_].splid.half_splid = + spl_trie.full_to_half(scis_[scis_num_].splid.full_splid); + if (1 == hz_num) + scis_[scis_num_].freq = lemma_arr_[pos].freq; + else + scis_[scis_num_].freq = 0.000001; + scis_num_++; + } + } + + myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq); + + // Remove repeated items + size_t unique_scis_num = 1; + for (size_t pos = 1; pos < scis_num_; pos++) { + if (scis_[pos].hz == scis_[pos - 1].hz && + scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid) + continue; + scis_[unique_scis_num] = scis_[pos]; + scis_[unique_scis_num].splid.half_splid = + spl_trie.full_to_half(scis_[pos].splid.full_splid); + unique_scis_num++; + } + + scis_num_ = unique_scis_num; + + // Update the lemma list. 
+ for (size_t pos = 0; pos < lemma_num_; pos++) { + size_t hz_num = lemma_arr_[pos].hz_str_len; + for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { + SingleCharItem key; + key.hz = lemma_arr_[pos].hanzi_str[hzpos]; + key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; + key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid); + + SingleCharItem *found; + found = static_cast(mybsearch(&key, scis_, + unique_scis_num, + sizeof(SingleCharItem), + cmp_scis_hz_splid)); + + assert(found); + + lemma_arr_[pos].hanzi_scis_ids[hzpos] = + static_cast(found - scis_); + lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid; + } + } + + return scis_num_; +} + +bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr, + size_t item_start, size_t item_end, + size_t level) { + if (level >= kMaxLemmaSize || item_end <= item_start) + return false; + + // 1. Scan for how many sons + size_t parent_son_num = 0; + // LemmaNode *son_1st = NULL; + // parent.num_of_son = 0; + + LemmaEntry *lma_last_start = lemma_arr_ + item_start; + uint16 spl_idx_node = lma_last_start->spl_idx_arr[level]; + + // Scan for how many sons to be allocaed + for (size_t i = item_start + 1; i< item_end; i++) { + LemmaEntry *lma_current = lemma_arr + i; + uint16 spl_idx_current = lma_current->spl_idx_arr[level]; + if (spl_idx_current != spl_idx_node) { + parent_son_num++; + spl_idx_node = spl_idx_current; + } + } + parent_son_num++; + +#ifdef ___DO_STATISTICS___ + // Use to indicate whether all nodes of this layer have no son. + bool allson_noson = true; + + assert(level < kMaxLemmaSize); + if (parent_son_num > max_sonbuf_len_[level]) + max_sonbuf_len_[level] = parent_son_num; + + total_son_num_[level] += parent_son_num; + total_sonbuf_num_[level] += 1; + + if (parent_son_num == 1) + sonbufs_num1_++; + else + sonbufs_numgt1_++; + total_lma_node_num_ += parent_son_num; +#endif + + // 2. 
Update the parent's information + // Update the parent's son list; + LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used + LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used. + if (0 == level) { // the parent is root + (static_cast(parent))->son_1st_off = + lma_nds_used_num_le0_; + son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_; + lma_nds_used_num_le0_ += parent_son_num; + + assert(parent_son_num <= 65535); + (static_cast(parent))->num_of_son = + static_cast(parent_son_num); + } else if (1 == level) { // the parent is a son of root + (static_cast(parent))->son_1st_off = + lma_nds_used_num_ge1_; + son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; + lma_nds_used_num_ge1_ += parent_son_num; + + assert(parent_son_num <= 65535); + (static_cast(parent))->num_of_son = + static_cast(parent_son_num); + } else { + set_son_offset((static_cast(parent)), + lma_nds_used_num_ge1_); + son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; + lma_nds_used_num_ge1_ += parent_son_num; + + assert(parent_son_num <= 255); + (static_cast(parent))->num_of_son = + (unsigned char)parent_son_num; + } + + // 3. 
Now begin to construct the son one by one + size_t son_pos = 0; + + lma_last_start = lemma_arr_ + item_start; + spl_idx_node = lma_last_start->spl_idx_arr[level]; + + size_t homo_num = 0; + if (lma_last_start->spl_idx_arr[level + 1] == 0) + homo_num = 1; + + size_t item_start_next = item_start; + + for (size_t i = item_start + 1; i < item_end; i++) { + LemmaEntry* lma_current = lemma_arr_ + i; + uint16 spl_idx_current = lma_current->spl_idx_arr[level]; + + if (spl_idx_current == spl_idx_node) { + if (lma_current->spl_idx_arr[level + 1] == 0) + homo_num++; + } else { + // Construct a node + LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid + LmaNodeGE1 *node_cur_ge1 = NULL; + if (0 == level) { + node_cur_le0 = son_1st_le0 + son_pos; + node_cur_le0->spl_idx = spl_idx_node; + node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; + node_cur_le0->son_1st_off = 0; + homo_idx_num_eq1_ += homo_num; + } else { + node_cur_ge1 = son_1st_ge1 + son_pos; + node_cur_ge1->spl_idx = spl_idx_node; + + set_homo_id_buf_offset(node_cur_ge1, + (homo_idx_num_eq1_ + homo_idx_num_gt1_)); + set_son_offset(node_cur_ge1, 0); + homo_idx_num_gt1_ += homo_num; + } + + if (homo_num > 0) { + LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + + homo_idx_num_gt1_ - homo_num; + if (0 == level) { + assert(homo_num <= 65535); + node_cur_le0->num_of_homo = static_cast(homo_num); + } else { + assert(homo_num <= 255); + node_cur_ge1->num_of_homo = (unsigned char)homo_num; + } + + for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { + idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz; + } + +#ifdef ___DO_STATISTICS___ + if (homo_num > max_homobuf_len_[level]) + max_homobuf_len_[level] = homo_num; + + total_homo_num_[level] += homo_num; +#endif + } + + if (i - item_start_next > homo_num) { + void *next_parent; + if (0 == level) + next_parent = static_cast(node_cur_le0); + else + next_parent = static_cast(node_cur_ge1); + 
construct_subset(next_parent, lemma_arr, + item_start_next + homo_num, i, level + 1); +#ifdef ___DO_STATISTICS___ + + total_node_hasson_[level] += 1; + allson_noson = false; +#endif + } + + // for the next son + lma_last_start = lma_current; + spl_idx_node = spl_idx_current; + item_start_next = i; + homo_num = 0; + if (lma_current->spl_idx_arr[level + 1] == 0) + homo_num = 1; + + son_pos++; + } + } + + // 4. The last one to construct + LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid + LmaNodeGE1 *node_cur_ge1 = NULL; + if (0 == level) { + node_cur_le0 = son_1st_le0 + son_pos; + node_cur_le0->spl_idx = spl_idx_node; + node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; + node_cur_le0->son_1st_off = 0; + homo_idx_num_eq1_ += homo_num; + } else { + node_cur_ge1 = son_1st_ge1 + son_pos; + node_cur_ge1->spl_idx = spl_idx_node; + + set_homo_id_buf_offset(node_cur_ge1, + (homo_idx_num_eq1_ + homo_idx_num_gt1_)); + set_son_offset(node_cur_ge1, 0); + homo_idx_num_gt1_ += homo_num; + } + + if (homo_num > 0) { + LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + + homo_idx_num_gt1_ - homo_num; + if (0 == level) { + assert(homo_num <= 65535); + node_cur_le0->num_of_homo = static_cast(homo_num); + } else { + assert(homo_num <= 255); + node_cur_ge1->num_of_homo = (unsigned char)homo_num; + } + + for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { + idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz; + } + +#ifdef ___DO_STATISTICS___ + if (homo_num > max_homobuf_len_[level]) + max_homobuf_len_[level] = homo_num; + + total_homo_num_[level] += homo_num; +#endif + } + + if (item_end - item_start_next > homo_num) { + void *next_parent; + if (0 == level) + next_parent = static_cast(node_cur_le0); + else + next_parent = static_cast(node_cur_ge1); + construct_subset(next_parent, lemma_arr, + item_start_next + homo_num, item_end, level + 1); +#ifdef ___DO_STATISTICS___ + + total_node_hasson_[level] += 1; + allson_noson = 
false; +#endif + } + +#ifdef ___DO_STATISTICS___ + if (allson_noson) { + total_sonbuf_allnoson_[level] += 1; + total_node_in_sonbuf_allnoson_[level] += parent_son_num; + } +#endif + + assert(son_pos + 1 == parent_son_num); + return true; +} + +#ifdef ___DO_STATISTICS___ +void DictBuilder::stat_init() { + memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); + memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize); + + sonbufs_num1_ = 0; + sonbufs_numgt1_ = 0; + total_lma_node_num_ = 0; +} + +void DictBuilder::stat_print() { + printf("\n------------STAT INFO-------------\n"); + printf("[root is layer -1]\n"); + printf(".. max_sonbuf_len per layer(from layer 0):\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", max_sonbuf_len_[i]); + printf("-, \n"); + + printf(".. max_homobuf_len per layer:\n -, "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", max_homobuf_len_[i]); + printf("\n"); + + printf(".. total_son_num per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_son_num_[i]); + printf("-, \n"); + + printf(".. total_node_hasson per layer:\n 1, "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_node_hasson_[i]); + printf("\n"); + + printf(".. total_sonbuf_num per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_sonbuf_num_[i]); + printf("-, \n"); + + printf(".. total_sonbuf_allnoson per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_sonbuf_allnoson_[i]); + printf("-, \n"); + + printf(".. 
total_node_in_sonbuf_allnoson per layer:\n "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_node_in_sonbuf_allnoson_[i]); + printf("-, \n"); + + printf(".. total_homo_num per layer:\n 0, "); + for (size_t i = 0; i < kMaxLemmaSize; i++) + printf("%d, ", total_homo_num_[i]); + printf("\n"); + + printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_); + printf(".. son buf allocation number with more than 1 son: %d\n", + sonbufs_numgt1_); + printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1); +} +#endif // ___DO_STATISTICS___ + +#endif // ___BUILD_MODEL___ +} // namespace ime_pinyin diff --git a/googlepinyin/dictbuilder.h b/googlepinyin/dictbuilder.h new file mode 100644 index 0000000..da0d6cd --- /dev/null +++ b/googlepinyin/dictbuilder.h @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__ +#define PINYINIME_INCLUDE_DICTBUILDER_H__ + +#include +#include "./utf16char.h" +#include "./dictdef.h" +#include "./dictlist.h" +#include "./spellingtable.h" +#include "./spellingtrie.h" +#include "./splparser.h" + +namespace ime_pinyin { + +#ifdef ___BUILD_MODEL___ + +#define ___DO_STATISTICS___ + +class DictTrie; + +class DictBuilder { + private: + // The raw lemma array buffer. + LemmaEntry *lemma_arr_; + size_t lemma_num_; + + // Used to store all possible single char items. 
+ // Two items may have the same Hanzi while their spelling ids are different. + SingleCharItem *scis_; + size_t scis_num_; + + // In the tree, root's level is -1. + // Lemma nodes for root, and level 0 + LmaNodeLE0 *lma_nodes_le0_; + + // Lemma nodes for layers whose levels are deeper than 0 + LmaNodeGE1 *lma_nodes_ge1_; + + // Number of used lemma nodes + size_t lma_nds_used_num_le0_; + size_t lma_nds_used_num_ge1_; + + // Used to store homophonies' ids. + LemmaIdType *homo_idx_buf_; + // Number of homophonies each of which only contains one Chinese character. + size_t homo_idx_num_eq1_; + // Number of homophonies each of which contains more than one character. + size_t homo_idx_num_gt1_; + + // The items with highest scores. + LemmaEntry *top_lmas_; + size_t top_lmas_num_; + + SpellingTable *spl_table_; + SpellingParser *spl_parser_; + +#ifdef ___DO_STATISTICS___ + size_t max_sonbuf_len_[kMaxLemmaSize]; + size_t max_homobuf_len_[kMaxLemmaSize]; + + size_t total_son_num_[kMaxLemmaSize]; + size_t total_node_hasson_[kMaxLemmaSize]; + size_t total_sonbuf_num_[kMaxLemmaSize]; + size_t total_sonbuf_allnoson_[kMaxLemmaSize]; + size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize]; + size_t total_homo_num_[kMaxLemmaSize]; + + size_t sonbufs_num1_; // Number of son buffer with only 1 son + size_t sonbufs_numgt1_; // Number of son buffer with more 1 son; + + size_t total_lma_node_num_; + + void stat_init(); + void stat_print(); +#endif + + public: + + DictBuilder(); + ~DictBuilder(); + + // Build dictionary trie from the file fn_raw. File fn_validhzs provides + // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be + // included. + bool build_dict(const char* fn_raw, const char* fn_validhzs, + DictTrie *dict_trie); + + private: + // Fill in the buffer with id. The caller guarantees that the paramters are + // vaild. + void id_to_charbuf(unsigned char *buf, LemmaIdType id); + + // Update the offset of sons for a node. 
+ void set_son_offset(LmaNodeGE1 *node, size_t offset); + + // Update the offset of homophonies' ids for a node. + void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset); + + // Format a speling string. + void format_spelling_str(char *spl_str); + + // Sort the lemma_arr by the hanzi string, and give each of unique items + // a id. Why we need to sort the lemma list according to their Hanzi string + // is to find items started by a given prefix string to do prediction. + // Actually, the single char items are be in other order, for example, + // in spelling id order, etc. + // Return value is next un-allocated idx available. + LemmaIdType sort_lemmas_by_hz(); + + // Build the SingleCharItem list, and fill the hanzi_scis_ids in the + // lemma buffer lemma_arr_. + // This function should be called after the lemma array is ready. + // Return the number of unique SingleCharItem elements. + size_t build_scis(); + + // Construct a subtree using a subset of the spelling array (from + // item_star to item_end) + // parent is the parent node to update the necessary information + // parent can be a member of LmaNodeLE0 or LmaNodeGE1 + bool construct_subset(void* parent, LemmaEntry* lemma_arr, + size_t item_start, size_t item_end, size_t level); + + + // Read valid Chinese Hanzis from the given file. + // num is used to return number of chars. + // The return buffer is sorted and caller needs to free the returned buffer. + char16* read_valid_hanzis(const char *fn_validhzs, size_t *num); + + + // Read a raw dictionary. max_item is the maximum number of items. If there + // are more items in the ditionary, only the first max_item will be read. + // Returned value is the number of items successfully read from the file. + size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs, + size_t max_item); + + // Try to find if a character is in hzs buffer. 
+ bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz); + + // Try to find if all characters in str are in hzs buffer. + bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len, + const char16 *str, size_t str_len); + + // Get these lemmas with toppest scores. + void get_top_lemmas(); + + // Allocate resource to build dictionary. + // lma_num is the number of items to be loaded + bool alloc_resource(size_t lma_num); + + // Free resource. + void free_resource(); +}; +#endif // ___BUILD_MODEL___ +} + +#endif // PINYINIME_INCLUDE_DICTBUILDER_H__ diff --git a/googlepinyin/dictdef.h b/googlepinyin/dictdef.h new file mode 100644 index 0000000..5e1d781 --- /dev/null +++ b/googlepinyin/dictdef.h @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_DICTDEF_H__ +#define PINYINIME_INCLUDE_DICTDEF_H__ + +#include +#include "./utf16char.h" + +namespace ime_pinyin { + +// Enable the following line when building the binary dictionary model. +// #define ___BUILD_MODEL___ + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +typedef signed char int8; +typedef short int16; +typedef int int32; +typedef long long int64; +typedef unsigned long long uint64; + +const bool kPrintDebug0 = false; +const bool kPrintDebug1 = false; +const bool kPrintDebug2 = false; + +// The max length of a lemma. 
const size_t kMaxLemmaSize = 8;  // maximum number of characters in a lemma

// The max length of a Pinyin (spelling).
const size_t kMaxPinyinSize = 6;

// The number of half spelling ids.
// NOTE(review): the original comment says there are 30 half ids for Chinese
// Pinyin while the constant is 29 -- confirm against SpellingTrie.h.
const size_t kHalfSpellingIdNum = 29;

// The maximum number of full spellings. For Chinese Pinyin, there are only
// about 410 spellings.
// If this value is made bigger (so that it needs more bits), other structures
// such as SpellingNode must be updated as well, to make sure that a spelling
// id can still be stored.
// The -1 is there because id 0 is never used.
const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;
const size_t kMaxSearchSteps = 40;

// One character predicts its following characters.
const size_t kMaxPredictSize = (kMaxLemmaSize - 1);

// LemmaIdType must always be size_t.
typedef size_t LemmaIdType;
const size_t kLemmaIdSize = 3;  // Actually, an id occupies 3 bytes in storage.
const size_t kLemmaIdComposing = 0xffffff;

typedef uint16 LmaScoreType;
typedef uint16 KeyScoreType;

// Number of items with highest score that are kept for prediction purpose.
const size_t kTopScoreLemmaNum = 10;

const size_t kMaxPredictNumByGt3 = 1;
const size_t kMaxPredictNumBy3 = 2;
const size_t kMaxPredictNumBy2 = 2;

// The last lemma id (included) for the system dictionary. The system
// dictionary's ids always start from 1.
const LemmaIdType kSysDictIdEnd = 500000;

// The first lemma id for the user dictionary.
const LemmaIdType kUserDictIdStart = 500001;

// The last lemma id (included) for the user dictionary.
const LemmaIdType kUserDictIdEnd = 600000;

// A half spelling id and a full spelling id packed into 16 bits
// (5 bits + 11 bits).
typedef struct {
  uint16 half_splid:5;
  uint16 full_splid:11;
} SpellingId, *PSpellingId;


/**
 * We use different node types for different layers
 * Statistical data of the building result for a testing dictionary:
 *                              root,   level 0,   level 1,   level 2,   level 3
 * max son num of one node:     406        280         41          2         -
 * max homo num of one node:      0         90         23          2         2
 * total node num of a layer:     1        406      31766      13516       993
 * total homo num of a layer:     9       5674      44609      12667       995
 *
 * The node number for root and level 0 won't be larger than 500
 * According to the information above, two kinds of nodes can be used; one for
 * root and level 0, the other for these layers deeper than 0.
 *
 * LE = less and equal,
 * A node occupies 16 bytes. so, totally less than 16 * 500 = 8K
 */
struct LmaNodeLE0 {
  uint32 son_1st_off;       // offset of the first son node
  uint32 homo_idx_buf_off;  // offset of this node's ids in the homophone buffer
  uint16 spl_idx;           // spelling id of this node
  uint16 num_of_son;        // number of son nodes
  uint16 num_of_homo;       // number of homophones ending at this node
};

/**
 * GE = great and equal
 * NOTE(review): the original comment says "A node occupies 8 bytes", but the
 * fields declared below add up to 10 bytes (3 * uint16 + 4 * unsigned char)
 * -- the figure looks stale; confirm before relying on it.
 */
struct LmaNodeGE1 {
  uint16 son_1st_off_l;        // Low bits of the son_1st_off
  uint16 homo_idx_buf_off_l;   // Low bits of the homo_idx_buf_off
  uint16 spl_idx;
  unsigned char num_of_son;            // number of son nodes
  unsigned char num_of_homo;           // number of homo words
  unsigned char son_1st_off_h;         // high bits of the son_1st_off
  unsigned char homo_idx_buf_off_h;    // high bits of the homo_idx_buf_off
};

#ifdef ___BUILD_MODEL___
// One (Hanzi, spelling id) pair together with its frequency; only used while
// building the dictionary model.
struct SingleCharItem {
  float freq;
  char16 hz;
  SpellingId splid;
};

// One raw dictionary item; only used while building the dictionary model.
struct LemmaEntry {
  LemmaIdType idx_by_py;
  LemmaIdType idx_by_hz;
  char16 hanzi_str[kMaxLemmaSize + 1];

  // The SingleCharItem id for each Hanzi.
  uint16 hanzi_scis_ids[kMaxLemmaSize];

  // Spelling id of each Hanzi. It has one more slot than the maximum lemma
  // length; the dictionary builder reads a 0 entry as "the lemma ends here".
  uint16 spl_idx_arr[kMaxLemmaSize + 1];
  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
  unsigned char hz_str_len;
  float freq;
};
#endif  // ___BUILD_MODEL___

}  // namespace ime_pinyin

#endif  // PINYINIME_INCLUDE_DICTDEF_H__
+ */ + +#include +#include +#include +#include "dictlist.h" +#include "mystdlib.h" +#include "ngram.h" +#include "searchutility.h" + +namespace ime_pinyin { + +DictList::DictList() { + initialized_ = false; + scis_num_ = 0; + scis_hz_ = NULL; + scis_splid_ = NULL; + buf_ = NULL; + spl_trie_ = SpellingTrie::get_cpinstance(); + + assert(kMaxLemmaSize == 8); + cmp_func_[0] = cmp_hanzis_1; + cmp_func_[1] = cmp_hanzis_2; + cmp_func_[2] = cmp_hanzis_3; + cmp_func_[3] = cmp_hanzis_4; + cmp_func_[4] = cmp_hanzis_5; + cmp_func_[5] = cmp_hanzis_6; + cmp_func_[6] = cmp_hanzis_7; + cmp_func_[7] = cmp_hanzis_8; +} + +DictList::~DictList() { + free_resource(); +} + +bool DictList::alloc_resource(size_t buf_size, size_t scis_num) { + // Allocate memory + buf_ = static_cast(malloc(buf_size * sizeof(char16))); + if (NULL == buf_) + return false; + + scis_num_ = scis_num; + + scis_hz_ = static_cast(malloc(scis_num_ * sizeof(char16))); + if (NULL == scis_hz_) + return false; + + scis_splid_ = static_cast + (malloc(scis_num_ * sizeof(SpellingId))); + + if (NULL == scis_splid_) + return false; + + return true; +} + +void DictList::free_resource() { + if (NULL != buf_) + free(buf_); + buf_ = NULL; + + if (NULL != scis_hz_) + free(scis_hz_); + scis_hz_ = NULL; + + if (NULL != scis_splid_) + free(scis_splid_); + scis_splid_ = NULL; +} + +#ifdef ___BUILD_MODEL___ +bool DictList::init_list(const SingleCharItem *scis, size_t scis_num, + const LemmaEntry *lemma_arr, size_t lemma_num) { + if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num) + return false; + + initialized_ = false; + + if (NULL != buf_) + free(buf_); + + // calculate the size + size_t buf_size = calculate_size(lemma_arr, lemma_num); + if (0 == buf_size) + return false; + + if (!alloc_resource(buf_size, scis_num)) + return false; + + fill_scis(scis, scis_num); + + // Copy the related content from the array to inner buffer + fill_list(lemma_arr, lemma_num); + + initialized_ = true; + return true; +} + 
+size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) { + size_t last_hz_len = 0; + size_t list_size = 0; + size_t id_num = 0; + + for (size_t i = 0; i < lemma_num; i++) { + if (0 == i) { + last_hz_len = lemma_arr[i].hz_str_len; + + assert(last_hz_len > 0); + assert(lemma_arr[0].idx_by_hz == 1); + + id_num++; + start_pos_[0] = 0; + start_id_[0] = id_num; + + last_hz_len = 1; + list_size += last_hz_len; + } else { + size_t current_hz_len = lemma_arr[i].hz_str_len; + + assert(current_hz_len >= last_hz_len); + + if (current_hz_len == last_hz_len) { + list_size += current_hz_len; + id_num++; + } else { + for (size_t len = last_hz_len; len < current_hz_len - 1; len++) { + start_pos_[len] = start_pos_[len - 1]; + start_id_[len] = start_id_[len - 1]; + } + + start_pos_[current_hz_len - 1] = list_size; + + id_num++; + start_id_[current_hz_len - 1] = id_num; + + last_hz_len = current_hz_len; + list_size += current_hz_len; + } + } + } + + for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) { + if (0 == i) { + start_pos_[0] = 0; + start_id_[0] = 1; + } else { + start_pos_[i] = list_size; + start_id_[i] = id_num; + } + } + + return start_pos_[kMaxLemmaSize]; +} + +void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) { + assert(scis_num_ == scis_num); + + for (size_t pos = 0; pos < scis_num_; pos++) { + scis_hz_[pos] = scis[pos].hz; + scis_splid_[pos] = scis[pos].splid; + } +} + +void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) { + size_t current_pos = 0; + + utf16_strncpy(buf_, lemma_arr[0].hanzi_str, + lemma_arr[0].hz_str_len); + + current_pos = lemma_arr[0].hz_str_len; + + size_t id_num = 1; + + for (size_t i = 1; i < lemma_num; i++) { + utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str, + lemma_arr[i].hz_str_len); + + id_num++; + current_pos += lemma_arr[i].hz_str_len; + } + + assert(current_pos == start_pos_[kMaxLemmaSize]); + assert(id_num == start_id_[kMaxLemmaSize]); +} + +char16* 
DictList::find_pos2_startedbyhz(char16 hz_char) { + char16 *found_2w = static_cast + (mybsearch(&hz_char, buf_ + start_pos_[1], + (start_pos_[2] - start_pos_[1]) / 2, + sizeof(char16) * 2, cmp_hanzis_1)); + if (NULL == found_2w) + return NULL; + + while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1)) + found_2w -= 2; + + return found_2w; +} +#endif // ___BUILD_MODEL___ + +char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[], + size_t word_len, int (*cmp_func)(const void *, const void *)) { + char16 *found_w = static_cast + (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1], + (start_pos_[word_len] - start_pos_[word_len - 1]) + / word_len, + sizeof(char16) * word_len, cmp_func)); + + if (NULL == found_w) + return NULL; + + while (found_w > buf_ + start_pos_[word_len -1] && + cmp_func(found_w, found_w - word_len) == 0) + found_w -= word_len; + + return found_w; +} + +size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len, + NPredictItem *npre_items, size_t npre_max, + size_t b4_used) { + assert(hzs_len <= kMaxPredictSize && hzs_len > 0); + + // 1. Prepare work + int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1]; + + NGram& ngram = NGram::get_instance(); + + size_t item_num = 0; + + // 2. 
Do prediction + for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len; + pre_len++) { + uint16 word_len = hzs_len + pre_len; + char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func); + if (NULL == w_buf) + continue; + while (w_buf < buf_ + start_pos_[word_len] && + cmp_func(w_buf, last_hzs) == 0 && + item_num < npre_max) { + memset(npre_items + item_num, 0, sizeof(NPredictItem)); + utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len); + npre_items[item_num].psb = + ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1]) + / word_len + start_id_[word_len - 1]); + npre_items[item_num].his_len = hzs_len; + item_num++; + w_buf += word_len; + } + } + + size_t new_num = 0; + for (size_t i = 0; i < item_num; i++) { + // Try to find it in the existing items + size_t e_pos; + for (e_pos = 1; e_pos <= b4_used; e_pos++) { + if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs, + kMaxPredictSize) == 0) + break; + } + if (e_pos <= b4_used) + continue; + + // If not found, append it to the buffer + npre_items[new_num] = npre_items[i]; + new_num++; + } + + return new_num; +} + +uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, + uint16 str_max) { + if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf + || str_max <= 1) + return 0; + + // Find the range + for (uint16 i = 0; i < kMaxLemmaSize; i++) { + if (i + 1 > str_max - 1) + return 0; + if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) { + size_t id_span = id_lemma - start_id_[i]; + + uint16 *buf = buf_ + start_pos_[i] + id_span * (i + 1); + for (uint16 len = 0; len <= i; len++) { + str_buf[len] = buf[len]; + } + str_buf[i+1] = (char16)'\0'; + return i + 1; + } + } + return 0; +} + +uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid, + uint16 *splids, uint16 max_splids) { + char16 *hz_found = static_cast + (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1)); + 
assert(NULL != hz_found && hanzi == *hz_found); + + // Move to the first one. + while (hz_found > scis_hz_ && hanzi == *(hz_found - 1)) + hz_found--; + + // First try to found if strict comparison result is not zero. + char16 *hz_f = hz_found; + bool strict = false; + while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) { + uint16 pos = hz_f - scis_hz_; + if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) { + strict = true; + } + hz_f++; + } + + uint16 found_num = 0; + while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) { + uint16 pos = hz_found - scis_hz_; + if (0 == half_splid || + (strict && scis_splid_[pos].half_splid == half_splid) || + (!strict && spl_trie_->half_full_compatible(half_splid, + scis_splid_[pos].full_splid))) { + assert(found_num + 1 < max_splids); + splids[found_num] = scis_splid_[pos].full_splid; + found_num++; + } + hz_found++; + } + + return found_num; +} + +LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) { + if (NULL == str || str_len > kMaxLemmaSize) + return 0; + + char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]); + if (NULL == found) + return 0; + + assert(found > buf_); + assert(static_cast(found - buf_) >= start_pos_[str_len - 1]); + return static_cast + (start_id_[str_len - 1] + + (found - buf_ - start_pos_[str_len - 1]) / str_len); +} + +void DictList::convert_to_hanzis(char16 *str, uint16 str_len) { + assert(NULL != str); + + for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { + str[str_pos] = scis_hz_[str[str_pos]]; + } +} + +void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) { + assert(NULL != str); + + for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { + str[str_pos] = 0x100; + } +} + +bool DictList::save_list(FILE *fp) { + if (!initialized_ || NULL == fp) + return false; + + if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] || + NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_) + return false; + + if (fwrite(&scis_num_, 
sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + if (fwrite(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) + return false; + + if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) + return false; + + if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != + start_pos_[kMaxLemmaSize]) + return false; + + return true; +} + +bool DictList::load_list(FILE *fp) { + if (NULL == fp) + return false; + + initialized_ = false; + + if (fread(&scis_num_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fread(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + if (fread(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + free_resource(); + + if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_)) + return false; + + if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) + return false; + + if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) + return false; + + if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != + start_pos_[kMaxLemmaSize]) + return false; + + initialized_ = true; + return true; +} +} // namespace ime_pinyin diff --git a/googlepinyin/dictlist.h b/googlepinyin/dictlist.h new file mode 100644 index 0000000..27fa6d8 --- /dev/null +++ b/googlepinyin/dictlist.h @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_DICTLIST_H__ +#define PINYINIME_INCLUDE_DICTLIST_H__ + +#include +#include +#include "./dictdef.h" +#include "./searchutility.h" +#include "./spellingtrie.h" +#include "./utf16char.h" + +namespace ime_pinyin { + +class DictList { + private: + bool initialized_; + + const SpellingTrie *spl_trie_; + + // Number of SingCharItem. The first is blank, because id 0 is invalid. + uint32 scis_num_; + char16 *scis_hz_; + SpellingId *scis_splid_; + + // The large memory block to store the word list. + char16 *buf_; + + // Starting position of those words whose lengths are i+1, counted in + // char16 + uint32 start_pos_[kMaxLemmaSize + 1]; + + uint32 start_id_[kMaxLemmaSize + 1]; + + int (*cmp_func_[kMaxLemmaSize])(const void *, const void *); + + bool alloc_resource(size_t buf_size, size_t scim_num); + + void free_resource(); + +#ifdef ___BUILD_MODEL___ + // Calculate the requsted memory, including the start_pos[] buffer. + size_t calculate_size(const LemmaEntry *lemma_arr, size_t lemma_num); + + void fill_scis(const SingleCharItem *scis, size_t scis_num); + + // Copy the related content to the inner buffer + // It should be called after calculate_size() + void fill_list(const LemmaEntry *lemma_arr, size_t lemma_num); + + // Find the starting position for the buffer of those 2-character Chinese word + // whose first character is the given Chinese character. + char16* find_pos2_startedbyhz(char16 hz_char); +#endif + + // Find the starting position for the buffer of those words whose lengths are + // word_len. 
The given parameter cmp_func decides how many characters from + // beginning will be used to compare. + char16* find_pos_startedbyhzs(const char16 last_hzs[], + size_t word_Len, + int (*cmp_func)(const void *, const void *)); + + public: + + DictList(); + ~DictList(); + + bool save_list(FILE *fp); + bool load_list(FILE *fp); + +#ifdef ___BUILD_MODEL___ + // Init the list from the LemmaEntry array. + // lemma_arr should have been sorted by the hanzi_str, and have been given + // ids from 1 + bool init_list(const SingleCharItem *scis, size_t scis_num, + const LemmaEntry *lemma_arr, size_t lemma_num); +#endif + + // Get the hanzi string for the given id + uint16 get_lemma_str(LemmaIdType id_hz, char16 *str_buf, uint16 str_max); + + void convert_to_hanzis(char16 *str, uint16 str_len); + + void convert_to_scis_ids(char16 *str, uint16 str_len); + + // last_hzs stores the last n Chinese characters history, its length should be + // less or equal than kMaxPredictSize. + // hzs_len specifies the length(<= kMaxPredictSize). + // predict_buf is used to store the result. + // buf_len specifies the buffer length. + // b4_used specifies how many items before predict_buf have been used. + // Returned value is the number of newly added items. + size_t predict(const char16 last_hzs[], uint16 hzs_len, + NPredictItem *npre_items, size_t npre_max, + size_t b4_used); + + // If half_splid is a valid half spelling id, return those full spelling + // ids which share this half id. 
+ uint16 get_splids_for_hanzi(char16 hanzi, uint16 half_splid, + uint16 *splids, uint16 max_splids); + + LemmaIdType get_lemma_id(const char16 *str, uint16 str_len); +}; +} + +#endif // PINYINIME_INCLUDE_DICTLIST_H__ diff --git a/googlepinyin/dicttrie.cpp b/googlepinyin/dicttrie.cpp new file mode 100644 index 0000000..226a4af --- /dev/null +++ b/googlepinyin/dicttrie.cpp @@ -0,0 +1,941 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include "dicttrie.h" +#include "dictbuilder.h" +#include "lpicache.h" +#include "mystdlib.h" +#include "ngram.h" + +namespace ime_pinyin { + +DictTrie::DictTrie() { + spl_trie_ = SpellingTrie::get_cpinstance(); + + root_ = NULL; + splid_le0_index_ = NULL; + lma_node_num_le0_ = 0; + nodes_ge1_ = NULL; + lma_node_num_ge1_ = 0; + lma_idx_buf_ = NULL; + lma_idx_buf_len_ = 0; + total_lma_num_ = 0; + top_lmas_num_ = 0; + dict_list_ = NULL; + + parsing_marks_ = NULL; + mile_stones_ = NULL; + reset_milestones(0, kFirstValidMileStoneHandle); +} + +DictTrie::~DictTrie() { + free_resource(true); +} + +void DictTrie::free_resource(bool free_dict_list) { + if (NULL != root_) + free(root_); + root_ = NULL; + + if (NULL != splid_le0_index_) + free(splid_le0_index_); + splid_le0_index_ = NULL; + + if (NULL != nodes_ge1_) + free(nodes_ge1_); + nodes_ge1_ = NULL; + + if (NULL != lma_idx_buf_) + free(lma_idx_buf_); + lma_idx_buf_ = NULL; + + if (free_dict_list) { + if (NULL != dict_list_) { + delete dict_list_; + } + dict_list_ = NULL; + } + + if (parsing_marks_) + delete [] parsing_marks_; + parsing_marks_ = NULL; + + if (mile_stones_) + delete [] mile_stones_; + mile_stones_ = NULL; + + reset_milestones(0, kFirstValidMileStoneHandle); +} + +inline size_t DictTrie::get_son_offset(const LmaNodeGE1 *node) { + return ((size_t)node->son_1st_off_l + ((size_t)node->son_1st_off_h << 16)); +} + +inline size_t DictTrie::get_homo_idx_buf_offset(const LmaNodeGE1 *node) { + return ((size_t)node->homo_idx_buf_off_l + + ((size_t)node->homo_idx_buf_off_h << 16)); +} + +inline LemmaIdType DictTrie::get_lemma_id(size_t id_offset) { + LemmaIdType id = 0; + for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--) + id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos]; + id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize]; + return id; +} + +#ifdef ___BUILD_MODEL___ +bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) { + DictBuilder* 
dict_builder = new DictBuilder(); + + free_resource(true); + + return dict_builder->build_dict(fn_raw, fn_validhzs, this); +} + +bool DictTrie::save_dict(FILE *fp) { + if (NULL == fp) + return false; + + if (fwrite(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(&top_lmas_num_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) + != lma_node_num_le0_) + return false; + + if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) + != lma_node_num_ge1_) + return false; + + if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != + lma_idx_buf_len_) + return false; + + return true; +} + +bool DictTrie::save_dict(const char *filename) { + if (NULL == filename) + return false; + + if (NULL == root_ || NULL == dict_list_) + return false; + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + NGram &ngram = NGram::get_instance(); + + FILE *fp = fopen(filename, "wb"); + if (NULL == fp) + return false; + + if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) || + !save_dict(fp) || !ngram.save_ngram(fp)) { + fclose(fp); + return false; + } + + fclose(fp); + return true; +} +#endif // ___BUILD_MODEL___ + +bool DictTrie::load_dict(FILE *fp) { + if (NULL == fp) + return false; + if (fread(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fread(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fread(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fread(&top_lmas_num_, sizeof(uint32), 1, fp) != 1 || + top_lmas_num_ >= lma_idx_buf_len_) + return false; + + free_resource(false); + + root_ = static_cast + (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0))); + nodes_ge1_ = static_cast + (malloc(lma_node_num_ge1_ * 
sizeof(LmaNodeGE1))); + lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_); + total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize; + + size_t buf_size = SpellingTrie::get_instance().get_spelling_num() + 1; + assert(lma_node_num_le0_ <= buf_size); + splid_le0_index_ = static_cast(malloc(buf_size * sizeof(uint16))); + + // Init the space for parsing. + parsing_marks_ = new ParsingMark[kMaxParsingMark]; + mile_stones_ = new MileStone[kMaxMileStone]; + reset_milestones(0, kFirstValidMileStoneHandle); + + if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ || + NULL == splid_le0_index_ || NULL == parsing_marks_ || + NULL == mile_stones_) { + free_resource(false); + return false; + } + + if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) + != lma_node_num_le0_) + return false; + + if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) + != lma_node_num_ge1_) + return false; + + if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != + lma_idx_buf_len_) + return false; + + // The quick index for the first level sons + uint16 last_splid = kFullSplIdStart; + size_t last_pos = 0; + for (size_t i = 1; i < lma_node_num_le0_; i++) { + for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++) + splid_le0_index_[splid - kFullSplIdStart] = last_pos; + + splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] = + static_cast(i); + last_splid = root_[i].spl_idx; + last_pos = i; + } + + for (uint16 splid = last_splid + 1; + splid < buf_size + kFullSplIdStart; splid++) { + assert(static_cast(splid - kFullSplIdStart) < buf_size); + splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1; + } + + return true; +} + +bool DictTrie::load_dict(const char *filename, LemmaIdType start_id, + LemmaIdType end_id) { + if (NULL == filename || end_id <= start_id) + return false; + + FILE *fp = fopen(filename, "rb"); + if (NULL == fp) + return false; + + free_resource(true); + + dict_list_ = new DictList(); + if (NULL == dict_list_) { + 
fclose(fp); + return false; + } + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + NGram &ngram = NGram::get_instance(); + + if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || + !load_dict(fp) || !ngram.load_ngram(fp) || + total_lma_num_ > end_id - start_id + 1) { + free_resource(true); + fclose(fp); + return false; + } + + fclose(fp); + return true; +} + +bool DictTrie::load_dict_fd(int sys_fd, long start_offset, + long length, LemmaIdType start_id, + LemmaIdType end_id) { + if (start_offset < 0 || length <= 0 || end_id <= start_id) + return false; + + FILE *fp = fdopen(sys_fd, "rb"); + if (NULL == fp) + return false; + + if (-1 == fseek(fp, start_offset, SEEK_SET)) { + fclose(fp); + return false; + } + + free_resource(true); + + dict_list_ = new DictList(); + if (NULL == dict_list_) { + fclose(fp); + return false; + } + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + NGram &ngram = NGram::get_instance(); + + if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || + !load_dict(fp) || !ngram.load_ngram(fp) || + ftell(fp) < start_offset + length || + total_lma_num_ > end_id - start_id + 1) { + free_resource(true); + fclose(fp); + return false; + } + + fclose(fp); + return true; +} + +size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, + LmaNodeLE0 *node) { + size_t lpi_num = 0; + NGram& ngram = NGram::get_instance(); + for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) { + lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off + + homo); + lpi_items[lpi_num].lma_len = 1; + lpi_items[lpi_num].psb = + static_cast(ngram.get_uni_psb(lpi_items[lpi_num].id)); + lpi_num++; + if (lpi_num >= lpi_max) + break; + } + + return lpi_num; +} + +size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, + size_t homo_buf_off, LmaNodeGE1 *node, + uint16 lma_len) { + size_t lpi_num = 0; + NGram& ngram = NGram::get_instance(); + for (size_t homo = 0; homo < (size_t)node->num_of_homo; 
homo++) { + lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo); + lpi_items[lpi_num].lma_len = lma_len; + lpi_items[lpi_num].psb = + static_cast(ngram.get_uni_psb(lpi_items[lpi_num].id)); + lpi_num++; + if (lpi_num >= lpi_max) + break; + } + + return lpi_num; +} + +void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { + if (0 == from_step) { + parsing_marks_pos_ = 0; + mile_stones_pos_ = kFirstValidMileStoneHandle; + } else { + if (from_handle > 0 && from_handle < mile_stones_pos_) { + mile_stones_pos_ = from_handle; + + MileStone *mile_stone = mile_stones_ + from_handle; + parsing_marks_pos_ = mile_stone->mark_start; + } + } +} + +MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle, + const DictExtPara *dep, + LmaPsbItem *lpi_items, size_t lpi_max, + size_t *lpi_num) { + if (NULL == dep) + return 0; + + // from LmaNodeLE0 (root) to LmaNodeLE0 + if (0 == from_handle) { + assert(0 == dep->splids_extended); + return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num); + } + + // from LmaNodeLE0 to LmaNodeGE1 + if (1 == dep->splids_extended) + return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num); + + // From LmaNodeGE1 to LmaNodeGE1 + return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num); +} + +MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle, + const DictExtPara *dep, + LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num) { + assert(NULL != dep && 0 == from_handle); + *lpi_num = 0; + MileStoneHandle ret_handle = 0; + + uint16 splid = dep->splids[dep->splids_extended]; + uint16 id_start = dep->id_start; + uint16 id_num = dep->id_num; + + LpiCache& lpi_cache = LpiCache::get_instance(); + bool cached = lpi_cache.is_cached(splid); + + // 2. 
Begin exgtending + // 2.1 Get the LmaPsbItem list + LmaNodeLE0 *node = root_; + size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; + size_t son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart]; + for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { + assert(1 == node->son_1st_off); + LmaNodeLE0 *son = root_ + son_pos; + assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num); + + if (!cached && *lpi_num < lpi_max) { + bool need_lpi = true; + if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start) + need_lpi = false; + + if (need_lpi) + *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), + lpi_max - *lpi_num, son); + } + + // If necessary, fill in a new mile stone. + if (son->spl_idx == id_start) { + if (mile_stones_pos_ < kMaxMileStone && + parsing_marks_pos_ < kMaxParsingMark) { + parsing_marks_[parsing_marks_pos_].node_offset = son_pos; + parsing_marks_[parsing_marks_pos_].node_num = id_num; + mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_; + mile_stones_[mile_stones_pos_].mark_num = 1; + ret_handle = mile_stones_pos_; + parsing_marks_pos_++; + mile_stones_pos_++; + } + } + + if (son->spl_idx >= id_start + id_num -1) + break; + } + + // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, + // mile_stones_pos_); + return ret_handle; +} + +MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle, + const DictExtPara *dep, + LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num) { + assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_); + + MileStoneHandle ret_handle = 0; + + // 1. If this is a half Id, get its corresponding full starting Id and + // number of full Id. + size_t ret_val = 0; + + uint16 id_start = dep->id_start; + uint16 id_num = dep->id_num; + + // 2. Begin extending. 
+ MileStone *mile_stone = mile_stones_ + from_handle; + + for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { + ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; + uint16 ext_num = p_mark.node_num; + for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { + LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos; + size_t found_start = 0; + size_t found_num = 0; + for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { + assert(node->son_1st_off <= lma_node_num_ge1_); + LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos; + if (son->spl_idx >= id_start + && son->spl_idx < id_start + id_num) { + if (*lpi_num < lpi_max) { + size_t homo_buf_off = get_homo_idx_buf_offset(son); + *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), + lpi_max - *lpi_num, homo_buf_off, son, + 2); + } + + // If necessary, fill in the new DTMI + if (0 == found_num) { + found_start = son_pos; + } + found_num++; + } + if (son->spl_idx >= id_start + id_num - 1 || son_pos == + (size_t)node->num_of_son - 1) { + if (found_num > 0) { + if (mile_stones_pos_ < kMaxMileStone && + parsing_marks_pos_ < kMaxParsingMark) { + parsing_marks_[parsing_marks_pos_].node_offset = + node->son_1st_off + found_start; + parsing_marks_[parsing_marks_pos_].node_num = found_num; + if (0 == ret_val) + mile_stones_[mile_stones_pos_].mark_start = + parsing_marks_pos_; + parsing_marks_pos_++; + } + + ret_val++; + } + break; + } // for son_pos + } // for ext_pos + } // for h_pos + } + + if (ret_val > 0) { + mile_stones_[mile_stones_pos_].mark_num = ret_val; + ret_handle = mile_stones_pos_; + mile_stones_pos_++; + ret_val = 1; + } + + // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, + // mile_stones_pos_); + return ret_handle; +} + +MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle, + const DictExtPara *dep, + LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num) { + assert(NULL != dep && from_handle > 0 && from_handle < 
mile_stones_pos_); + + MileStoneHandle ret_handle = 0; + + // 1. If this is a half Id, get its corresponding full starting Id and + // number of full Id. + size_t ret_val = 0; + + uint16 id_start = dep->id_start; + uint16 id_num = dep->id_num; + + // 2. Begin extending. + MileStone *mile_stone = mile_stones_ + from_handle; + + for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { + ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; + uint16 ext_num = p_mark.node_num; + for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { + LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos; + size_t found_start = 0; + size_t found_num = 0; + + for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { + assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); + LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos; + if (son->spl_idx >= id_start + && son->spl_idx < id_start + id_num) { + if (*lpi_num < lpi_max) { + size_t homo_buf_off = get_homo_idx_buf_offset(son); + *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), + lpi_max - *lpi_num, homo_buf_off, son, + dep->splids_extended + 1); + } + + // If necessary, fill in the new DTMI + if (0 == found_num) { + found_start = son_pos; + } + found_num++; + } + if (son->spl_idx >= id_start + id_num - 1 || son_pos == + (size_t)node->num_of_son - 1) { + if (found_num > 0) { + if (mile_stones_pos_ < kMaxMileStone && + parsing_marks_pos_ < kMaxParsingMark) { + parsing_marks_[parsing_marks_pos_].node_offset = + get_son_offset(node) + found_start; + parsing_marks_[parsing_marks_pos_].node_num = found_num; + if (0 == ret_val) + mile_stones_[mile_stones_pos_].mark_start = + parsing_marks_pos_; + parsing_marks_pos_++; + } + + ret_val++; + } + break; + } + } // for son_pos + } // for ext_pos + } // for h_pos + + if (ret_val > 0) { + mile_stones_[mile_stones_pos_].mark_num = ret_val; + ret_handle = mile_stones_pos_; + mile_stones_pos_++; + } + + // printf("----- parsing marks: %d, 
mile stone: %d \n", parsing_marks_pos_, + // mile_stones_pos_); + return ret_handle; +} + +bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num, + LemmaIdType id_lemma) { + if (0 == splid_num || NULL == splids) + return false; + + void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart]; + + for (uint16 pos = 1; pos < splid_num; pos++) { + if (1 == pos) { + LmaNodeLE0 *node_le0 = reinterpret_cast(node); + LmaNodeGE1 *node_son; + uint16 son_pos; + for (son_pos = 0; son_pos < static_cast(node_le0->num_of_son); + son_pos++) { + assert(node_le0->son_1st_off <= lma_node_num_ge1_); + node_son = nodes_ge1_ + node_le0->son_1st_off + + son_pos; + if (node_son->spl_idx == splids[pos]) + break; + } + if (son_pos < node_le0->num_of_son) + node = reinterpret_cast(node_son); + else + return false; + } else { + LmaNodeGE1 *node_ge1 = reinterpret_cast(node); + LmaNodeGE1 *node_son; + uint16 son_pos; + for (son_pos = 0; son_pos < static_cast(node_ge1->num_of_son); + son_pos++) { + assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0); + node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos; + if (node_son->spl_idx == splids[pos]) + break; + } + if (son_pos < node_ge1->num_of_son) + node = reinterpret_cast(node_son); + else + return false; + } + } + + if (1 == splid_num) { + LmaNodeLE0* node_le0 = reinterpret_cast(node); + size_t num_of_homo = (size_t)node_le0->num_of_homo; + for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { + LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); + char16 str[2]; + get_lemma_str(id_this, str, 2); + if (id_this == id_lemma) + return true; + } + } else { + LmaNodeGE1* node_ge1 = reinterpret_cast(node); + size_t num_of_homo = (size_t)node_ge1->num_of_homo; + for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { + size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); + if (get_lemma_id(node_homo_off + homo_pos) == id_lemma) + return true; + } + } + + return false; 
+} + +size_t DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len, + LmaPsbItem* lma_buf, size_t max_lma_buf) { + if (splid_str_len > kMaxLemmaSize) + return 0; + +#define MAX_EXTENDBUF_LEN 200 + + size_t* node_buf1[MAX_EXTENDBUF_LEN]; // use size_t for data alignment + size_t* node_buf2[MAX_EXTENDBUF_LEN]; + LmaNodeLE0** node_fr_le0 = + reinterpret_cast(node_buf1); // Nodes from. + LmaNodeLE0** node_to_le0 = + reinterpret_cast(node_buf2); // Nodes to. + LmaNodeGE1** node_fr_ge1 = NULL; + LmaNodeGE1** node_to_ge1 = NULL; + size_t node_fr_num = 1; + size_t node_to_num = 0; + node_fr_le0[0] = root_; + if (NULL == node_fr_le0[0]) + return 0; + + size_t spl_pos = 0; + + while (spl_pos < splid_str_len) { + uint16 id_num = 1; + uint16 id_start = splid_str[spl_pos]; + // If it is a half id + if (spl_trie_->is_half_id(splid_str[spl_pos])) { + id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start); + assert(id_num > 0); + } + + // Extend the nodes + if (0 == spl_pos) { // From LmaNodeLE0 (root) to LmaNodeLE0 nodes + for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { + LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; + assert(node == root_ && 1 == node_fr_num); + size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; + size_t son_end = + splid_le0_index_[id_start + id_num - kFullSplIdStart]; + for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { + assert(1 == node->son_1st_off); + LmaNodeLE0 *node_son = root_ + son_pos; + assert(node_son->spl_idx >= id_start + && node_son->spl_idx < id_start + id_num); + if (node_to_num < MAX_EXTENDBUF_LEN) { + node_to_le0[node_to_num] = node_son; + node_to_num++; + } + // id_start + id_num - 1 is the last one, which has just been + // recorded. 
+ if (node_son->spl_idx >= id_start + id_num - 1) + break; + } + } + + spl_pos++; + if (spl_pos >= splid_str_len || node_to_num == 0) + break; + // Prepare the nodes for next extending + // next time, from LmaNodeLE0 to LmaNodeGE1 + LmaNodeLE0** node_tmp = node_fr_le0; + node_fr_le0 = node_to_le0; + node_to_le0 = NULL; + node_to_ge1 = reinterpret_cast(node_tmp); + } else if (1 == spl_pos) { // From LmaNodeLE0 to LmaNodeGE1 nodes + for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { + LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; + for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; + son_pos++) { + assert(node->son_1st_off <= lma_node_num_ge1_); + LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off + + son_pos; + if (node_son->spl_idx >= id_start + && node_son->spl_idx < id_start + id_num) { + if (node_to_num < MAX_EXTENDBUF_LEN) { + node_to_ge1[node_to_num] = node_son; + node_to_num++; + } + } + // id_start + id_num - 1 is the last one, which has just been + // recorded. 
+ if (node_son->spl_idx >= id_start + id_num - 1) + break; + } + } + + spl_pos++; + if (spl_pos >= splid_str_len || node_to_num == 0) + break; + // Prepare the nodes for next extending + // next time, from LmaNodeGE1 to LmaNodeGE1 + node_fr_ge1 = node_to_ge1; + node_to_ge1 = reinterpret_cast(node_fr_le0); + node_fr_le0 = NULL; + node_to_le0 = NULL; + } else { // From LmaNodeGE1 to LmaNodeGE1 nodes + for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { + LmaNodeGE1 *node = node_fr_ge1[node_fr_pos]; + for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; + son_pos++) { + assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); + LmaNodeGE1 *node_son = nodes_ge1_ + + get_son_offset(node) + son_pos; + if (node_son->spl_idx >= id_start + && node_son->spl_idx < id_start + id_num) { + if (node_to_num < MAX_EXTENDBUF_LEN) { + node_to_ge1[node_to_num] = node_son; + node_to_num++; + } + } + // id_start + id_num - 1 is the last one, which has just been + // recorded. + if (node_son->spl_idx >= id_start + id_num - 1) + break; + } + } + + spl_pos++; + if (spl_pos >= splid_str_len || node_to_num == 0) + break; + // Prepare the nodes for next extending + // next time, from LmaNodeGE1 to LmaNodeGE1 + LmaNodeGE1 **node_tmp = node_fr_ge1; + node_fr_ge1 = node_to_ge1; + node_to_ge1 = node_tmp; + } + + // The number of node for next extending + node_fr_num = node_to_num; + node_to_num = 0; + } // while + + if (0 == node_to_num) + return 0; + + NGram &ngram = NGram::get_instance(); + size_t lma_num = 0; + + // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e', + // only those candidates for the full matched one-char id will be returned. + if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0])) + node_to_num = node_to_num > 0 ? 
1 : 0; + + for (size_t node_pos = 0; node_pos < node_to_num; node_pos++) { + size_t num_of_homo = 0; + if (spl_pos <= 1) { // Get from LmaNodeLE0 nodes + LmaNodeLE0* node_le0 = node_to_le0[node_pos]; + num_of_homo = (size_t)node_le0->num_of_homo; + for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { + size_t ch_pos = lma_num + homo_pos; + lma_buf[ch_pos].id = + get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); + lma_buf[ch_pos].lma_len = 1; + lma_buf[ch_pos].psb = + static_cast(ngram.get_uni_psb(lma_buf[ch_pos].id)); + + if (lma_num + homo_pos >= max_lma_buf - 1) + break; + } + } else { // Get from LmaNodeGE1 nodes + LmaNodeGE1* node_ge1 = node_to_ge1[node_pos]; + num_of_homo = (size_t)node_ge1->num_of_homo; + for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { + size_t ch_pos = lma_num + homo_pos; + size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); + lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos); + lma_buf[ch_pos].lma_len = splid_str_len; + lma_buf[ch_pos].psb = + static_cast(ngram.get_uni_psb(lma_buf[ch_pos].id)); + + if (lma_num + homo_pos >= max_lma_buf - 1) + break; + } + } + + lma_num += num_of_homo; + if (lma_num >= max_lma_buf) { + lma_num = max_lma_buf; + break; + } + } + return lma_num; +} + +uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, + uint16 str_max) { + return dict_list_->get_lemma_str(id_lemma, str_buf, str_max); +} + +uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, + uint16 splids_max, bool arg_valid) { + char16 lma_str[kMaxLemmaSize + 1]; + uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1); + assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max); + + uint16 spl_mtrx[kMaxLemmaSize * 5]; + uint16 spl_start[kMaxLemmaSize + 1]; + spl_start[0] = 0; + uint16 try_num = 1; + + for (uint16 pos = 0; pos < lma_len; pos++) { + uint16 cand_splids_this = 0; + if (arg_valid && spl_trie_->is_full_id(splids[pos])) { + 
spl_mtrx[spl_start[pos]] = splids[pos]; + cand_splids_this = 1; + } else { + cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos], + arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos], + kMaxLemmaSize * 5 - spl_start[pos]); + assert(cand_splids_this > 0); + } + spl_start[pos + 1] = spl_start[pos] + cand_splids_this; + try_num *= cand_splids_this; + } + + for (uint16 try_pos = 0; try_pos < try_num; try_pos++) { + uint16 mod = 1; + for (uint16 pos = 0; pos < lma_len; pos++) { + uint16 radix = spl_start[pos + 1] - spl_start[pos]; + splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix]; + mod *= radix; + } + + if (try_extend(splids, lma_len, id_lemma)) + return lma_len; + } + + return 0; +} + +void DictTrie::set_total_lemma_count_of_others(size_t count) { + NGram& ngram = NGram::get_instance(); + ngram.set_total_freq_none_sys(count); +} + +void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) { + return dict_list_->convert_to_hanzis(str, str_len); +} + +void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) { + return dict_list_->convert_to_scis_ids(str, str_len); +} + +LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) { + if (NULL == lemma_str || lemma_len > kMaxLemmaSize) + return 0; + + return dict_list_->get_lemma_id(lemma_str, lemma_len); +} + +size_t DictTrie::predict_top_lmas(size_t his_len, NPredictItem *npre_items, + size_t npre_max, size_t b4_used) { + NGram &ngram = NGram::get_instance(); + + size_t item_num = 0; + size_t top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_; + size_t top_lmas_pos = 0; + while (item_num < npre_max && top_lmas_pos < top_lmas_num_) { + memset(npre_items + item_num, 0, sizeof(NPredictItem)); + LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos); + top_lmas_pos += 1; + if (dict_list_->get_lemma_str(top_lma_id, + npre_items[item_num].pre_hzs, + kMaxLemmaSize - 1) == 0) { + continue; + } + npre_items[item_num].psb = 
// The system (trie based) atom dictionary.
// Note: the base class is listed without an access specifier, so DictTrie
// inherits from AtomDictBase privately.
class DictTrie : AtomDictBase {
 private:
  // One parsing mark: a run of node_num nodes starting at node_offset.
  struct ParsingMark {
    size_t node_offset:24;
    size_t node_num:8;           // Number of nodes with this spelling id given
                                 // by spl_id. If spl_id is a Shengmu, for nodes
                                 // in the first layer of DictTrie, it equals to
                                 // SpellingTrie::shm2full_num(); but for those
                                 // nodes which are not in the first layer,
                                 // node_num < SpellingTrie::shm2full_num().
                                 // For a full spelling id, node_num = 1;
  };

  // Used to indicate an extended mile stone.
  // An extended mile stone is used to mark a partial match in the dictionary
  // trie to speed up further potential extending.
  // For example, when the user inputs "w", a mile stone is created to mark the
  // partial match status, so that when user inputs another char 'm', it will be
  // faster to extend search space based on this mile stone.
  //
  // For partial match status of "wm", there can be more than one sub mile
  // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
  // there may be more than one parsing mark used to mark these partial
  // matchings.
  // A mile stone records the starting position in the mark list and number of
  // marks.
  struct MileStone {
    uint16 mark_start;
    uint16 mark_num;
  };

  DictList* dict_list_;

  const SpellingTrie *spl_trie_;

  LmaNodeLE0* root_;        // Nodes for root and the first layer.
  LmaNodeGE1* nodes_ge1_;   // Nodes for other layers.

  // A quick index from spelling id to the LmaNodeLE0 node buffer, or
  // to the root_ buffer.
  // Index length:
  // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
  // to get the end.
  // All Shengmu ids are not indexed because they will be converted into
  // corresponding full ids.
  // So, given an id splid, the son is:
  // root_[splid_le0_index_[splid - kFullSplIdStart]]
  uint16 *splid_le0_index_;

  uint32 lma_node_num_le0_;
  uint32 lma_node_num_ge1_;

  // The first part is for homophones, and the last top_lmas_num_ items are
  // lemmas with highest scores.
  unsigned char *lma_idx_buf_;
  uint32 lma_idx_buf_len_;  // The total size of lma_idx_buf_ in bytes.
  uint32 total_lma_num_;    // Total number of lemmas in this dictionary.
  uint32 top_lmas_num_;     // Number of lemmas with highest scores.

  // Parsing mark list used to mark the detailed extended statuses.
  ParsingMark *parsing_marks_;
  // The position for next available mark.
  uint16 parsing_marks_pos_;

  // Mile stone list used to mark the extended status.
  MileStone *mile_stones_;
  // The position for the next available mile stone. We use positions (except 0)
  // as handles.
  MileStoneHandle mile_stones_pos_;

  // Get the offset of sons for a node.
  inline size_t get_son_offset(const LmaNodeGE1 *node);

  // Get the offset of homophone ids for a node.
  inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);

  // Get the lemma id by the offset.
  inline LemmaIdType get_lemma_id(size_t id_offset);

  void free_resource(bool free_dict_list);

  // Load the dictionary from an already opened stream.
  bool load_dict(FILE *fp);

  // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by the search engine.
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         LmaNodeLE0 *node);

  // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by inner functions extend_dict0(), extend_dict1()
  // and extend_dict2().
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         size_t homo_buf_off, LmaNodeGE1 *node,
                         uint16 lma_len);

  // Extend in the trie from level 0.
  MileStoneHandle extend_dict0(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 1.
  MileStoneHandle extend_dict1(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 2.
  MileStoneHandle extend_dict2(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Try to extend the given spelling id buffer, and if the given id_lemma can
  // be successfully gotten, return true;
  // The given spelling ids are all valid full ids.
  bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);

#ifdef ___BUILD_MODEL___
  bool save_dict(FILE *fp);
#endif  // ___BUILD_MODEL___

  // Capacity limits for the mile stone and parsing mark lists, and the first
  // handle value that denotes a real mile stone (position 0 is not a handle).
  static const int kMaxMileStone = 100;
  static const int kMaxParsingMark = 600;
  static const MileStoneHandle kFirstValidMileStoneHandle = 1;

  friend class DictParser;
  friend class DictBuilder;

 public:

  DictTrie();
  ~DictTrie();

#ifdef ___BUILD_MODEL___
  // Construct the tree from the file fn_raw.
  // fn_validhzs provide the valid hanzi list. If fn_validhzs is
  // NULL, only chars in GB2312 will be included.
  bool build_dict(const char *fn_raw, const char *fn_validhzs);

  // Save the binary dictionary
  // Actually, the SpellingTrie/DictList instance will be also saved.
  bool save_dict(const char *filename);
#endif  // ___BUILD_MODEL___

  void convert_to_hanzis(char16 *str, uint16 str_len);

  void convert_to_scis_ids(char16 *str, uint16 str_len);

  // Load a binary dictionary
  // The SpellingTrie instance/DictList will be also loaded
  bool load_dict(const char *filename, LemmaIdType start_id,
                 LemmaIdType end_id);
  // Same as load_dict(), but reads from a file descriptor, using the given
  // start offset and length.
  bool load_dict_fd(int sys_fd, long start_offset, long length,
                    LemmaIdType start_id, LemmaIdType end_id);
  // Stub: always reports success.
  bool close_dict() {return true;}
  // Stub: always reports 0.
  size_t number_of_lemmas() {return 0;}

  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);

  MileStoneHandle extend_dict(MileStoneHandle from_handle,
                              const DictExtPara *dep,
                              LmaPsbItem *lpi_items,
                              size_t lpi_max, size_t *lpi_num);

  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                  LmaPsbItem *lpi_items, size_t lpi_max);

  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                          uint16 splids_max, bool arg_valid);

  size_t predict(const char16 *last_hzs, uint16 hzs_len,
                 NPredictItem *npre_items, size_t npre_max,
                 size_t b4_used);

  // The following inline members are stubs in this class: they ignore their
  // arguments and return 0 / false.
  LemmaIdType put_lemma(char16 /*lemma_str*/[], uint16 /*splids*/[],
                        uint16 /*lemma_len*/, uint16 /*count*/) {return 0;}

  LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/,
                           bool /*selected*/) {return 0;}

  LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[],
                           uint16 /*lemma_len*/) {return 0;}

  LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;}

  LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[],
                        uint16 /*lemma_len*/) {return 0;}

  bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;}

  size_t get_total_lemma_count() {return 0;}
  void set_total_lemma_count_of_others(size_t count);

  // Stub: nothing to flush.
  void flush_cache() {}

  LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);

  // Fill the lemmas with highest scores to the prediction buffer.
  // his_len is the history length to fill in the prediction buffer.
  size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
                          size_t npre_max, size_t b4_used);
};
debug|release){ + TARGET = ../../plugin/googlepinyin/$$join(TARGET,,,d) + }CONFIG(release, debug|release){ + TARGET = ../../plugin/googlepinyin/$$TARGET + } +} +unix{ + TARGET = ../plugin/googlepinyin/$$TARGET + MOC_DIR = ../tmpfiles + RCC_DIR = ../tmpfiles + UI_DIR = ../tmpfiles + OBJECTS_DIR = ../tmpfiles +} diff --git a/googlepinyin/lpicache.cpp b/googlepinyin/lpicache.cpp new file mode 100644 index 0000000..f185499 --- /dev/null +++ b/googlepinyin/lpicache.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "lpicache.h" + +namespace ime_pinyin { + +LpiCache* LpiCache::instance_ = NULL; + +LpiCache::LpiCache() { + lpi_cache_ = new LmaPsbItem[kFullSplIdStart * kMaxLpiCachePerId]; + lpi_cache_len_ = new uint16[kFullSplIdStart]; + assert(NULL != lpi_cache_); + assert(NULL != lpi_cache_len_); + for (uint16 id = 0; id < kFullSplIdStart; id++) + lpi_cache_len_[id] = 0; +} + +LpiCache::~LpiCache() { + if (NULL != lpi_cache_) + delete [] lpi_cache_; + + if (NULL != lpi_cache_len_) + delete [] lpi_cache_len_; +} + +LpiCache& LpiCache::get_instance() { + if (NULL == instance_) { + instance_ = new LpiCache(); + assert(NULL != instance_); + } + return *instance_; +} + +bool LpiCache::is_cached(uint16 splid) { + if (splid >= kFullSplIdStart) + return false; + return lpi_cache_len_[splid] != 0; +} + +size_t LpiCache::put_cache(uint16 splid, LmaPsbItem lpi_items[], + size_t lpi_num) { + uint16 num = kMaxLpiCachePerId; + if (num > lpi_num) + num = static_cast(lpi_num); + + LmaPsbItem *lpi_cache_this = lpi_cache_ + splid * kMaxLpiCachePerId; + for (uint16 pos = 0; pos < num; pos++) + lpi_cache_this[pos] = lpi_items[pos]; + + lpi_cache_len_[splid] = num; + return num; +} + +size_t LpiCache::get_cache(uint16 splid, LmaPsbItem lpi_items[], + size_t lpi_max) { + if (lpi_max > lpi_cache_len_[splid]) + lpi_max = lpi_cache_len_[splid]; + + LmaPsbItem *lpi_cache_this = lpi_cache_ + splid * kMaxLpiCachePerId; + for (uint16 pos = 0; pos < lpi_max; pos++) { + lpi_items[pos] = lpi_cache_this[pos]; + } + return lpi_max; +} + +} // namespace ime_pinyin diff --git a/googlepinyin/lpicache.h b/googlepinyin/lpicache.h new file mode 100644 index 0000000..6073597 --- /dev/null +++ b/googlepinyin/lpicache.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
// Used to cache LmaPsbItem list for half spelling ids.
class LpiCache {
 private:
  // The instance handed out by get_instance().
  static LpiCache *instance_;
  // Maximum number of items kept per spelling id.
  static const int kMaxLpiCachePerId = 15;

  // Cached items and, per spelling id, the number of items cached.
  LmaPsbItem *lpi_cache_;
  uint16 *lpi_cache_len_;

 public:
  LpiCache();
  ~LpiCache();

  static LpiCache& get_instance();

  // Test if the LPI list of the given splid has been cached.
  // If splid is a full spelling id, it returns false, because we only cache
  // list for half ids.
  bool is_cached(uint16 splid);

  // Put LPI list to cache. If the length of the list, lpi_num, is longer than
  // the cache buffer, the list will be truncated, and the function returns
  // the maximum length of the cache buffer.
  // Note: splid must be a half id, and lpi_items must be not NULL. The
  // caller of this function should guarantee this.
  size_t put_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_num);

  // Get the cached list for the given half id.
  // Return the length of the cached buffer.
  // Note: splid must be a half id, and lpi_items must be not NULL. The
  // caller of this function should guarantee this.
  size_t get_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_max);
};
+ size_t get_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_max); +}; + +} // namespace + +#endif // PINYINIME_ANDPY_INCLUDE_LPICACHE_H__ diff --git a/googlepinyin/matrixsearch.cpp b/googlepinyin/matrixsearch.cpp new file mode 100644 index 0000000..022628b --- /dev/null +++ b/googlepinyin/matrixsearch.cpp @@ -0,0 +1,1981 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "lpicache.h" +#include "matrixsearch.h" +#include "mystdlib.h" +#include "ngram.h" +#include "userdict.h" + +namespace ime_pinyin { + +#define PRUMING_SCORE 8000.0 + +MatrixSearch::MatrixSearch() { + inited_ = false; + spl_trie_ = SpellingTrie::get_cpinstance(); + + reset_pointers_to_null(); + + pys_decoded_len_ = 0; + mtrx_nd_pool_used_ = 0; + dmi_pool_used_ = 0; + xi_an_enabled_ = false; + dmi_c_phrase_ = false; + + assert(kMaxSearchSteps > 0); + max_sps_len_ = kMaxSearchSteps - 1; + max_hzs_len_ = kMaxSearchSteps; +} + +MatrixSearch::~MatrixSearch() { + free_resource(); +} + +void MatrixSearch::reset_pointers_to_null() { + dict_trie_ = NULL; + user_dict_ = NULL; + spl_parser_ = NULL; + + share_buf_ = NULL; + + // The following four buffers are used for decoding, and they are based on + // share_buf_, no need to delete them. + mtrx_nd_pool_ = NULL; + dmi_pool_ = NULL; + matrix_ = NULL; + dep_ = NULL; + + // Based on share_buf_, no need to delete them. 
+ npre_items_ = NULL; +} + +bool MatrixSearch::alloc_resource() { + free_resource(); + + dict_trie_ = new DictTrie(); + user_dict_ = static_cast(new UserDict()); + spl_parser_ = new SpellingParser(); + + size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize; + mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t); + size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize; + dmi_size = align_to_size_t(dmi_size) / sizeof(size_t); + size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum; + matrix_size = align_to_size_t(matrix_size) / sizeof(size_t); + size_t dep_size = sizeof(DictExtPara); + dep_size = align_to_size_t(dep_size) / sizeof(size_t); + + // share_buf's size is determined by the buffers for search. + share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size]; + + if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ || + NULL == share_buf_) + return false; + + // The buffers for search are based on the share buffer + mtrx_nd_pool_ = reinterpret_cast(share_buf_); + dmi_pool_ = reinterpret_cast(share_buf_ + mtrx_nd_size); + matrix_ = reinterpret_cast(share_buf_ + mtrx_nd_size + dmi_size); + dep_ = reinterpret_cast + (share_buf_ + mtrx_nd_size + dmi_size + matrix_size); + + // The prediction buffer is also based on the share buffer. 
+ npre_items_ = reinterpret_cast(share_buf_); + npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) * + sizeof(size_t) / sizeof(NPredictItem); + return true; +} + +void MatrixSearch::free_resource() { + if (NULL != dict_trie_) + delete dict_trie_; + + if (NULL != user_dict_) + delete user_dict_; + + if (NULL != spl_parser_) + delete spl_parser_; + + if (NULL != share_buf_) + delete [] share_buf_; + + reset_pointers_to_null(); +} + +bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) { + if (NULL == fn_sys_dict || NULL == fn_usr_dict) + return false; + + if (!alloc_resource()) + return false; + + if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd)) + return false; + + // If engine fails to load the user dictionary, reset the user dictionary + // to NULL. + if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { + delete user_dict_; + user_dict_ = NULL; + } else{ + user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); + } + + reset_search0(); + + inited_ = true; + return true; +} + +bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length, + const char *fn_usr_dict) { + if (NULL == fn_usr_dict) + return false; + + if (!alloc_resource()) + return false; + + if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd)) + return false; + + if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { + delete user_dict_; + user_dict_ = NULL; + } else { + user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); + } + + reset_search0(); + + inited_ = true; + return true; +} + +void MatrixSearch::init_user_dictionary(const char *fn_usr_dict) { + assert(inited_); + + if (NULL != user_dict_) { + delete user_dict_; + user_dict_ = NULL; + } + + if (NULL != fn_usr_dict) { + user_dict_ = static_cast(new UserDict()); + if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { + delete user_dict_; + user_dict_ = NULL; + } 
+ } + + reset_search0(); +} + +bool MatrixSearch::is_user_dictionary_enabled() const { + return NULL != user_dict_; +} + +void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) { + if (0 != max_sps_len) + max_sps_len_ = max_sps_len; + if (0 != max_hzs_len) + max_hzs_len_ = max_hzs_len; +} + +void MatrixSearch::close() { + flush_cache(); + free_resource(); + inited_ = false; +} + +void MatrixSearch::flush_cache() { + if (NULL != user_dict_) + user_dict_->flush_cache(); +} + +void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) { + xi_an_enabled_ = xi_an_enabled; +} + +bool MatrixSearch::get_xi_an_switch() { + return xi_an_enabled_; +} + +bool MatrixSearch::reset_search() { + if (!inited_) + return false; + return reset_search0(); +} + +bool MatrixSearch::reset_search0() { + if (!inited_) + return false; + + pys_decoded_len_ = 0; + mtrx_nd_pool_used_ = 0; + dmi_pool_used_ = 0; + + // Get a MatrixNode from the pool + matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_; + matrix_[0].mtrx_nd_num = 1; + mtrx_nd_pool_used_ += 1; + + // Update the node, and make it to be a starting node + MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos; + node->id = 0; + node->score = 0; + node->from = NULL; + node->step = 0; + node->dmi_fr = (PoolPosType)-1; + + matrix_[0].dmi_pos = 0; + matrix_[0].dmi_num = 0; + matrix_[0].dmi_has_full_id = 1; + matrix_[0].mtrx_nd_fixed = node; + + lma_start_[0] = 0; + fixed_lmas_ = 0; + spl_start_[0] = 0; + fixed_hzs_ = 0; + + dict_trie_->reset_milestones(0, 0); + if (NULL != user_dict_) + user_dict_->reset_milestones(0, 0); + + return true; +} + +bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step, + bool clear_dmi_this_step, + bool clear_mtrx_this_step) { + if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum) + return false; + + if (0 == ch_pos) { + reset_search0(); + } else { + // Prepare mile stones of this step to clear. 
+ MileStoneHandle *dict_handles_to_clear = NULL; + if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) { + dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles; + } + + // If there are more steps, and this step is not allowed to clear, find + // milestones of next step. + if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) { + dict_handles_to_clear = NULL; + if (matrix_[ch_pos + 1].dmi_num > 0) { + dict_handles_to_clear = + dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles; + } + } + + if (NULL != dict_handles_to_clear) { + dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]); + if (NULL != user_dict_) + user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]); + } + + pys_decoded_len_ = ch_pos; + + if (clear_dmi_this_step) { + dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos + + matrix_[ch_pos - 1].dmi_num; + matrix_[ch_pos].dmi_num = 0; + } else { + dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num; + } + + if (clear_mtrx_this_step) { + mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos + + matrix_[ch_pos - 1].mtrx_nd_num; + matrix_[ch_pos].mtrx_nd_num = 0; + } else { + mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos + + matrix_[ch_pos].mtrx_nd_num; + } + + // Modify fixed_hzs_ + if (fixed_hzs_ > 0 && + ((kLemmaIdComposing != lma_id_[0]) || + (kLemmaIdComposing == lma_id_[0] && + spl_start_[c_phrase_.length] <= ch_pos))) { + size_t fixed_ch_pos = ch_pos; + if (clear_fixed_this_step) + fixed_ch_pos = fixed_ch_pos > 0 ? 
fixed_ch_pos - 1 : 0; + while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0) + fixed_ch_pos--; + + fixed_lmas_ = 0; + fixed_hzs_ = 0; + if (fixed_ch_pos > 0) { + while (spl_start_[fixed_hzs_] < fixed_ch_pos) + fixed_hzs_++; + assert(spl_start_[fixed_hzs_] == fixed_ch_pos); + + while (lma_start_[fixed_lmas_] < fixed_hzs_) + fixed_lmas_++; + assert(lma_start_[fixed_lmas_] == fixed_hzs_); + } + + // Re-search the Pinyin string for the unlocked lemma + // which was previously fixed. + // + // Prepare mile stones of this step to clear. + MileStoneHandle *dict_handles_to_clear = NULL; + if (clear_dmi_this_step && ch_pos == fixed_ch_pos && + matrix_[fixed_ch_pos].dmi_num > 0) { + dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles; + } + + // If there are more steps, and this step is not allowed to clear, find + // milestones of next step. + if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) { + dict_handles_to_clear = NULL; + if (matrix_[fixed_ch_pos + 1].dmi_num > 0) { + dict_handles_to_clear = + dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles; + } + } + + if (NULL != dict_handles_to_clear) { + dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]); + if (NULL != user_dict_) + user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]); + } + + + pys_decoded_len_ = fixed_ch_pos; + + if (clear_dmi_this_step && ch_pos == fixed_ch_pos) { + dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos + + matrix_[fixed_ch_pos - 1].dmi_num; + matrix_[fixed_ch_pos].dmi_num = 0; + } else { + dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos + + matrix_[fixed_ch_pos].dmi_num; + } + + if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) { + mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos + + matrix_[fixed_ch_pos - 1].mtrx_nd_num; + matrix_[fixed_ch_pos].mtrx_nd_num = 0; + } else { + mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos + + matrix_[fixed_ch_pos].mtrx_nd_num; + } + + for (uint16 
re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) { + add_char(pys_[re_pos]); + } + } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) { + for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) { + uint16 splpos_begin = c_phrase_.sublma_start[subpos]; + uint16 splpos_end = c_phrase_.sublma_start[subpos + 1]; + for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) { + // If ch_pos is in this spelling + uint16 spl_start = c_phrase_.spl_start[splpos]; + uint16 spl_end = c_phrase_.spl_start[splpos + 1]; + if (ch_pos >= spl_start && ch_pos < spl_end) { + // Clear everything after this position + c_phrase_.chn_str[splpos] = static_cast('\0'); + c_phrase_.sublma_start[subpos + 1] = splpos; + c_phrase_.sublma_num = subpos + 1; + c_phrase_.length = splpos; + + if (splpos == splpos_begin) { + c_phrase_.sublma_num = subpos; + } + } + } + } + + // Extend the composing phrase. + reset_search0(); + dmi_c_phrase_ = true; + uint16 c_py_pos = 0; + while (c_py_pos < spl_start_[c_phrase_.length]) { + bool b_ac_tmp = add_char(pys_[c_py_pos]); + assert(b_ac_tmp); + c_py_pos++; + } + dmi_c_phrase_ = false; + + lma_id_num_ = 1; + fixed_lmas_ = 1; + fixed_lmas_no1_[0] = 0; // A composing string is always modified. + fixed_hzs_ = c_phrase_.length; + lma_start_[1] = fixed_hzs_; + lma_id_[0] = kLemmaIdComposing; + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; + } + } + + return true; +} + +void MatrixSearch::del_in_pys(size_t start, size_t len) { + while (start < kMaxRowNum - len && '\0' != pys_[start]) { + pys_[start] = pys_[start + len]; + start++; + } +} + +size_t MatrixSearch::search(const char *py, size_t py_len) { + if (!inited_ || NULL == py) + return 0; + + // If the search Pinyin string is too long, it will be truncated. + if (py_len > kMaxRowNum - 1) + py_len = kMaxRowNum - 1; + + // Compare the new string with the previous one. Find their prefix to + // increase search efficiency. 
// Deletes input at pos and decodes the remaining string again.
//
// pos is a character position in the Pinyin string when is_pos_in_splid is
// false, and an index into the spelling ids when it is true (then the whole
// spelling is removed). clear_fixed_this_step is passed on to reset_search()
// and also allows unlocking the last sub lemma of a composing phrase.
// Returns pys_decoded_len_ after the operation, or 0 if the engine is not
// initialized.
size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
                               bool clear_fixed_this_step) {
  if (!inited_)
    return 0;

  size_t reset_pos = pos;

  // Out of range for both Pinyin mode and Spelling id mode.
  // The character is removed from the not yet decoded tail, and decoding
  // simply continues from the already decoded length.
  if (pys_decoded_len_ <= pos) {
    del_in_pys(pos, 1);

    reset_pos = pys_decoded_len_;
    // Decode the string after the un-decoded position
    while ('\0' != pys_[reset_pos]) {
      if (!add_char(pys_[reset_pos])) {
        pys_decoded_len_ = reset_pos;
        break;
      }
      reset_pos++;
    }
    get_spl_start_id();
    prepare_candidates();
    return pys_decoded_len_;
  }

  // Spelling id mode, but out of range.
  if (is_pos_in_splid && spl_id_num_ <= pos)
    return pys_decoded_len_;

  // Begin to handle two modes respectively.
  // Pinyin mode by default
  size_t c_py_len = 0;  // The length of composing phrase's Pinyin
  size_t del_py_len = 1;
  if (!is_pos_in_splid) {
    // Pinyin mode is only allowed to delete beyond the fixed lemmas.
    if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
      return pys_decoded_len_;

    del_in_pys(pos, 1);

    // If the deleted character is just the one after the last fixed lemma
    if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
      // If all fixed lemmas have been merged, and the caller of the function
      // request to unlock the last fixed lemma.
      if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
        // Unlock the last sub lemma in the composing phrase. Because it is not
        // easy to unlock it directly. Instead, we re-decode the modified
        // composing phrase.
        c_phrase_.sublma_num--;
        c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
        reset_pos = spl_start_[c_phrase_.length];
        c_py_len = reset_pos;
      }
    }
  } else {
    // Spelling id mode: remove all characters of spelling pos.
    del_py_len = spl_start_[pos + 1] - spl_start_[pos];

    del_in_pys(spl_start_[pos], del_py_len);

    if (pos >= lma_start_[fixed_lmas_]) {
      // The spelling lies behind the fixed lemmas.
      c_py_len = 0;
      reset_pos = spl_start_[pos + 1] - del_py_len;
    } else {
      // The spelling lies inside the fixed lemmas; what remains of them is
      // merged (see merge_fixed_lmas()) if anything is left.
      c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
      reset_pos = c_py_len;
      if (c_py_len > 0)
        merge_fixed_lmas(pos);
    }
  }

  if (c_py_len > 0) {
    assert(c_phrase_.length > 0 && c_py_len ==
        c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
    // The composing phrase is valid, reset all search space,
    // and begin a new search which will only extend the composing
    // phrase.
    reset_search0();

    dmi_c_phrase_ = true;
    // Extend the composing phrase.
    uint16 c_py_pos = 0;
    while (c_py_pos < c_py_len) {
      bool b_ac_tmp = add_char(pys_[c_py_pos]);
      assert(b_ac_tmp);
      c_py_pos++;
    }
    dmi_c_phrase_ = false;

    // Fix the composing phrase as the first choice.
    lma_id_num_ = 1;
    fixed_lmas_ = 1;
    fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
    fixed_hzs_ = c_phrase_.length;
    lma_start_[1] = fixed_hzs_;
    lma_id_[0] = kLemmaIdComposing;
    matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
        matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
  } else {
    // Resetting search only clears pys_decoded_len_, but the string is kept.
    reset_search(reset_pos, clear_fixed_this_step, false, false);
  }

  // Decode the string after the delete position.
  while ('\0' != pys_[reset_pos]) {
    if (!add_char(pys_[reset_pos])) {
      pys_decoded_len_ = reset_pos;
      break;
    }
    reset_pos++;
  }

  get_spl_start_id();
  prepare_candidates();
  return pys_decoded_len_;
}
+ lma_id_num_ = 1; + fixed_lmas_ = 1; + fixed_lmas_no1_[0] = 0; // A composing string is always modified. + fixed_hzs_ = c_phrase_.length; + lma_start_[1] = fixed_hzs_; + lma_id_[0] = kLemmaIdComposing; + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; + } else { + // Reseting search only clear pys_decoded_len_, but the string is kept. + reset_search(reset_pos, clear_fixed_this_step, false, false); + } + + // Decode the string after the delete position. + while ('\0' != pys_[reset_pos]) { + if (!add_char(pys_[reset_pos])) { + pys_decoded_len_ = reset_pos; + break; + } + reset_pos++; + } + + get_spl_start_id(); + prepare_candidates(); + return pys_decoded_len_; +} + +size_t MatrixSearch::get_candidate_num() { + if (!inited_ || 0 == pys_decoded_len_ || + 0 == matrix_[pys_decoded_len_].mtrx_nd_num) + return 0; + + return 1 + lpi_total_; +} + +char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str, + size_t max_len) { + if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str) + return NULL; + + if (0 == cand_id) { + return get_candidate0(cand_str, max_len, NULL, false); + } else { + cand_id--; + } + + // For this case: the current sentence is a word only, and the user fixed it, + // so the result will be fixed to the sentence space, and + // lpi_total_ will be set to 0. + if (0 == lpi_total_) { + return get_candidate0(cand_str, max_len, NULL, false); + } + + LemmaIdType id = lpi_items_[cand_id].id; + char16 s[kMaxLemmaSize + 1]; + + uint16 s_len = lpi_items_[cand_id].lma_len; + if (s_len > 1) { + s_len = get_lemma_str(id, s, kMaxLemmaSize + 1); + } else { + // For a single character, Hanzi is ready. 
+ s[0] = lpi_items_[cand_id].hanzi; + s[1] = static_cast(0); + } + + if (s_len > 0 && max_len > s_len) { + utf16_strncpy(cand_str, s, s_len); + cand_str[s_len] = (char16)'\0'; + return cand_str; + } + + return NULL; +} + +void MatrixSearch::update_dict_freq() { + if (NULL != user_dict_) { + // Update the total frequency of all lemmas, including system lemmas and + // user dictionary lemmas. + size_t total_freq = user_dict_->get_total_lemma_count(); + dict_trie_->set_total_lemma_count_of_others(total_freq); + } +} + +bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to, + float score) { + if (lma_to - lma_fr <= 1 || NULL == user_dict_) + return false; + + char16 word_str[kMaxLemmaSize + 1]; + uint16 spl_ids[kMaxLemmaSize]; + + uint16 spl_id_fr = 0; + + for (uint16 pos = lma_fr; pos < lma_to; pos++) { + LemmaIdType lma_id = lma_id_[pos]; + if (is_user_lemma(lma_id)) { + user_dict_->update_lemma(lma_id, 1, true); + } + uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos]; + utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len); + + uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr, + kMaxLemmaSize + 1 - spl_id_fr); + assert(tmp == lma_len); + + tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true); + if (tmp != lma_len) { + return false; + } + + spl_id_fr += lma_len; + } + + assert(spl_id_fr <= kMaxLemmaSize); + + return user_dict_->put_lemma(static_cast(word_str), spl_ids, + spl_id_fr, 1); +} + +void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) { + if (dmi_pos >= dmi_pool_used_) return; + + DictMatchInfo *dmi = dmi_pool_ + dmi_pos; + + if (1 == nest_level) { + printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos); + } + if (dmi->dict_level > 1) { + debug_print_dmi(dmi->dmi_fr, nest_level + 1); + } + printf("---%d\n", dmi->dict_level); + printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]); + printf(" Spelling : %s, %d\n", SpellingTrie::get_instance(). 
+ get_spelling_str(dmi->spl_id), dmi->spl_id); + printf(" Total Pinyin Len: %d\n", dmi->splstr_len); + if (1 == nest_level) { + printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos); + } +} + +bool MatrixSearch::try_add_cand0_to_userdict() { + size_t new_cand_num = get_candidate_num(); + if (fixed_hzs_ > 0 && 1 == new_cand_num) { + float score_from = 0; + uint16 lma_id_from = 0; + uint16 pos = 0; + bool modified = false; + while (pos < fixed_lmas_) { + if (lma_start_[pos + 1] - lma_start_[lma_id_from] > + static_cast(kMaxLemmaSize)) { + float score_to_add = + mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] + .mtrx_nd_pos].score - score_from; + if (modified) { + score_to_add += 1.0; + if (score_to_add > NGram::kMaxScore) { + score_to_add = NGram::kMaxScore; + } + add_lma_to_userdict(lma_id_from, pos, score_to_add); + } + lma_id_from = pos; + score_from += score_to_add; + + // Clear the flag for next user lemma. + modified = false; + } + + if (0 == fixed_lmas_no1_[pos]) { + modified = true; + } + pos++; + } + + // Single-char word is not allowed to add to userdict. + if (lma_start_[pos] - lma_start_[lma_id_from] > 1) { + float score_to_add = + mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] + .mtrx_nd_pos].score - score_from; + if (modified) { + score_to_add += 1.0; + if (score_to_add > NGram::kMaxScore) { + score_to_add = NGram::kMaxScore; + } + add_lma_to_userdict(lma_id_from, pos, score_to_add); + } + } + } + return true; +} + +// Choose a candidate, and give new candidates for next step. +// If user finishes selection, we will try to communicate with user dictionary +// to add new items or update score of some existing items. +// +// Basic rule: +// 1. If user selects the first choice: +// 1.1. If the first choice is not a sentence, instead, it is a lemma: +// 1.1.1. If the first choice is a user lemma, notify the user +// dictionary that a user lemma is hit, and add occuring count +// by 1. +// 1.1.2. 
If the first choice is a system lemma, do nothing. +// 1.2. If the first choice is a sentence containing more than one lemma: +// 1.2.1. The whole sentence will be added as a user lemma. If the +// sentence contains user lemmas, -> hit, and add occuring count +// by 1. +size_t MatrixSearch::choose(size_t cand_id) { + if (!inited_ || 0 == pys_decoded_len_) + return 0; + + if (0 == cand_id) { + fixed_hzs_ = spl_id_num_; + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + + matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; + for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) { + fixed_lmas_no1_[pos] = 1; + } + fixed_lmas_ = lma_id_num_; + lpi_total_ = 0; // Clean all other candidates. + + // 1. It is the first choice + if (1 == lma_id_num_) { + // 1.1. The first choice is not a sentence but a lemma + if (is_user_lemma(lma_id_[0])) { + // 1.1.1. The first choice is a user lemma, notify the user dictionary + // that it is hit. + if (NULL != user_dict_) + user_dict_->update_lemma(lma_id_[0], 1, true); + } else { + // 1.1.2. do thing for a system lemma. + } + } else { + // 1.2. The first choice is a sentence. + // 1.2.1 Try to add the whole sentence to user dictionary, the whole + // sentence may be splitted into many items. + if (NULL != user_dict_) { + try_add_cand0_to_userdict(); + } + } + update_dict_freq(); + return 1; + } else { + cand_id--; + } + + // 2. It is not the full sentence candidate. + // Find the length of the candidate. + LemmaIdType id_chosen = lpi_items_[cand_id].id; + LmaScoreType score_chosen = lpi_items_[cand_id].psb; + size_t cand_len = lpi_items_[cand_id].lma_len; + + assert(cand_len > 0); + + // Notify the atom dictionary that this item is hit. + if (is_user_lemma(id_chosen)) { + if (NULL != user_dict_) { + user_dict_->update_lemma(id_chosen, 1, true); + } + update_dict_freq(); + } + + // 3. Fixed the chosen item. + // 3.1 Get the steps number. 
+ size_t step_fr = spl_start_[fixed_hzs_]; + size_t step_to = spl_start_[fixed_hzs_ + cand_len]; + + // 3.2 Save the length of the original string. + size_t pys_decoded_len = pys_decoded_len_; + + // 3.2 Reset the space of the fixed part. + reset_search(step_to, false, false, true); + + // 3.3 For the last character of the fixed part, the previous DMI + // information will be kept, while the MTRX information will be re-extended, + // and only one node will be extended. + matrix_[step_to].mtrx_nd_num = 0; + + LmaPsbItem lpi_item; + lpi_item.psb = score_chosen; + lpi_item.id = id_chosen; + + PoolPosType step_to_dmi_fr = match_dmi(step_to, + spl_id_ + fixed_hzs_, cand_len); + //assert(step_to_dmi_fr != static_cast(-1)); + + extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1, + step_to_dmi_fr, step_to); + + matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos; + mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos + + matrix_[step_to].mtrx_nd_num; + + if (id_chosen == lma_id_[fixed_lmas_]) + fixed_lmas_no1_[fixed_lmas_] = 1; + else + fixed_lmas_no1_[fixed_lmas_] = 0; + lma_id_[fixed_lmas_] = id_chosen; + lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len; + fixed_lmas_++; + fixed_hzs_ = fixed_hzs_ + cand_len; + + while (step_to != pys_decoded_len) { + bool b = add_char(pys_[step_to]); + assert(b); + step_to++; + } + + if (fixed_hzs_ < spl_id_num_) { + prepare_candidates(); + } else { + lpi_total_ = 0; + if (NULL != user_dict_) { + try_add_cand0_to_userdict(); + } + } + + return get_candidate_num(); +} + +size_t MatrixSearch::cancel_last_choice() { + if (!inited_ || 0 == pys_decoded_len_) + return 0; + + size_t step_start = 0; + if (fixed_hzs_ > 0) { + size_t step_end = spl_start_[fixed_hzs_]; + MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed; + assert(NULL != end_node); + + step_start = end_node->from->step; + + if (step_start > 0) { + DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr; + fixed_hzs_ -= dmi->dict_level; 
+ } else { + fixed_hzs_ = 0; + } + + reset_search(step_start, false, false, false); + + while (pys_[step_start] != '\0') { + bool b = add_char(pys_[step_start]); + assert(b); + step_start++; + } + + prepare_candidates(); + } + return get_candidate_num(); +} + +size_t MatrixSearch::get_fixedlen() { + if (!inited_ || 0 == pys_decoded_len_) + return 0; + return fixed_hzs_; +} + +bool MatrixSearch::prepare_add_char(char ch) { + if (pys_decoded_len_ >= kMaxRowNum - 1 || + (!spl_parser_->is_valid_to_parse(ch) && ch != '\'')) + return false; + + if (dmi_pool_used_ >= kDmiPoolSize) return false; + + pys_[pys_decoded_len_] = ch; + pys_decoded_len_++; + + MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_; + mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_; + mtrx_this_row->mtrx_nd_num = 0; + mtrx_this_row->dmi_pos = dmi_pool_used_; + mtrx_this_row->dmi_num = 0; + mtrx_this_row->dmi_has_full_id = 0; + + return true; +} + +bool MatrixSearch::is_split_at(uint16 pos) { + return !spl_parser_->is_valid_to_parse(pys_[pos - 1]); +} + +void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, + PoolPosType dmi_fr, uint16 spl_id, + uint16 node_num, unsigned char dict_level, + bool splid_end_split, unsigned char splstr_len, + unsigned char all_full_id) { + dmi->dict_handles[0] = handles[0]; + dmi->dict_handles[1] = handles[1]; + dmi->dmi_fr = dmi_fr; + dmi->spl_id = spl_id; + dmi->dict_level = dict_level; + dmi->splid_end_split = splid_end_split ? 1 : 0; + dmi->splstr_len = splstr_len; + dmi->all_full_id = all_full_id; + dmi->c_phrase = 0; +} + +bool MatrixSearch::add_char(char ch) { + if (!prepare_add_char(ch)) + return false; + return add_char_qwerty(); +} + +bool MatrixSearch::add_char_qwerty() { + matrix_[pys_decoded_len_].mtrx_nd_num = 0; + + bool spl_matched = false; + uint16 longest_ext = 0; + // Extend the search matrix, from the oldest unfixed row. ext_len means + // extending length. 
+ for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) { + if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_]) + continue; + + // Refer to the declaration of the variable dmi_has_full_id for the + // explanation of this piece of code. In one word, it is used to prevent + // from the unwise extending of "shoud ou" but allow the reasonable + // extending of "heng ao", "lang a", etc. + if (ext_len > 1 && 0 != longest_ext && + 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) { + if (xi_an_enabled_) + continue; + else + break; + } + + uint16 oldrow = pys_decoded_len_ - ext_len; + + // 0. If that row is before the last fixed step, ignore. + if (spl_start_[fixed_hzs_] > oldrow) + continue; + + // 1. Check if that old row has valid MatrixNode. If no, means that row is + // not a boundary, either a word boundary or a spelling boundary. + // If it is for extending composing phrase, it's OK to ignore the 0. + if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_) + continue; + + // 2. Get spelling id(s) for the last ext_len chars. + uint16 spl_idx; + bool is_pre = false; + spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow, + ext_len, &is_pre); + if (is_pre) + spl_matched = true; + + if (0 == spl_idx) + continue; + + bool splid_end_split = is_split_at(oldrow + ext_len); + + // 3. Extend the DMI nodes of that old row + // + 1 is to extend an extra node from the root + for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos; + dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1; + dmi_pos++) { + DictMatchInfo *dmi = dmi_pool_ + dmi_pos; + if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) { + dmi = NULL; // The last one, NULL means extending from the root. + } else { + // If the dmi is covered by the fixed arrange, ignore it. 
+ if (fixed_hzs_ > 0 && + pys_decoded_len_ - ext_len - dmi->splstr_len < + spl_start_[fixed_hzs_]) { + continue; + } + // If it is not in mode for composing phrase, and the source DMI node + // is marked for composing phrase, ignore this node. + if (dmi->c_phrase != 0 && !dmi_c_phrase_) { + continue; + } + } + + // For example, if "gao" is extended, "g ao" is not allowed. + // or "zh" has been passed, "z h" is not allowed. + // Both word and word-connection will be prevented. + if (longest_ext > ext_len) { + if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) { + continue; + } + + // "z h" is not allowed. + if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) { + continue; + } + } + + dep_->splids_extended = 0; + if (NULL != dmi) { + uint16 prev_ids_num = dmi->dict_level; + if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) || + (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) { + continue; + } + + DictMatchInfo *d = dmi; + while (d) { + dep_->splids[--prev_ids_num] = d->spl_id; + if ((PoolPosType)-1 == d->dmi_fr) + break; + d = dmi_pool_ + d->dmi_fr; + } + assert(0 == prev_ids_num); + dep_->splids_extended = dmi->dict_level; + } + dep_->splids[dep_->splids_extended] = spl_idx; + dep_->ext_len = ext_len; + dep_->splid_end_split = splid_end_split; + + dep_->id_num = 1; + dep_->id_start = spl_idx; + if (spl_trie_->is_half_id(spl_idx)) { + // Get the full id list + dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start)); + assert(dep_->id_num > 0); + } + + uint16 new_dmi_num; + + new_dmi_num = extend_dmi(dep_, dmi); + + if (new_dmi_num > 0) { + if (dmi_c_phrase_) { + dmi_pool_[dmi_pool_used_].c_phrase = 1; + } + matrix_[pys_decoded_len_].dmi_num += new_dmi_num; + dmi_pool_used_ += new_dmi_num; + + if (!spl_trie_->is_half_id(spl_idx)) + matrix_[pys_decoded_len_].dmi_has_full_id = 1; + } + + // If get candiate lemmas, try to extend the path + if (lpi_total_ > 0) { + uint16 fr_row; + if (NULL == dmi) { + fr_row = oldrow; + } else { + assert(oldrow >= 
dmi->splstr_len); + fr_row = oldrow - dmi->splstr_len; + } + for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos; + mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos + + matrix_[fr_row].mtrx_nd_num; + mtrx_nd_pos++) { + MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos; + + extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_, + dmi_pool_used_ - new_dmi_num, pys_decoded_len_); + if (longest_ext == 0) + longest_ext = ext_len; + } + } + } // for dmi_pos + } // for ext_len + mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num; + + if (dmi_c_phrase_) + return true; + + return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched); +} + +void MatrixSearch::prepare_candidates() { + // Get candiates from the first un-fixed step. + uint16 lma_size_max = kMaxLemmaSize; + if (lma_size_max > spl_id_num_ - fixed_hzs_) + lma_size_max = spl_id_num_ - fixed_hzs_; + + uint16 lma_size = lma_size_max; + + // If the full sentense candidate's unfixed part may be the same with a normal + // lemma. Remove the lemma candidate in this case. + char16 fullsent[kMaxLemmaSize + 1]; + char16 *pfullsent = NULL; + uint16 sent_len; + pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true); + + // If the unfixed part contains more than one ids, it is not necessary to + // check whether a lemma's string is the same to the unfixed part of the full + // sentence candidate, so, set it to NULL; + if (sent_len > kMaxLemmaSize) + pfullsent = NULL; + + lpi_total_ = 0; + size_t lpi_num_full_match = 0; // Number of items which are fully-matched. + while (lma_size > 0) { + size_t lma_num; + lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size, + lpi_items_ + lpi_total_, + size_t(kMaxLmaPsbItems - lpi_total_), + pfullsent, lma_size == lma_size_max); + + if (lma_num > 0) { + lpi_total_ += lma_num; + // For next lemma candidates which are not the longest, it is not + // necessary to compare with the full sentence candiate. 
+ pfullsent = NULL; + } + if (lma_size == lma_size_max) { + lpi_num_full_match = lpi_total_; + } + lma_size--; + } + + // Sort those partially-matched items by their unified scores. + myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match, + sizeof(LmaPsbItem), cmp_lpi_with_unified_psb); + + if (kPrintDebug0) { + printf("-----Prepare candidates, score:\n"); + for (size_t a = 0; a < lpi_total_; a++) { + printf("[%03d]%d ", a, lpi_items_[a].psb); + if ((a + 1) % 6 == 0) printf("\n"); + } + printf("\n"); + } + + if (kPrintDebug0) { + printf("--- lpi_total_ = %d\n", lpi_total_); + } +} + +const char* MatrixSearch::get_pystr(size_t *decoded_len) { + if (!inited_ || NULL == decoded_len) + return NULL; + + *decoded_len = pys_decoded_len_; + return pys_; +} + +void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) { + if (fixed_lmas_ == 0) + return; + // Update spelling segmentation information first. + spl_id_num_ -= 1; + uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos]; + for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) { + spl_start_[pos] = spl_start_[pos + 1] - del_py_len; + if (pos == spl_id_num_) + break; + spl_id_[pos] = spl_id_[pos + 1]; + } + + // Begin to merge. + uint16 phrase_len = 0; + + // Update the spelling ids to the composing phrase. + // We need to convert these ids into full id in the future. + memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16)); + memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16)); + + // If composing phrase has not been created, first merge all fixed + // lemmas into a composing phrase without deletion. + if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) { + uint16 bp = 1; // Begin position of real fixed lemmas. + // There is no existing composing phrase. 
+ if (kLemmaIdComposing != lma_id_[0]) { + c_phrase_.sublma_num = 0; + bp = 0; + } + + uint16 sub_num = c_phrase_.sublma_num; + for (uint16 pos = bp; pos <= fixed_lmas_; pos++) { + c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos]; + if (lma_start_[pos] > del_spl_pos) { + c_phrase_.sublma_start[sub_num + pos - bp] -= 1; + } + + if (pos == fixed_lmas_) + break; + + uint16 lma_len; + char16 *lma_str = c_phrase_.chn_str + + c_phrase_.sublma_start[sub_num] + phrase_len; + + lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len); + assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]); + phrase_len += lma_len; + } + assert(phrase_len == lma_start_[fixed_lmas_]); + c_phrase_.length = phrase_len; // will be deleted by 1 + c_phrase_.sublma_num += fixed_lmas_ - bp; + } else { + for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) { + if (c_phrase_.sublma_start[pos] > del_spl_pos) { + c_phrase_.sublma_start[pos] -= 1; + } + } + phrase_len = c_phrase_.length; + } + + assert(phrase_len > 0); + if (1 == phrase_len) { + // After the only one is deleted, nothing will be left. + fixed_lmas_ = 0; + return; + } + + // Delete the Chinese character in the merged phrase. + // The corresponding elements in spl_ids and spl_start of the + // phrase have been deleted. + char16 *chn_str = c_phrase_.chn_str + del_spl_pos; + for (uint16 pos = 0; + pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos; + pos++) { + chn_str[pos] = chn_str[pos + 1]; + } + c_phrase_.length -= 1; + + // If the deleted spelling id is in a sub lemma which contains more than + // one id, del_a_sub will be false; but if the deleted id is in a sub lemma + // which only contains 1 id, the whole sub lemma needs to be deleted, so + // del_a_sub will be true. 
+ bool del_a_sub = false; + for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) { + if (c_phrase_.sublma_start[pos - 1] == + c_phrase_.sublma_start[pos]) { + del_a_sub = true; + } + if (del_a_sub) { + c_phrase_.sublma_start[pos - 1] = + c_phrase_.sublma_start[pos]; + } + } + if (del_a_sub) + c_phrase_.sublma_num -= 1; + + return; +} + +void MatrixSearch::get_spl_start_id() { + lma_id_num_ = 0; + lma_start_[0] = 0; + + spl_id_num_ = 0; + spl_start_[0] = 0; + if (!inited_ || 0 == pys_decoded_len_ || + 0 == matrix_[pys_decoded_len_].mtrx_nd_num) + return; + + // Calculate number of lemmas and spellings + // Only scan those part which is not fixed. + lma_id_num_ = fixed_lmas_; + spl_id_num_ = fixed_hzs_; + + MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; + while (mtrx_nd != mtrx_nd_pool_) { + if (fixed_hzs_ > 0) { + if (mtrx_nd->step <= spl_start_[fixed_hzs_]) + break; + } + + // Update the spelling segamentation information + unsigned char word_splstr_len = 0; + PoolPosType dmi_fr = mtrx_nd->dmi_fr; + if ((PoolPosType)-1 != dmi_fr) + word_splstr_len = dmi_pool_[dmi_fr].splstr_len; + + while ((PoolPosType)-1 != dmi_fr) { + spl_start_[spl_id_num_ + 1] = mtrx_nd->step - + (word_splstr_len - dmi_pool_[dmi_fr].splstr_len); + spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id; + spl_id_num_++; + dmi_fr = dmi_pool_[dmi_fr].dmi_fr; + } + + // Update the lemma segmentation information + lma_start_[lma_id_num_ + 1] = spl_id_num_; + lma_id_[lma_id_num_] = mtrx_nd->id; + lma_id_num_++; + + mtrx_nd = mtrx_nd->from; + } + + // Reverse the result of spelling info + for (size_t pos = fixed_hzs_; + pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) { + if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) { + spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; + spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1]; + spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; + + spl_id_[pos] ^= spl_id_[spl_id_num_ 
+ fixed_hzs_ - pos - 1]; + spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos]; + spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1]; + } + } + + // Reverse the result of lemma info + for (size_t pos = fixed_lmas_; + pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) { + assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos); + + if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) { + lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; + lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1]; + lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; + + lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; + lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos]; + lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; + } + } + + for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) { + if (pos < lma_id_num_) + lma_start_[pos] = lma_start_[pos - 1] + + (lma_start_[pos] - lma_start_[pos + 1]); + else + lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] - + lma_start_[fixed_lmas_]; + } + + // Find the last fixed position + fixed_hzs_ = 0; + for (size_t pos = spl_id_num_; pos > 0; pos--) { + if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) { + fixed_hzs_ = pos; + break; + } + } + + return; +} + +size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) { + get_spl_start_id(); + spl_start = spl_start_; + return spl_id_num_; +} + +size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) { + if (dmi_pool_used_ >= kDmiPoolSize) return 0; + + if (dmi_c_phrase_) + return extend_dmi_c(dep, dmi_s); + + LpiCache& lpi_cache = LpiCache::get_instance(); + uint16 splid = dep->splids[dep->splids_extended]; + + bool cached = false; + if (0 == dep->splids_extended) + cached = lpi_cache.is_cached(splid); + + // 1. If this is a half Id, get its corresponding full starting Id and + // number of full Id. 
+ size_t ret_val = 0; + PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node + + lpi_total_ = 0; + + MileStoneHandle from_h[3]; + from_h[0] = 0; + from_h[1] = 0; + + if (0 != dep->splids_extended) { + from_h[0] = dmi_s->dict_handles[0]; + from_h[1] = dmi_s->dict_handles[1]; + } + + // 2. Begin exgtending in the system dictionary + size_t lpi_num = 0; + MileStoneHandle handles[2]; + handles[0] = handles[1] = 0; + if (from_h[0] > 0 || NULL == dmi_s) { + handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_, + kMaxLmaPsbItems, &lpi_num); + } + if (handles[0] > 0) + lpi_total_ = lpi_num; + + if (NULL == dmi_s) { // from root + assert(0 != handles[0]); + mtrx_dmi_fr = dmi_pool_used_; + } + + // 3. Begin extending in the user dictionary + if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) { + handles[1] = user_dict_->extend_dict(from_h[1], dep, + lpi_items_ + lpi_total_, + kMaxLmaPsbItems - lpi_total_, + &lpi_num); + if (handles[1] > 0) { + if (kPrintDebug0) { + for (size_t t = 0; t < lpi_num; t++) { + printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id, + lpi_items_[lpi_total_ + t].psb); + } + } + lpi_total_ += lpi_num; + } + } + + if (0 != handles[0] || 0 != handles[1]) { + if (dmi_pool_used_ >= kDmiPoolSize) return 0; + + DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; + if (NULL == dmi_s) { + fill_dmi(dmi_add, handles, + (PoolPosType)-1, splid, + 1, 1, dep->splid_end_split, dep->ext_len, + spl_trie_->is_half_id(splid) ? 0 : 1); + } else { + fill_dmi(dmi_add, handles, + dmi_s - dmi_pool_, splid, 1, + dmi_s->dict_level + 1, dep->splid_end_split, + dmi_s->splstr_len + dep->ext_len, + spl_trie_->is_half_id(splid) ? 
0 : dmi_s->all_full_id); + } + + ret_val = 1; + } + + if (!cached) { + if (0 == lpi_total_) + return ret_val; + + if (kPrintDebug0) { + printf("--- lpi_total_ = %d\n", lpi_total_); + } + + myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb); + if (NULL == dmi_s && spl_trie_->is_half_id(splid)) + lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_); + } else { + assert(spl_trie_->is_half_id(splid)); + lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems); + } + + return ret_val; +} + +size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) { + lpi_total_ = 0; + + uint16 pos = dep->splids_extended; + assert(dmi_c_phrase_); + if (pos >= c_phrase_.length) + return 0; + + uint16 splid = dep->splids[pos]; + if (splid == c_phrase_.spl_ids[pos]) { + DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; + MileStoneHandle handles[2]; // Actually never used. + if (NULL == dmi_s) + fill_dmi(dmi_add, handles, + (PoolPosType)-1, splid, + 1, 1, dep->splid_end_split, dep->ext_len, + spl_trie_->is_half_id(splid) ? 0 : 1); + else + fill_dmi(dmi_add, handles, + dmi_s - dmi_pool_, splid, 1, + dmi_s->dict_level + 1, dep->splid_end_split, + dmi_s->splstr_len + dep->ext_len, + spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id); + + if (pos == c_phrase_.length - 1) { + lpi_items_[0].id = kLemmaIdComposing; + lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score. + lpi_total_ = 1; + } + return 1; + } + return 0; +} + +size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], + size_t lpi_num, PoolPosType dmi_fr, + size_t res_row) { + assert(NULL != mtrx_nd); + matrix_[res_row].mtrx_nd_fixed = NULL; + + if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow) + return 0; + + if (0 == mtrx_nd->step) { + // Because the list is sorted, if the source step is 0, it is only + // necessary to pick up the first kMaxNodeARow items. 
+ if (lpi_num > kMaxNodeARow) + lpi_num = kMaxNodeARow; + } + + MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos; + for (size_t pos = 0; pos < lpi_num; pos++) { + float score = mtrx_nd->score + lpi_items[pos].psb; + if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score) + break; + + // Try to add a new node + size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num; + MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num; + bool replace = false; + // Find its position + while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) { + if (static_cast(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow) + *mtrx_nd_res = *(mtrx_nd_res - 1); + mtrx_nd_res--; + replace = true; + } + if (replace || (mtrx_nd_num < kMaxNodeARow && + matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) { + mtrx_nd_res->id = lpi_items[pos].id; + mtrx_nd_res->score = score; + mtrx_nd_res->from = mtrx_nd; + mtrx_nd_res->dmi_fr = dmi_fr; + mtrx_nd_res->step = res_row; + if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow) + matrix_[res_row].mtrx_nd_num++; + } + } + return matrix_[res_row].mtrx_nd_num; +} + +PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[], + uint16 spl_id_num) { + if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) { + return static_cast(-1); + } + + for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) { + DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos; + + if (dmi->dict_level != spl_id_num) + continue; + + bool matched = true; + for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) { + if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) { + matched = false; + break; + } + + dmi = dmi_pool_ + dmi->dmi_fr; + } + if (matched) { + return matrix_[step_to].dmi_pos + dmi_pos; + } + } + + return static_cast(-1); +} + +char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len, + uint16 *retstr_len, + bool only_unfixed) { + if (pys_decoded_len_ == 0 
|| + matrix_[pys_decoded_len_].mtrx_nd_num == 0) + return NULL; + + LemmaIdType idxs[kMaxRowNum]; + size_t id_num = 0; + + MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; + + if (kPrintDebug0) { + printf("--- sentence score: %f\n", mtrx_nd->score); + } + + if (kPrintDebug1) { + printf("==============Sentence DMI (reverse order) begin===========>>\n"); + } + + while (mtrx_nd != NULL) { + idxs[id_num] = mtrx_nd->id; + id_num++; + + if (kPrintDebug1) { + printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n", + mtrx_nd->step, mtrx_nd->id, mtrx_nd->score); + debug_print_dmi(mtrx_nd->dmi_fr, 1); + } + + mtrx_nd = mtrx_nd->from; + } + + if (kPrintDebug1) { + printf("<<==============Sentence DMI (reverse order) end=============\n"); + } + + size_t ret_pos = 0; + do { + id_num--; + if (0 == idxs[id_num]) + continue; + + char16 str[kMaxLemmaSize + 1]; + uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1); + if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) || + (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) { + if (!only_unfixed) + utf16_strncpy(cand_str + ret_pos, str, str_len); + else if (ret_pos >= fixed_hzs_) + utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len); + + ret_pos += str_len; + } else { + return NULL; + } + } while (id_num != 0); + + if (!only_unfixed) { + if (NULL != retstr_len) + *retstr_len = ret_pos; + cand_str[ret_pos] = (char16)'\0'; + } else { + if (NULL != retstr_len) + *retstr_len = ret_pos - fixed_hzs_; + cand_str[ret_pos - fixed_hzs_] = (char16)'\0'; + } + return cand_str; +} + +size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len, + LmaPsbItem* lma_buf, size_t max_lma_buf, + const char16 *pfullsent, bool sort_by_psb) { + if (splid_str_len > kMaxLemmaSize) + return 0; + + size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len, + lma_buf, max_lma_buf); + size_t num2 = 0; + if (NULL != user_dict_) { + num2 = 
user_dict_->get_lpis(splid_str, splid_str_len, + lma_buf + num1, max_lma_buf - num1); + } + + size_t num = num1 + num2; + + if (0 == num) + return 0; + + // Remove repeated items. + if (splid_str_len > 1) { + LmaPsbStrItem *lpsis = reinterpret_cast(lma_buf + num); + size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) / + sizeof(LmaPsbStrItem); + //assert(lpsi_num > num); + if (num > lpsi_num) num = lpsi_num; + lpsi_num = num; + + for (size_t pos = 0; pos < lpsi_num; pos++) { + lpsis[pos].lpi = lma_buf[pos]; + get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1); + } + + myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str); + + size_t remain_num = 0; + for (size_t pos = 0; pos < lpsi_num; pos++) { + if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) { + if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) { + assert(remain_num > 0); + lma_buf[remain_num - 1] = lpsis[pos].lpi; + } + continue; + } + if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0) + continue; + + lma_buf[remain_num] = lpsis[pos].lpi; + remain_num++; + } + + // Update the result number + num = remain_num; + } else { + // For single character, some characters have more than one spelling, for + // example, "de" and "di" are all valid for a Chinese character, so when + // the user input "d", repeated items are generated. 
+ // For single character lemmas, Hanzis will be gotten + for (size_t pos = 0; pos < num; pos++) { + char16 hanzis[2]; + get_lemma_str(lma_buf[pos].id, hanzis, 2); + lma_buf[pos].hanzi = hanzis[0]; + } + + myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi); + + size_t remain_num = 0; + for (size_t pos = 0; pos < num; pos++) { + if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) { + if (NULL != pfullsent && + static_cast(0) == pfullsent[1] && + lma_buf[pos].hanzi == pfullsent[0]) + continue; + + if (lma_buf[pos].psb < lma_buf[pos - 1].psb) { + assert(remain_num > 0); + assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi); + lma_buf[remain_num - 1] = lma_buf[pos]; + } + continue; + } + if (NULL != pfullsent && + static_cast(0) == pfullsent[1] && + lma_buf[pos].hanzi == pfullsent[0]) + continue; + + lma_buf[remain_num] = lma_buf[pos]; + remain_num++; + } + + num = remain_num; + } + + if (sort_by_psb) { + myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb); + } + return num; +} + +uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, + uint16 str_max) { + uint16 str_len = 0; + + if (is_system_lemma(id_lemma)) { + str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max); + } else if (is_user_lemma(id_lemma)) { + if (NULL != user_dict_) { + str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max); + } else { + str_len = 0; + str_buf[0] = static_cast('\0'); + } + } else if (is_composing_lemma(id_lemma)) { + if (str_max <= 1) + return 0; + str_len = c_phrase_.sublma_start[c_phrase_.sublma_num]; + if (str_len > str_max - 1) + str_len = str_max - 1; + utf16_strncpy(str_buf, c_phrase_.chn_str, str_len); + str_buf[str_len] = (char16)'\0'; + return str_len; + } + + return str_len; +} + +uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, + uint16 splids_max, bool arg_valid) { + uint16 splid_num = 0; + + if (arg_valid) { + for (splid_num = 0; splid_num < splids_max; splid_num++) { + if 
(spl_trie_->is_half_id(splids[splid_num])) + break; + } + if (splid_num == splids_max) + return splid_num; + } + + if (is_system_lemma(id_lemma)) { + splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max, + arg_valid); + } else if (is_user_lemma(id_lemma)) { + if (NULL != user_dict_) { + splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max, + arg_valid); + } else { + splid_num = 0; + } + } else if (is_composing_lemma(id_lemma)) { + if (c_phrase_.length > splids_max) { + return 0; + } + for (uint16 pos = 0; pos < c_phrase_.length; pos++) { + splids[pos] = c_phrase_.spl_ids[pos]; + if (spl_trie_->is_half_id(splids[pos])) { + return 0; + } + } + } + return splid_num; +} + +size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len, + char16 predict_buf[][kMaxPredictSize + 1], + size_t buf_len) { + size_t res_total = 0; + memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_); + // In order to shorten the comments, j-character candidates predicted by + // i-character prefix are called P(i,j). All candiates predicted by + // i-character prefix are called P(i,*) + // Step 1. Get P(kMaxPredictSize, *) and sort them, here + // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1) + for (size_t len = fixed_len; len >0; len--) { + // How many blank items are available + size_t this_max = npre_items_len_ - res_total; + size_t res_this; + // If the history is longer than 1, and we can not get prediction from + // lemmas longer than 2, in this case, we will add lemmas with + // highest scores as the prediction result. + if (fixed_len > 1 && 1 == len && 0 == res_total) { + // Try to find if recent n (n>1) characters can be a valid lemma in system + // dictionary. + bool nearest_n_word = false; + for (size_t nlen = 2; nlen <= fixed_len; nlen++) { + if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) { + nearest_n_word = true; + break; + } + } + res_this = dict_trie_->predict_top_lmas(nearest_n_word ? 
len : 0, + npre_items_ + res_total, + this_max, res_total); + res_total += res_this; + } + + // How many blank items are available + this_max = npre_items_len_ - res_total; + res_this = 0; + if (!kOnlyUserDictPredict) { + res_this = + dict_trie_->predict(fixed_buf + fixed_len - len, len, + npre_items_ + res_total, this_max, + res_total); + } + + if (NULL != user_dict_) { + res_this = res_this + + user_dict_->predict(fixed_buf + fixed_len - len, len, + npre_items_ + res_total + res_this, + this_max - res_this, res_total + res_this); + } + + if (kPredictLimitGt1) { + myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem), + cmp_npre_by_score); + + if (len > 3) { + if (res_this > kMaxPredictNumByGt3) + res_this = kMaxPredictNumByGt3; + } else if (3 == len) { + if (res_this > kMaxPredictNumBy3) + res_this = kMaxPredictNumBy3; + } else if (2 == len) { + if (res_this > kMaxPredictNumBy2) + res_this = kMaxPredictNumBy2; + } + } + + res_total += res_this; + } + + res_total = remove_duplicate_npre(npre_items_, res_total); + + if (kPreferLongHistoryPredict) { + myqsort(npre_items_, res_total, sizeof(NPredictItem), + cmp_npre_by_hislen_score); + } else { + myqsort(npre_items_, res_total, sizeof(NPredictItem), + cmp_npre_by_score); + } + + if (buf_len < res_total) { + res_total = buf_len; + } + + if (kPrintDebug2) { + printf("/////////////////Predicted Items Begin////////////////////>>\n"); + for (size_t i = 0; i < res_total; i++) { + printf("---"); + for (size_t j = 0; j < kMaxPredictSize; j++) { + printf("%d ", npre_items_[i].pre_hzs[j]); + } + printf("\n"); + } + printf("< kMaxPredictSize || 0 == buf_len) + return 0; + + return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len); +} + +} // namespace ime_pinyin diff --git a/googlepinyin/matrixsearch.h b/googlepinyin/matrixsearch.h new file mode 100644 index 0000000..61e78aa --- /dev/null +++ b/googlepinyin/matrixsearch.h @@ -0,0 +1,460 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ +#define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ + +#include +#include "./atomdictbase.h" +#include "./dicttrie.h" +#include "./searchutility.h" +#include "./spellingtrie.h" +#include "./splparser.h" + +namespace ime_pinyin { + +static const size_t kMaxRowNum = kMaxSearchSteps; + +typedef struct { + // MileStoneHandle objects for the system and user dictionaries. + MileStoneHandle dict_handles[2]; + // From which DMI node. -1 means it's from root. + PoolPosType dmi_fr; + // The spelling id for the Pinyin string from the previous DMI to this node. + // If it is a half id like Shengmu, the node pointed by dict_node is the first + // node with this Shengmu, + uint16 spl_id; + // What's the level of the dict node. Level of root is 0, but root is never + // recorded by dict_node. + unsigned char dict_level:7; + // If this node is for composing phrase, this bit is 1. + unsigned char c_phrase:1; + // Whether the spl_id is parsed with a split character at the end. + unsigned char splid_end_split:1; + // What's the length of the spelling string for this match, for the whole + // word. + unsigned char splstr_len:7; + // Used to indicate whether all spelling ids from the root are full spelling + // ids. This information is useful for keymapping mode(not finished). 
Because + // in this mode, there is no clear boundaries, we prefer those results which + // have full spelling ids. + unsigned char all_full_id:1; +} DictMatchInfo, *PDictMatchInfo; + +typedef struct MatrixNode { + LemmaIdType id; + float score; + MatrixNode *from; + // From which DMI node. Used to trace the spelling segmentation. + PoolPosType dmi_fr; + uint16 step; +} MatrixNode, *PMatrixNode; + +typedef struct { + // The MatrixNode position in the matrix pool + PoolPosType mtrx_nd_pos; + // The DictMatchInfo position in the DictMatchInfo pool. + PoolPosType dmi_pos; + uint16 mtrx_nd_num; + uint16 dmi_num:15; + // Used to indicate whether there are dmi nodes in this step with full + // spelling id. This information is used to decide whether a substring of a + // valid Pinyin should be extended. + // + // Example1: shoudao + // When the last char 'o' is added, the parser will find "dao" is a valid + // Pinyin, and because all dmi nodes at location 'd' (including those for + // "shoud", and those for "d") have Shengmu id only, so it is not necessary + // to extend "ao", otherwise the result may be "shoud ao", that is not + // reasonable. + // + // Example2: hengao + // When the last 'o' is added, the parser finds "gao" is a valid Pinyin. + // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi + // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus + // "heng ao" can also be the result. + // + // Similarly, "ganga" is expanded to "gang a". + // + // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi + // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it + // is not valid either). If the parser uses break in the loop, the result + // always be "xian"; but if the parser uses continue in the loop, "xi an" will + // also be tried. This behaviour can be set via the function + // set_xi_an_switch(). 
+ uint16 dmi_has_full_id:1; + // Points to a MatrixNode of the current step to indicate which choice the + // user selects. + MatrixNode *mtrx_nd_fixed; +} MatrixRow, *PMatrixRow; + +// When user inputs and selects candidates, the fixed lemma ids are stored in +// lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many +// lemmas from the beginning are fixed. If user deletes Pinyin characters one +// by one from the end, these fixed lemmas can be unlocked one by one when +// necessary. Whenever user deletes a Chinese character and its spelling string +// in these fixed lemmas, all fixed lemmas will be merged together into a unit +// named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing +// phrase will be the first lemma in the sentence. Because it contains some +// modified lemmas (by deleting a character), these merged lemmas are called +// sub lemmas (sublma), and each of them are represented individually, so that +// when user deletes Pinyin characters from the end, these sub lemmas can also +// be unlocked one by one. +typedef struct { + uint16 spl_ids[kMaxRowNum]; + uint16 spl_start[kMaxRowNum]; + char16 chn_str[kMaxRowNum]; // Chinese string. + uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters. + size_t sublma_num; + uint16 length; // Counted in Chinese characters. +} ComposingPhrase, *TComposingPhrase; + +class MatrixSearch { + private: + // If it is true, prediction list by string whose length is greater than 1 + // will be limited to a reasonable number. + static const bool kPredictLimitGt1 = false; + + // If it is true, the engine will prefer long history based prediction, + // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are + // based on the two-character history. + static const bool kPreferLongHistoryPredict = true; + + // If it is true, prediction will only be based on user dictionary. this flag + // is for debug purpose. 
+ static const bool kOnlyUserDictPredict = false; + + // The maximum buffer to store LmaPsbItems. + static const size_t kMaxLmaPsbItems = 1450; + + // How many rows for each step. + static const size_t kMaxNodeARow = 5; + + // The maximum length of the sentence candidates counted in chinese + // characters + static const size_t kMaxSentenceLength = 16; + + // The size of the matrix node pool. + static const size_t kMtrxNdPoolSize = 200; + + // The size of the DMI node pool. + static const size_t kDmiPoolSize = 800; + + // Used to indicate whether this object has been initialized. + bool inited_; + + // Spelling trie. + const SpellingTrie *spl_trie_; + + // Used to indicate this switcher status: when "xian" is parseed, should + // "xi an" also be extended. Default is false. + // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string + // should be valid for a FULL spelling, or a combination of two spellings, + // first of which is a FULL id too. So even it is true, "da" will never be + // split into "d a", because "d" is not a full spelling id. + bool xi_an_enabled_; + + // System dictionary. + DictTrie* dict_trie_; + + // User dictionary. + AtomDictBase* user_dict_; + + // Spelling parser. + SpellingParser* spl_parser_; + + // The maximum allowed length of spelling string (such as a Pinyin string). + size_t max_sps_len_; + + // The maximum allowed length of a result Chinese string. + size_t max_hzs_len_; + + // Pinyin string. Max length: kMaxRowNum - 1 + char pys_[kMaxRowNum]; + + // The length of the string that has been decoded successfully. + size_t pys_decoded_len_; + + // Shared buffer for multiple purposes. + size_t *share_buf_; + + MatrixNode *mtrx_nd_pool_; + PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool + DictMatchInfo *dmi_pool_; + PoolPosType dmi_pool_used_; // How many items used in the pool + + MatrixRow *matrix_; // The first row is for starting + + DictExtPara *dep_; // Parameter used to extend DMI nodes. 
+ + NPredictItem *npre_items_; // Used to do prediction + size_t npre_items_len_; + + // The starting positions and lemma ids for the full sentence candidate. + size_t lma_id_num_; + uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids. + LemmaIdType lma_id_[kMaxRowNum]; + size_t fixed_lmas_; + + // If fixed_lmas_ is bigger than i, Element i is used to indicate whether + // the i'th lemma id in lma_id_ is the first candidate for that step. + // If all candidates are the first one for that step, the whole string can be + // decoded by the engine automatically, so no need to add it to user + // dictionary. (We are considering to add it to user dictionary in the + // future). + uint8 fixed_lmas_no1_[kMaxRowNum]; + + // Composing phrase + ComposingPhrase c_phrase_; + + // If dmi_c_phrase_ is true, the decoder will try to match the + // composing phrase (And definitely it will match successfully). If it + // is false, the decoder will try to match lemmas items in dictionaries. + bool dmi_c_phrase_; + + // The starting positions and spelling ids for the first full sentence + // candidate. + size_t spl_id_num_; // Number of splling ids + uint16 spl_start_[kMaxRowNum]; // Starting positions + uint16 spl_id_[kMaxRowNum]; // Spelling ids + // Used to remember the last fixed position, counted in Hanzi. + size_t fixed_hzs_; + + // Lemma Items with possibility score, two purposes: + // 1. In Viterbi decoding, this buffer is used to get all possible candidates + // for current step; + // 2. When the search is done, this buffer is used to get candiates from the + // first un-fixed step and show them to the user. + LmaPsbItem lpi_items_[kMaxLmaPsbItems]; + size_t lpi_total_; + + // Assign the pointers with NULL. The caller makes sure that all pointers are + // not valid before calling it. This function only will be called in the + // construction function and free_resource(). 
+ void reset_pointers_to_null(); + + bool alloc_resource(); + + void free_resource(); + + // Reset the search space totally. + bool reset_search0(); + + // Reset the search space from ch_pos step. For example, if the original + // input Pinyin is "an", reset_search(1) will reset the search space to the + // result of "a". If the given position is out of range, return false. + // if clear_fixed_this_step is true, and the ch_pos step is a fixed step, + // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes. + // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step. + // The DMI nodes will be kept. + // + // Note: this function should not destroy content of pys_. + bool reset_search(size_t ch_pos, bool clear_fixed_this_step, + bool clear_dmi_this_step, bool clear_mtrx_this_step); + + // Delete a part of the content in pys_. + void del_in_pys(size_t start, size_t len); + + // Delete a spelling id and its corresponding Chinese character, and merge + // the fixed lemmas into the composing phrase. + // del_spl_pos indicates which spelling id needs to be delete. + // This function will update the lemma and spelling segmentation information. + // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within + // the fixed lemmas. + void merge_fixed_lmas(size_t del_spl_pos); + + // Get spelling start posistions and ids. The result will be stored in + // spl_id_num_, spl_start_[], spl_id_[]. + // fixed_hzs_ will be also assigned. + void get_spl_start_id(); + + // Get all lemma ids with match the given spelling id stream(shorter than the + // maximum length of a word). + // If pfullsent is not NULL, means the full sentence candidate may be the + // same with the coming lemma string, if so, remove that lemma. + // The result is sorted in descendant order by the frequency score. 
+ size_t get_lpis(const uint16* splid_str, size_t splid_str_len, + LmaPsbItem* lma_buf, size_t max_lma_buf, + const char16 *pfullsent, bool sort_by_psb); + + uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); + + uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, + uint16 splids_max, bool arg_valid); + + + // Extend a DMI node with a spelling id. ext_len is the length of the rows + // to extend, actually, it is the size of the spelling string of splid. + // return value can be 1 or 0. + // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in + // the pool). + // 0 means either the dmi node can not be extended with splid, or the splid + // is a Shengmu id, which is only used to get lpi_items, or the result node + // in DictTrie has no son, it is not nccessary to keep the new DMI. + // + // This function modifies the content of lpi_items_ and lpi_total_. + // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size. + // The function's returned value has no relation with the value of lpi_num. + // + // If dmi == NULL, this function will extend the root node of DictTrie + // + // This function will not change dmi_nd_pool_used_. Please change it after + // calling this function if necessary. + // + // The caller should guarantees that NULL != dep. + size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s); + + // Extend dmi for the composing phrase. + size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s); + + // Extend a MatrixNode with the give LmaPsbItem list. + // res_row is the destination row number. + // This function does not change mtrx_nd_pool_used_. Please change it after + // calling this function if necessary. + // return 0 always. + size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], + size_t lpi_num, PoolPosType dmi_fr, size_t res_row); + + + // Try to find a dmi node at step_to position, and the found dmi node should + // match the given spelling id strings. 
+ PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num); + + bool add_char(char ch); + bool prepare_add_char(char ch); + + // Called after prepare_add_char, so the input char has been saved. + bool add_char_qwerty(); + + // Prepare candidates from the last fixed hanzi position. + void prepare_candidates(); + + // Is the character in step pos a splitter character? + // The caller guarantees that the position is valid. + bool is_split_at(uint16 pos); + + void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, + PoolPosType dmi_fr, + uint16 spl_id, uint16 node_num, unsigned char dict_level, + bool splid_end_split, unsigned char splstr_len, + unsigned char all_full_id); + + size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num, + char16 predict_buf[][kMaxPredictSize + 1], + size_t buf_len); + + // Add the first candidate to the user dictionary. + bool try_add_cand0_to_userdict(); + + // Add a user lemma to the user dictionary. This lemma is a subset of + // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the + // number of lemmas to be combined together as a new lemma. The caller + // gurantees that the combined new lemma's length is less or equal to + // kMaxLemmaSize. + bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score); + + // Update dictionary frequencies. + void update_dict_freq(); + + void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level); + + public: + MatrixSearch(); + ~MatrixSearch(); + + bool init(const char *fn_sys_dict, const char *fn_usr_dict); + + bool init_fd(int sys_fd, long start_offset, long length, + const char *fn_usr_dict); + + void init_user_dictionary(const char *fn_usr_dict); + + bool is_user_dictionary_enabled() const; + + void set_max_lens(size_t max_sps_len, size_t max_hzs_len); + + void close(); + + void flush_cache(); + + void set_xi_an_switch(bool xi_an_enabled); + + bool get_xi_an_switch(); + + // Reset the search space. Equivalent to reset_search(0). 
+ // If inited, always return true; + bool reset_search(); + + // Search a Pinyin string. + // Return value is the position successfully parsed. + size_t search(const char *py, size_t py_len); + + // Used to delete something in the Pinyin string kept by the engine, and do + // a re-search. + // Return value is the new length of Pinyin string kept by the engine which + // is parsed successfully. + // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin + // character needs to be deleted. If is_pos_in_splid is true, all Pinyin + // characters for pos-th spelling id needs to be deleted. + // If the deleted character(s) is just after a fixed lemma or sub lemma in + // composing phrase, clear_fixed_this_step indicates whether we needs to + // unlock the last fixed lemma or sub lemma. + // If is_pos_in_splid is false, and pos-th character is in the range for the + // fixed lemmas or composing string, this function will do nothing and just + // return the result of the previous search. + size_t delsearch(size_t pos, bool is_pos_in_splid, + bool clear_fixed_this_step); + + // Get the number of candiates, called after search(). + size_t get_candidate_num(); + + // Get the Pinyin string stored by the engine. + // *decoded_len returns the length of the successfully decoded string. + const char* get_pystr(size_t *decoded_len); + + // Get the spelling boundaries for the first sentence candidate. + // Number of spellings will be returned. The number of valid elements in + // spl_start is one more than the return value because the last one is used + // to indicate the beginning of the next un-input speling. + // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] . + size_t get_spl_start(const uint16 *&spl_start); + + // Get one candiate string. If full sentence candidate is available, it will + // be the first one. 
+ char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len); + + // Get the first candiate, which is a "full sentence". + // retstr_len is not NULL, it will be used to return the string length. + // If only_unfixed is true, only unfixed part will be fetched. + char16* get_candidate0(char16* cand_str, size_t max_len, + uint16 *retstr_len, bool only_unfixed); + + // Choose a candidate. The decoder will do a search after the fixed position. + size_t choose(size_t cand_id); + + // Cancel the last choosing operation, and return the new number of choices. + size_t cancel_last_choice(); + + // Get the length of fixed Hanzis. + size_t get_fixedlen(); + + size_t get_predicts(const char16 fixed_buf[], + char16 predict_buf[][kMaxPredictSize + 1], + size_t buf_len); +}; +} + +#endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ diff --git a/googlepinyin/mystdlib.cpp b/googlepinyin/mystdlib.cpp new file mode 100644 index 0000000..93bbcc9 --- /dev/null +++ b/googlepinyin/mystdlib.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace ime_pinyin { + +// For debug purpose. You can add a fixed version of qsort and bsearch functions +// here so that the output will be totally the same under different platforms. 
+ +void myqsort(void *p, size_t n, size_t es, + int (*cmp)(const void *, const void *)) { + qsort(p,n, es, cmp); +} + +void *mybsearch(const void *k, const void *b, + size_t n, size_t es, + int (*cmp)(const void *, const void *)) { + return bsearch(k, b, n, es, cmp); +} +} // namespace ime_pinyin diff --git a/googlepinyin/mystdlib.h b/googlepinyin/mystdlib.h new file mode 100644 index 0000000..dfcf980 --- /dev/null +++ b/googlepinyin/mystdlib.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_MYSTDLIB_H__ +#define PINYINIME_INCLUDE_MYSTDLIB_H__ + +#include + +namespace ime_pinyin { + +void myqsort(void *p, size_t n, size_t es, + int (*cmp)(const void *, const void *)); + +void *mybsearch(const void *key, const void *base, + size_t nmemb, size_t size, + int (*compar)(const void *, const void *)); +} + +#endif // PINYINIME_INCLUDE_MYSTDLIB_H__ diff --git a/googlepinyin/ngram.cpp b/googlepinyin/ngram.cpp new file mode 100644 index 0000000..cacb188 --- /dev/null +++ b/googlepinyin/ngram.cpp @@ -0,0 +1,342 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "mystdlib.h" +#include "ngram.h" + +namespace ime_pinyin { + +#define ADD_COUNT 0.3 + +int comp_double(const void *p1, const void *p2) { + if (*static_cast(p1) < *static_cast(p2)) + return -1; + if (*static_cast(p1) > *static_cast(p2)) + return 1; + return 0; +} + +inline double distance(double freq, double code) { + // return fabs(freq - code); + return freq * fabs(log(freq) - log(code)); +} + +// Find the index of the code value which is nearest to the given freq +int qsearch_nearest(double code_book[], double freq, int start, int end) { + if (start == end) + return start; + + if (start + 1 == end) { + if (distance(freq, code_book[end]) > distance(freq, code_book[start])) + return start; + return end; + } + + int mid = (start + end) / 2; + + if (code_book[mid] > freq) + return qsearch_nearest(code_book, freq, start, mid); + else + return qsearch_nearest(code_book, freq, mid, end); +} + +size_t update_code_idx(double freqs[], size_t num, double code_book[], + CODEBOOK_TYPE *code_idx) { + size_t changed = 0; + for (size_t pos = 0; pos < num; pos++) { + CODEBOOK_TYPE idx; + idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); + if (idx != code_idx[pos]) + changed++; + code_idx[pos] = idx; + } + return changed; +} + +double recalculate_kernel(double freqs[], size_t num, double code_book[], + CODEBOOK_TYPE *code_idx) { + double ret = 0; + + size_t *item_num = new size_t[kCodeBookSize]; + assert(item_num); + memset(item_num, 0, sizeof(size_t) * kCodeBookSize); + + double 
*cb_new = new double[kCodeBookSize]; + assert(cb_new); + memset(cb_new, 0, sizeof(double) * kCodeBookSize); + + for (size_t pos = 0; pos < num; pos++) { + ret += distance(freqs[pos], code_book[code_idx[pos]]); + + cb_new[code_idx[pos]] += freqs[pos]; + item_num[code_idx[pos]] += 1; + } + + for (size_t code = 0; code < kCodeBookSize; code++) { + assert(item_num[code] > 0); + code_book[code] = cb_new[code] / item_num[code]; + } + + delete [] item_num; + delete [] cb_new; + + return ret; +} + +void iterate_codes(double freqs[], size_t num, double code_book[], + CODEBOOK_TYPE *code_idx) { + size_t iter_num = 0; + double delta_last = 0; + do { + size_t changed = update_code_idx(freqs, num, code_book, code_idx); + + double delta = recalculate_kernel(freqs, num, code_book, code_idx); + + if (kPrintDebug0) { + printf("---Unigram codebook iteration: %d : %d, %.9f\n", + iter_num, changed, delta); + } + iter_num++; + + if (iter_num > 1 && + (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) + break; + delta_last = delta; + } while (true); +} + + +NGram* NGram::instance_ = NULL; + +NGram::NGram() { + initialized_ = false; + idx_num_ = 0; + lma_freq_idx_ = NULL; + sys_score_compensation_ = 0; + +#ifdef ___BUILD_MODEL___ + freq_codes_df_ = NULL; +#endif + freq_codes_ = NULL; +} + +NGram::~NGram() { + if (NULL != lma_freq_idx_) + free(lma_freq_idx_); + +#ifdef ___BUILD_MODEL___ + if (NULL != freq_codes_df_) + free(freq_codes_df_); +#endif + + if (NULL != freq_codes_) + free(freq_codes_); +} + +NGram& NGram::get_instance() { + if (NULL == instance_) + instance_ = new NGram(); + return *instance_; +} + +bool NGram::save_ngram(FILE *fp) { + if (!initialized_ || NULL == fp) + return false; + + if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) + return false; + + if (fwrite(&idx_num_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != + kCodeBookSize) + return false; + + if 
(fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) + return false; + + return true; +} + +bool NGram::load_ngram(FILE *fp) { + if (NULL == fp) + return false; + + initialized_ = false; + + if (fread(&idx_num_, sizeof(uint32), 1, fp) != 1 ) + return false; + + if (NULL != lma_freq_idx_) + free(lma_freq_idx_); + + if (NULL != freq_codes_) + free(freq_codes_); + + lma_freq_idx_ = static_cast + (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); + freq_codes_ = static_cast + (malloc(kCodeBookSize * sizeof(LmaScoreType))); + + if (NULL == lma_freq_idx_ || NULL == freq_codes_) + return false; + + if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != + kCodeBookSize) + return false; + + if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) + return false; + + initialized_ = true; + + total_freq_none_sys_ = 0; + return true; +} + +void NGram::set_total_freq_none_sys(size_t freq_none_sys) { + total_freq_none_sys_ = freq_none_sys; + if (0 == total_freq_none_sys_) { + sys_score_compensation_ = 0; + } else { + double factor = static_cast(kSysDictTotalFreq) / ( + kSysDictTotalFreq + total_freq_none_sys_); + sys_score_compensation_ = static_cast( + log(factor) * kLogValueAmplifier); + } +} + +// The caller makes sure this oject is initialized. 
+float NGram::get_uni_psb(LemmaIdType lma_id) { + return static_cast(freq_codes_[lma_freq_idx_[lma_id]]) + + sys_score_compensation_; +} + +float NGram::convert_psb_to_score(double psb) { + float score = static_cast( + log(psb) * static_cast(kLogValueAmplifier)); + if (score > static_cast(kMaxScore)) { + score = static_cast(kMaxScore); + } + return score; +} + +#ifdef ___BUILD_MODEL___ +bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, + LemmaIdType next_idx_unused) { + if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) + return false; + + double total_freq = 0; + double *freqs = new double[next_idx_unused]; + if (NULL == freqs) + return false; + + freqs[0] = ADD_COUNT; + total_freq += freqs[0]; + LemmaIdType idx_now = 0; + for (size_t pos = 0; pos < lemma_num; pos++) { + if (lemma_arr[pos].idx_by_hz == idx_now) + continue; + idx_now++; + + assert(lemma_arr[pos].idx_by_hz == idx_now); + + freqs[idx_now] = lemma_arr[pos].freq; + if (freqs[idx_now] <= 0) + freqs[idx_now] = 0.3; + + total_freq += freqs[idx_now]; + } + + double max_freq = 0; + idx_num_ = idx_now + 1; + assert(idx_now + 1 == next_idx_unused); + + for (size_t pos = 0; pos < idx_num_; pos++) { + freqs[pos] = freqs[pos] / total_freq; + assert(freqs[pos] > 0); + if (freqs[pos] > max_freq) + max_freq = freqs[pos]; + } + + // calculate the code book + if (NULL == freq_codes_df_) + freq_codes_df_ = new double[kCodeBookSize]; + assert(freq_codes_df_); + memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); + + if (NULL == freq_codes_) + freq_codes_ = new LmaScoreType[kCodeBookSize]; + assert(freq_codes_); + memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); + + size_t freq_pos = 0; + for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { + bool found = true; + + while (found) { + found = false; + double cand = freqs[freq_pos]; + for (size_t i = 0; i < code_pos; i++) + if (freq_codes_df_[i] == cand) { + found = true; + break; + } + if (found) + 
freq_pos++; + } + + freq_codes_df_[code_pos] = freqs[freq_pos]; + freq_pos++; + } + + myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); + + if (NULL == lma_freq_idx_) + lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; + assert(lma_freq_idx_); + + iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); + + delete [] freqs; + + if (kPrintDebug0) { + printf("\n------Language Model Unigram Codebook------\n"); + } + + for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { + double log_score = log(freq_codes_df_[code_pos]); + float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); + if (kPrintDebug0) { + printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n", + code_pos, freq_codes_df_[code_pos], log_score, final_score); + } + freq_codes_[code_pos] = static_cast(final_score); + } + + initialized_ = true; + return true; +} +#endif + +} // namespace ime_pinyin diff --git a/googlepinyin/ngram.h b/googlepinyin/ngram.h new file mode 100644 index 0000000..7adb46d --- /dev/null +++ b/googlepinyin/ngram.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PINYINIME_INCLUDE_NGRAM_H__ +#define PINYINIME_INCLUDE_NGRAM_H__ + +#include +#include +#include "./dictdef.h" + +namespace ime_pinyin { + +typedef unsigned char CODEBOOK_TYPE; + +static const size_t kCodeBookSize = 256; + +class NGram { + public: + // The maximum score of a lemma item. + static const LmaScoreType kMaxScore = 0x3fff; + + // In order to reduce the storage size, the original log value is amplified by + // kScoreAmplifier, and we use LmaScoreType to store. + // After this process, an item with a lower score has a higher frequency. + static const int kLogValueAmplifier = -800; + + // System words' total frequency. It is not the real total frequency, instead, + // It is only used to adjust system lemmas' scores when the user dictionary's + // total frequency changes. + // In this version, frequencies of system lemmas are fixed. We are considering + // to make them changable in next version. + static const size_t kSysDictTotalFreq = 100000000; + + private: + + static NGram* instance_; + + bool initialized_; + uint32 idx_num_; + + size_t total_freq_none_sys_; + + // Score compensation for system dictionary lemmas. + // Because after user adds some user lemmas, the total frequency changes, and + // we use this value to normalize the score. + float sys_score_compensation_; + +#ifdef ___BUILD_MODEL___ + double *freq_codes_df_; +#endif + LmaScoreType *freq_codes_; + CODEBOOK_TYPE *lma_freq_idx_; + + public: + NGram(); + ~NGram(); + + static NGram& get_instance(); + + bool save_ngram(FILE *fp); + bool load_ngram(FILE *fp); + + // Set the total frequency of all none system dictionaries. + void set_total_freq_none_sys(size_t freq_none_sys); + + float get_uni_psb(LemmaIdType lma_id); + + // Convert a probability to score. Actually, the score will be limited to + // kMaxScore, but at runtime, we also need float expression to get accurate + // value of the score. 
+ // After the conversion, a lower score indicates a higher probability of the + // item. + static float convert_psb_to_score(double psb); + +#ifdef ___BUILD_MODEL___ + // For constructing the unigram mode model. + bool build_unigram(LemmaEntry *lemma_arr, size_t num, + LemmaIdType next_idx_unused); +#endif +}; +} + +#endif // PINYINIME_INCLUDE_NGRAM_H__ diff --git a/googlepinyin/pinyinime.cpp b/googlepinyin/pinyinime.cpp new file mode 100644 index 0000000..2e98e9a --- /dev/null +++ b/googlepinyin/pinyinime.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "pinyinime.h" +#include "dicttrie.h" +#include "matrixsearch.h" +#include "spellingtrie.h" + +#ifdef __cplusplus +extern "C" { +#endif + + using namespace ime_pinyin; + + // The maximum number of the prediction items. + static const size_t kMaxPredictNum = 500; + + // Used to search Pinyin string and give the best candidate. 
+ MatrixSearch* matrix_search = NULL; + + char16 predict_buf[kMaxPredictNum][kMaxPredictSize + 1]; + + bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict) { + if (NULL != matrix_search) + delete matrix_search; + + matrix_search = new MatrixSearch(); + if (NULL == matrix_search) { + return false; + } + + return matrix_search->init(fn_sys_dict, fn_usr_dict); + } + + bool im_open_decoder_fd(int sys_fd, long start_offset, long length, + const char *fn_usr_dict) { + if (NULL != matrix_search) + delete matrix_search; + + matrix_search = new MatrixSearch(); + if (NULL == matrix_search) + return false; + + return matrix_search->init_fd(sys_fd, start_offset, length, fn_usr_dict); + } + + void im_close_decoder() { + if (NULL != matrix_search) { + matrix_search->close(); + delete matrix_search; + } + matrix_search = NULL; + } + + void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len) { + if (NULL != matrix_search) { + matrix_search->set_max_lens(max_sps_len, max_hzs_len); + } + } + + void im_flush_cache() { + if (NULL != matrix_search) + matrix_search->flush_cache(); + } + + // To be updated. 
+ size_t im_search(const char* pybuf, size_t pylen) { + if (NULL == matrix_search) + return 0; + + matrix_search->search(pybuf, pylen); + return matrix_search->get_candidate_num(); + } + + size_t im_delsearch(size_t pos, bool is_pos_in_splid, + bool clear_fixed_this_step) { + if (NULL == matrix_search) + return 0; + matrix_search->delsearch(pos, is_pos_in_splid, clear_fixed_this_step); + return matrix_search->get_candidate_num(); + } + + void im_reset_search() { + if (NULL == matrix_search) + return; + + matrix_search->reset_search(); + } + + // To be removed + size_t im_add_letter(char ch) { + return 0; + } + + const char* im_get_sps_str(size_t *decoded_len) { + if (NULL == matrix_search) + return NULL; + + return matrix_search->get_pystr(decoded_len); + } + + char16* im_get_candidate(size_t cand_id, char16* cand_str, + size_t max_len) { + if (NULL == matrix_search) + return NULL; + + return matrix_search->get_candidate(cand_id, cand_str, max_len); + } + + size_t im_get_spl_start_pos(const uint16 *&spl_start) { + if (NULL == matrix_search) + return 0; + + return matrix_search->get_spl_start(spl_start); + } + + size_t im_choose(size_t choice_id) { + if (NULL == matrix_search) + return 0; + + return matrix_search->choose(choice_id); + } + + size_t im_cancel_last_choice() { + if (NULL == matrix_search) + return 0; + + return matrix_search->cancel_last_choice(); + } + + size_t im_get_fixed_len() { + if (NULL == matrix_search) + return 0; + + return matrix_search->get_fixedlen(); + } + + // To be removed + bool im_cancel_input() { + return true; + } + + + size_t im_get_predicts(const char16 *his_buf, + char16 (*&pre_buf)[kMaxPredictSize + 1]) { + if (NULL == his_buf) + return 0; + + size_t fixed_len = utf16_strlen(his_buf); + const char16 *fixed_ptr = his_buf; + if (fixed_len > kMaxPredictSize) { + fixed_ptr += fixed_len - kMaxPredictSize; + fixed_len = kMaxPredictSize; + } + + pre_buf = predict_buf; + return matrix_search->get_predicts(his_buf, pre_buf, 
kMaxPredictNum); + } + + void im_enable_shm_as_szm(bool enable) { + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + spl_trie.szm_enable_shm(enable); + } + + void im_enable_ym_as_szm(bool enable) { + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + spl_trie.szm_enable_ym(enable); + } + + void im_init_user_dictionary(const char *fn_usr_dict) { + if (!matrix_search) + return; + matrix_search->flush_cache(); + matrix_search->init_user_dictionary(fn_usr_dict); + } + + bool im_is_user_dictionary_enabled(void) { + return NULL != matrix_search ? matrix_search->is_user_dictionary_enabled() : false; + } + +#ifdef __cplusplus +} +#endif diff --git a/googlepinyin/pinyinime.h b/googlepinyin/pinyinime.h new file mode 100644 index 0000000..e376c20 --- /dev/null +++ b/googlepinyin/pinyinime.h @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_ANDPYIME_H__ +#define PINYINIME_INCLUDE_ANDPYIME_H__ + +#include +#include "./dictdef.h" + +#ifdef __cplusplus +extern "C" { +#endif + + namespace ime_pinyin { + + /** + * Open the decoder engine via the system and user dictionary file names. + * + * @param fn_sys_dict The file name of the system dictionary. + * @param fn_usr_dict The file name of the user dictionary. + * @return true if open the decoder engine successfully. 
+ */ + bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict); + + /** + * Open the decoder engine via the system dictionary FD and user dictionary + * file name. Because on Android, the system dictionary is embedded in the + * whole application apk file. + * + * @param sys_fd The file in which the system dictionary is embedded. + * @param start_offset The starting position of the system dictionary in the + * file sys_fd. + * @param length The length of the system dictionary in the file sys_fd, + * counted in byte. + * @return true if succeed. + */ + bool im_open_decoder_fd(int sys_fd, long start_offset, long length, + const char *fn_usr_dict); + + /** + * Close the decoder engine. + */ + void im_close_decoder(); + + /** + * Set maximum limitations for decoding. If this function is not called, + * default values will be used. For example, due to screen size limitation, + * the UI engine of the IME can only show a certain number of letters(input) + * to decode, and a certain number of Chinese characters(output). If after + * user adds a new letter, the input or the output string is longer than the + * limitations, the engine will discard the recent letter. + * + * @param max_sps_len Maximum length of the spelling string(Pinyin string). + * @max_hzs_len Maximum length of the decoded Chinese character string. + */ + void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len); + + /** + * Flush cached data to persistent memory. Because at runtime, in order to + * achieve best performance, some data is only store in memory. + */ + void im_flush_cache(); + + /** + * Use a spelling string(Pinyin string) to search. The engine will try to do + * an incremental search based on its previous search result, so if the new + * string has the same prefix with the previous one stored in the decoder, + * the decoder will only continue the search from the end of the prefix. + * If the caller needs to do a brand new search, please call im_reset_search() + * first. 
Calling im_search() is equivalent to calling im_add_letter() one by + * one. + * + * @param sps_buf The spelling string buffer to decode. + * @param sps_len The length of the spelling string buffer. + * @return The number of candidates. + */ + size_t im_search(const char* sps_buf, size_t sps_len); + + /** + * Make a delete operation in the current search result, and make research if + * necessary. + * + * @param pos The posistion of char in spelling string to delete, or the + * position of spelling id in result string to delete. + * @param is_pos_in_splid Indicate whether the pos parameter is the position + * in the spelling string, or the position in the result spelling id string. + * @return The number of candidates. + */ + size_t im_delsearch(size_t pos, bool is_pos_in_splid, + bool clear_fixed_this_step); + + /** + * Reset the previous search result. + */ + void im_reset_search(); + + /** + * Add a Pinyin letter to the current spelling string kept by decoder. If the + * decoder fails in adding the letter, it will do nothing. im_get_sps_str() + * can be used to get the spelling string kept by decoder currently. + * + * @param ch The letter to add. + * @return The number of candidates. + */ + size_t im_add_letter(char ch); + + /** + * Get the spelling string kept by the decoder. + * + * @param decoded_len Used to return how many characters in the spelling + * string is successfully parsed. + * @return The spelling string kept by the decoder. + */ + const char *im_get_sps_str(size_t *decoded_len); + + /** + * Get a candidate(or choice) string. + * + * @param cand_id The id to get a candidate. Started from 0. Usually, id 0 + * is a sentence-level candidate. + * @param cand_str The buffer to store the candidate. + * @param max_len The maximum length of the buffer. + * @return cand_str if succeeds, otherwise NULL. 
+ */ + char16* im_get_candidate(size_t cand_id, char16* cand_str, + size_t max_len); + + /** + * Get the segmentation information(the starting positions) of the spelling + * string. + * + * @param spl_start Used to return the starting posistions. + * @return The number of spelling ids. If it is L, there will be L+1 valid + * elements in spl_start, and spl_start[L] is the posistion after the end of + * the last spelling id. + */ + size_t im_get_spl_start_pos(const uint16 *&spl_start); + + /** + * Choose a candidate and make it fixed. If the candidate does not match + * the end of all spelling ids, new candidates will be provided from the + * first unfixed position. If the candidate matches the end of the all + * spelling ids, there will be only one new candidates, or the whole fixed + * sentence. + * + * @param cand_id The id of candidate to select and make it fixed. + * @return The number of candidates. If after the selection, the whole result + * string has been fixed, there will be only one candidate. + */ + size_t im_choose(size_t cand_id); + + /** + * Cancel the last selection, or revert the last operation of im_choose(). + * + * @return The number of candidates. + */ + size_t im_cancel_last_choice(); + + /** + * Get the number of fixed spelling ids, or Chinese characters. + * + * @return The number of fixed spelling ids, of Chinese characters. + */ + size_t im_get_fixed_len(); + + /** + * Cancel the input state and reset the search workspace. + */ + bool im_cancel_input(); + + /** + * Get prediction candiates based on the given fixed Chinese string as the + * history. + * + * @param his_buf The history buffer to do the prediction. It should be ended + * with '\0'. + * @param pre_buf Used to return prediction result list. + * @return The number of predicted result string. + */ + size_t im_get_predicts(const char16 *his_buf, + char16 (*&pre_buf)[kMaxPredictSize + 1]); + + /** + * Enable Shengmus in ShouZiMu mode. 
+ */ + void im_enable_shm_as_szm(bool enable); + + /** + * Enable Yunmus in ShouZiMu mode. + */ + void im_enable_ym_as_szm(bool enable); + + /** + * Initializes or uninitializes the user dictionary. + * + * @param fn_usr_dict The file name of the user dictionary. + */ + void im_init_user_dictionary(const char *fn_usr_dict); + + /** + * Returns the current status of user dictinary. + */ + bool im_is_user_dictionary_enabled(void); +} + +#ifdef __cplusplus +} +#endif + +#endif // PINYINIME_INCLUDE_ANDPYIME_H__ diff --git a/googlepinyin/searchutility.cpp b/googlepinyin/searchutility.cpp new file mode 100644 index 0000000..7900d5c --- /dev/null +++ b/googlepinyin/searchutility.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "mystdlib.h" +#include "searchutility.h" + +namespace ime_pinyin { + +bool is_system_lemma(LemmaIdType lma_id) { + return (0 < lma_id && lma_id <= kSysDictIdEnd); +} + +bool is_user_lemma(LemmaIdType lma_id) { + return (kUserDictIdStart <= lma_id && lma_id <= kUserDictIdEnd); +} + +bool is_composing_lemma(LemmaIdType lma_id) { + return (kLemmaIdComposing == lma_id); +} + +int cmp_lpi_with_psb(const void *p1, const void *p2) { + if ((static_cast(p1))->psb > + (static_cast(p2))->psb) + return 1; + if ((static_cast(p1))->psb < + (static_cast(p2))->psb) + return -1; + return 0; +} + +int cmp_lpi_with_unified_psb(const void *p1, const void *p2) { + const LmaPsbItem *item1 = static_cast(p1); + const LmaPsbItem *item2 = static_cast(p2); + + // The real unified psb is psb1 / lma_len1 and psb2 * lma_len2 + // But we use psb1 * lma_len2 and psb2 * lma_len1 to get better + // precision. + size_t up1 = item1->psb * (item2->lma_len); + size_t up2 = item2->psb * (item1->lma_len); + if (up1 < up2) { + return -1; + } + if (up1 > up2) { + return 1; + } + return 0; +} + +int cmp_lpi_with_id(const void *p1, const void *p2) { + if ((static_cast(p1))->id < + (static_cast(p2))->id) + return -1; + if ((static_cast(p1))->id > + (static_cast(p2))->id) + return 1; + return 0; +} + +int cmp_lpi_with_hanzi(const void *p1, const void *p2) { + if ((static_cast(p1))->hanzi < + (static_cast(p2))->hanzi) + return -1; + if ((static_cast(p1))->hanzi > + (static_cast(p2))->hanzi) + return 1; + + return 0; +} + +int cmp_lpsi_with_str(const void *p1, const void *p2) { + return utf16_strcmp((static_cast(p1))->str, + (static_cast(p2))->str); +} + + +int cmp_hanzis_1(const void *p1, const void *p2) { + if (*static_cast(p1) < + *static_cast(p2)) + return -1; + + if (*static_cast(p1) > + *static_cast(p2)) + return 1; + return 0; +} + +int cmp_hanzis_2(const void *p1, const void *p2) { + return utf16_strncmp(static_cast(p1), + static_cast(p2), 2); +} + +int cmp_hanzis_3(const void 
*p1, const void *p2) { + return utf16_strncmp(static_cast(p1), + static_cast(p2), 3); +} + +int cmp_hanzis_4(const void *p1, const void *p2) { + return utf16_strncmp(static_cast(p1), + static_cast(p2), 4); +} + +int cmp_hanzis_5(const void *p1, const void *p2) { + return utf16_strncmp(static_cast(p1), + static_cast(p2), 5); +} + +int cmp_hanzis_6(const void *p1, const void *p2) { + return utf16_strncmp(static_cast(p1), + static_cast(p2), 6); +} + +int cmp_hanzis_7(const void *p1, const void *p2) { + return utf16_strncmp(static_cast(p1), + static_cast(p2), 7); +} + +int cmp_hanzis_8(const void *p1, const void *p2) { + return utf16_strncmp(static_cast(p1), + static_cast(p2), 8); +} + +int cmp_npre_by_score(const void *p1, const void *p2) { + if ((static_cast(p1))->psb > + (static_cast(p2))->psb) + return 1; + + if ((static_cast(p1))->psb < + (static_cast(p2))->psb) + return -1; + + return 0; +} + +int cmp_npre_by_hislen_score(const void *p1, const void *p2) { + if ((static_cast(p1))->his_len < + (static_cast(p2))->his_len) + return 1; + + if ((static_cast(p1))->his_len > + (static_cast(p2))->his_len) + return -1; + + if ((static_cast(p1))->psb > + (static_cast(p2))->psb) + return 1; + + if ((static_cast(p1))->psb < + (static_cast(p2))->psb) + return -1; + + return 0; +} + +int cmp_npre_by_hanzi_score(const void *p1, const void *p2) { + int ret_v = (utf16_strncmp((static_cast(p1))->pre_hzs, + (static_cast(p2))->pre_hzs, kMaxPredictSize)); + if (0 != ret_v) + return ret_v; + + if ((static_cast(p1))->psb > + (static_cast(p2))->psb) + return 1; + + if ((static_cast(p1))->psb < + (static_cast(p2))->psb) + return -1; + + return 0; +} + +size_t remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num) { + if (NULL == npre_items || 0 == npre_num) + return 0; + + myqsort(npre_items, npre_num, sizeof(NPredictItem), cmp_npre_by_hanzi_score); + + size_t remain_num = 1; // The first one is reserved. 
+ for (size_t pos = 1; pos < npre_num; pos++) { + if (utf16_strncmp(npre_items[pos].pre_hzs, + npre_items[remain_num - 1].pre_hzs, + kMaxPredictSize) != 0) { + if (remain_num != pos) { + npre_items[remain_num] = npre_items[pos]; + } + remain_num++; + } + } + return remain_num; +} + +size_t align_to_size_t(size_t size) { + size_t s = sizeof(size_t); + return (size + s -1) / s * s; +} + +} // namespace ime_pinyin diff --git a/googlepinyin/searchutility.h b/googlepinyin/searchutility.h new file mode 100644 index 0000000..f135710 --- /dev/null +++ b/googlepinyin/searchutility.h @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ +#define PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ + +#include +#include "./spellingtrie.h" + +namespace ime_pinyin { + +// Type used to identify the size of a pool, such as id pool, etc. +typedef uint16 PoolPosType; + +// Type used to identify a parsing mile stone in an atom dictionary. +typedef uint16 MileStoneHandle; + +// Type used to express a lemma and its probability score. +typedef struct { + size_t id:(kLemmaIdSize * 8); + size_t lma_len:4; + uint16 psb; // The score, the lower psb, the higher possibility. + // For single character items, we may also need Hanzi. + // For multiple characer items, ignore it. + char16 hanzi; +} LmaPsbItem, *PLmaPsbItem; + +// LmaPsbItem extended with string. 
+typedef struct { + LmaPsbItem lpi; + char16 str[kMaxLemmaSize + 1]; +} LmaPsbStrItem, *PLmaPsbStrItem; + + +typedef struct { + float psb; + char16 pre_hzs[kMaxPredictSize]; + uint16 his_len; // The length of the history used to do the prediction. +} NPredictItem, *PNPredictItem; + +// Parameter structure used to extend in a dictionary. All dictionaries +// receives the same DictExtPara and a dictionary specific MileStoneHandle for +// extending. +// +// When the user inputs a new character, AtomDictBase::extend_dict() will be +// called at least once for each dictionary. +// +// For example, when the user inputs "wm", extend_dict() will be called twice, +// and the DictExtPara parameter are as follows respectively: +// 1. splids = {w, m}; splids_extended = 1; ext_len = 1; step_no = 1; +// splid_end_split = false; id_start = wa(the first id start with 'w'); +// id_num = number of ids starting with 'w'. +// 2. splids = {m}; splids_extended = 0; ext_len = 1; step_no = 1; +// splid_end_split = false; id_start = wa; id_num = number of ids starting with +// 'w'. +// +// For string "women", one of the cases of the DictExtPara parameter is: +// splids = {wo, men}, splids_extended = 1, ext_len = 3 (length of "men"), +// step_no = 4; splid_end_split = false; id_start = men, id_num = 1. +// +typedef struct { + // Spelling ids for extending, there are splids_extended + 1 ids in the + // buffer. + // For a normal lemma, there can only be kMaxLemmaSize spelling ids in max, + // but for a composing phrase, there can kMaxSearchSteps spelling ids. + uint16 splids[kMaxSearchSteps]; + + // Number of ids that have been used before. splids[splids_extended] is the + // newly added id for the current extension. + uint16 splids_extended; + + // The step span of the extension. It is also the size of the string for + // the newly added spelling id. + uint16 ext_len; + + // The step number for the current extension. 
It is also the ending position + // in the input Pinyin string for the substring of spelling ids in splids[]. + // For example, when the user inputs "women", step_no = 4. + // This parameter may useful to manage the MileStoneHandle list for each + // step. When the user deletes a character from the string, MileStoneHandle + // objects for the the steps after that character should be reset; when the + // user begins a new string, all MileStoneHandle objects should be reset. + uint16 step_no; + + // Indicate whether the newly added spelling ends with a splitting character + bool splid_end_split; + + // If the newly added id is a half id, id_start is the first id of the + // corresponding full ids; if the newly added id is a full id, id_start is + // that id. + uint16 id_start; + + // If the newly added id is a half id, id_num is the number of corresponding + // ids; if it is a full id, id_num == 1. + uint16 id_num; +}DictExtPara, *PDictExtPara; + +bool is_system_lemma(LemmaIdType lma_id); +bool is_user_lemma(LemmaIdType lma_id); +bool is_composing_lemma(LemmaIdType lma_id); + +int cmp_lpi_with_psb(const void *p1, const void *p2); +int cmp_lpi_with_unified_psb(const void *p1, const void *p2); +int cmp_lpi_with_id(const void *p1, const void *p2); +int cmp_lpi_with_hanzi(const void *p1, const void *p2); + +int cmp_lpsi_with_str(const void *p1, const void *p2); + +int cmp_hanzis_1(const void *p1, const void *p2); +int cmp_hanzis_2(const void *p1, const void *p2); +int cmp_hanzis_3(const void *p1, const void *p2); +int cmp_hanzis_4(const void *p1, const void *p2); +int cmp_hanzis_5(const void *p1, const void *p2); +int cmp_hanzis_6(const void *p1, const void *p2); +int cmp_hanzis_7(const void *p1, const void *p2); +int cmp_hanzis_8(const void *p1, const void *p2); + +int cmp_npre_by_score(const void *p1, const void *p2); +int cmp_npre_by_hislen_score(const void *p1, const void *p2); +int cmp_npre_by_hanzi_score(const void *p1, const void *p2); + + +size_t 
remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num); + +size_t align_to_size_t(size_t size); + +} // namespace + +#endif // PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ diff --git a/googlepinyin/spellingtable.cpp b/googlepinyin/spellingtable.cpp new file mode 100644 index 0000000..78ae51f --- /dev/null +++ b/googlepinyin/spellingtable.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "spellingtable.h" + +namespace ime_pinyin { + +#ifdef ___BUILD_MODEL___ + +const char SpellingTable:: + kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"}; + +// "" is the biggest, so that all empty strings will be moved to the end +// _eb mean empty is biggest +int compare_raw_spl_eb(const void* p1, const void* p2) { + if ('\0' == (static_cast(p1))->str[0]) + return 1; + + if ('\0' == (static_cast(p2))->str[0]) + return -1; + + return strcmp((static_cast(p1))->str, + (static_cast(p2))->str); +} + +size_t get_odd_next(size_t value) { + size_t v_next = value; + while (true) { + size_t v_next_sqrt = (size_t)sqrt(v_next); + + bool is_odd = true; + for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) { + if (v_next % v_dv == 0) { + is_odd = false; + break; + } + } + + if (is_odd) + return v_next; + + v_next++; + } + + // never reach here + return 0; +} + +SpellingTable::SpellingTable() { + need_score_ = 
false; + raw_spellings_ = NULL; + spelling_buf_ = NULL; + spelling_num_ = 0; + total_freq_ = 0; + frozen_ = true; +} + +SpellingTable::~SpellingTable() { + free_resource(); +} + +size_t SpellingTable::get_hash_pos(const char* spelling_str) { + size_t hash_pos = 0; + for (size_t pos = 0; pos < spelling_size_; pos++) { + if ('\0' == spelling_str[pos]) + break; + hash_pos += (size_t)spelling_str[pos]; + } + + hash_pos = hash_pos % spelling_max_num_; + return hash_pos; +} + +size_t SpellingTable::hash_pos_next(size_t hash_pos) { + hash_pos += 123; + hash_pos = hash_pos % spelling_max_num_; + return hash_pos; +} + +void SpellingTable::free_resource() { + if (NULL != raw_spellings_) + delete [] raw_spellings_; + raw_spellings_ = NULL; + + if (NULL != spelling_buf_) + delete [] spelling_buf_; + spelling_buf_ = NULL; +} + +bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num, + bool need_score) { + if (pure_spl_size == 0 || spl_max_num ==0) + return false; + + need_score_ = need_score; + + free_resource(); + + spelling_size_ = pure_spl_size + 1; + if (need_score) + spelling_size_ += 1; + spelling_max_num_ = get_odd_next(spl_max_num); + spelling_num_ = 0; + + raw_spellings_ = new RawSpelling[spelling_max_num_]; + spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)]; + if (NULL == raw_spellings_ || NULL == spelling_buf_) { + free_resource(); + return false; + } + + memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling)); + memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_)); + frozen_ = false; + total_freq_ = 0; + return true; +} + +bool SpellingTable::put_spelling(const char* spelling_str, double freq) { + if (frozen_ || NULL == spelling_str) + return false; + + for (size_t pos = 0; pos < kNotSupportNum; pos++) { + if (strcmp(spelling_str, kNotSupportList[pos]) == 0) { + return false; + } + } + + total_freq_ += freq; + + size_t hash_pos = get_hash_pos(spelling_str); + + raw_spellings_[hash_pos].str[spelling_size_ - 1] = 
'\0'; + + if (strncmp(raw_spellings_[hash_pos].str, spelling_str, + spelling_size_ - 1) == 0) { + raw_spellings_[hash_pos].freq += freq; + return true; + } + + size_t hash_pos_ori = hash_pos; + + while (true) { + if (strncmp(raw_spellings_[hash_pos].str, + spelling_str, spelling_size_ - 1) == 0) { + raw_spellings_[hash_pos].freq += freq; + return true; + } + + if ('\0' == raw_spellings_[hash_pos].str[0]) { + raw_spellings_[hash_pos].freq += freq; + strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1); + raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; + spelling_num_++; + return true; + } + + hash_pos = hash_pos_next(hash_pos); + if (hash_pos_ori == hash_pos) + return false; + } + + // never reach here + return false; +} + +bool SpellingTable::contain(const char* spelling_str) { + if (NULL == spelling_str || NULL == spelling_buf_ || frozen_) + return false; + + size_t hash_pos = get_hash_pos(spelling_str); + + if ('\0' == raw_spellings_[hash_pos].str[0]) + return false; + + if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) + == 0) + return true; + + size_t hash_pos_ori = hash_pos; + + while (true) { + hash_pos = hash_pos_next(hash_pos); + if (hash_pos_ori == hash_pos) + return false; + + if ('\0' == raw_spellings_[hash_pos].str[0]) + return false; + + if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) + == 0) + return true; + } + + // never reach here + return false; +} + +const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) { + if (NULL == raw_spellings_ || NULL == spelling_buf_ || + NULL == item_size || NULL == spl_num) + return NULL; + + qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling), + compare_raw_spl_eb); + + // After sorting, only the first spelling_num_ items are valid. + // Copy them to the destination buffer. 
+ for (size_t pos = 0; pos < spelling_num_; pos++) { + strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str, + spelling_size_); + } + + if (need_score_) { + if (kPrintDebug0) + printf("------------Spelling Possiblities--------------\n"); + + double max_score = 0; + double min_score = 0; + + // After sorting, only the first spelling_num_ items are valid. + for (size_t pos = 0; pos < spelling_num_; pos++) { + raw_spellings_[pos].freq /= total_freq_; + if (need_score_) { + if (0 == pos) { + max_score = raw_spellings_[0].freq; + min_score = max_score; + } else { + if (raw_spellings_[pos].freq > max_score) + max_score = raw_spellings_[pos].freq; + if (raw_spellings_[pos].freq < min_score) + min_score = raw_spellings_[pos].freq; + } + } + } + + if (kPrintDebug0) + printf("-----max psb: %f, min psb: %f\n", max_score, min_score); + + max_score = log(max_score); + min_score = log(min_score); + + if (kPrintDebug0) + printf("-----max log value: %f, min log value: %f\n", + max_score, min_score); + + // The absolute value of min_score is bigger than that of max_score because + // both of them are negative after log function. + score_amplifier_ = 1.0 * 255 / min_score; + + double average_score = 0; + for (size_t pos = 0; pos < spelling_num_; pos++) { + double score = log(raw_spellings_[pos].freq) * score_amplifier_; + assert(score >= 0); + + average_score += score; + + // Because of calculation precision issue, score might be a little bigger + // than 255 after being amplified. 
+ if (score > 255) + score = 255; + char *this_spl_buf = spelling_buf_ + pos * spelling_size_; + this_spl_buf[spelling_size_ - 1] = + static_cast((unsigned char)score); + + if (kPrintDebug0) { + printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf, + (unsigned char)this_spl_buf[spelling_size_ -1]); + } + } + average_score /= spelling_num_; + assert(average_score <= 255); + average_score_ = static_cast(average_score); + + if (kPrintDebug0) + printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_, + average_score_); + } + + *item_size = spelling_size_; + *spl_num = spelling_num_; + frozen_ = true; + return spelling_buf_; +} + +float SpellingTable::get_score_amplifier() { + return static_cast(score_amplifier_); +} + +unsigned char SpellingTable::get_average_score() { + return average_score_; +} + +#endif // ___BUILD_MODEL___ +} // namespace ime_pinyin diff --git a/googlepinyin/spellingtable.h b/googlepinyin/spellingtable.h new file mode 100644 index 0000000..fd79c6e --- /dev/null +++ b/googlepinyin/spellingtable.h @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PINYINIME_INCLUDE_SPELLINGTABLE_H__ +#define PINYINIME_INCLUDE_SPELLINGTABLE_H__ + +#include +#include "./dictdef.h" + +namespace ime_pinyin { + +#ifdef ___BUILD_MODEL___ + +const size_t kMaxSpellingSize = kMaxPinyinSize; + +typedef struct { + char str[kMaxSpellingSize + 1]; + double freq; +} RawSpelling, *PRawSpelling; + +// This class is used to store the spelling strings +// The length of the input spelling string should be less or equal to the +// spelling_size_ (set by init_table). If the input string is too long, +// we only keep its first spelling_size_ chars. +class SpellingTable { + private: + static const size_t kNotSupportNum = 3; + static const char kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1]; + + bool need_score_; + + size_t spelling_max_num_; + + RawSpelling *raw_spellings_; + + // Used to store spelling strings. If the spelling table needs to calculate + // score, an extra char after each spelling string is the score. + // An item with a lower score has a higher probability. + char *spelling_buf_; + size_t spelling_size_; + + double total_freq_; + + size_t spelling_num_; + + double score_amplifier_; + + unsigned char average_score_; + + // If frozen is true, put_spelling() and contain() are not allowed to call. + bool frozen_; + + size_t get_hash_pos(const char* spelling_str); + size_t hash_pos_next(size_t hash_pos); + void free_resource(); + public: + SpellingTable(); + ~SpellingTable(); + + // pure_spl_size is the pure maximum spelling string size. For example, + // "zhuang" is the longgest item in Pinyin, so pure_spl_size should be 6. + // spl_max_num is the maximum number of spelling strings to store. + // need_score is used to indicate whether the caller needs to calculate a + // score for each spelling. + bool init_table(size_t pure_spl_size, size_t spl_max_num, bool need_score); + + // Put a spelling string to the table. + // It always returns false if called after arrange() withtout a new + // init_table() operation. 
+ // freq is the spelling's occuring count. + // If the spelling has been in the table, occuring count will accumulated. + bool put_spelling(const char* spelling_str, double spl_count); + + // Test whether a spelling string is in the table. + // It always returns false, when being called after arrange() withtout a new + // init_table() operation. + bool contain(const char* spelling_str); + + // Sort the spelling strings and put them from the begin of the buffer. + // Return the pointer of the sorted spelling strings. + // item_size and spl_num return the item size and number of spelling. + // Because each spelling uses a '\0' as terminator, the returned item_size is + // at least one char longer than the spl_size parameter specified by + // init_table(). If the table is initialized to calculate score, item_size + // will be increased by 1, and current_spl_str[item_size - 1] stores an + // unsinged char score. + // An item with a lower score has a higher probability. + // Do not call put_spelling() and contains() after arrange(). + const char* arrange(size_t *item_size, size_t *spl_num); + + float get_score_amplifier(); + + unsigned char get_average_score(); +}; +#endif // ___BUILD_MODEL___ +} + +#endif // PINYINIME_INCLUDE_SPELLINGTABLE_H__ diff --git a/googlepinyin/spellingtrie.cpp b/googlepinyin/spellingtrie.cpp new file mode 100644 index 0000000..7b03b6f --- /dev/null +++ b/googlepinyin/spellingtrie.cpp @@ -0,0 +1,832 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "dictdef.h" + +#ifdef _WIN32 +#define snprintf _snprintf +#endif + +#ifdef ___BUILD_MODEL___ +#include "spellingtable.h" +#endif + +#include "spellingtrie.h" + +namespace ime_pinyin { + +SpellingTrie* SpellingTrie::instance_ = NULL; + +// z/c/s is for Zh/Ch/Sh +const char SpellingTrie::kHalfId2Sc_[kFullSplIdStart + 1] = + "0ABCcDEFGHIJKLMNOPQRSsTUVWXYZz"; + +// Bit 0 : is it a Shengmu char? +// Bit 1 : is it a Yunmu char? (one char is a Yunmu) +// Bit 2 : is it enabled in ShouZiMu(first char) mode? +unsigned char SpellingTrie::char_flags_[] = { + // a b c d e f g + 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, + // h i j k l m n + 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, + // o p q r s t + 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, + // u v w x y z + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01 +}; + +int compare_spl(const void* p1, const void* p2) { + return strcmp((const char*)(p1), (const char*)(p2)); +} + +SpellingTrie::SpellingTrie() { + spelling_buf_ = NULL; + spelling_size_ = 0; + spelling_num_ = 0; + spl_ym_ids_ = NULL; + splstr_queried_ = NULL; + splstr16_queried_ = NULL; + root_ = NULL; + dumb_node_ = NULL; + splitter_node_ = NULL; + instance_ = NULL; + ym_buf_ = NULL; + f2h_ = NULL; + + szm_enable_shm(true); + szm_enable_ym(true); + +#ifdef ___BUILD_MODEL___ + node_num_ = 0; +#endif +} + +SpellingTrie::~SpellingTrie() { + if (NULL != spelling_buf_) + delete [] spelling_buf_; + + if (NULL != splstr_queried_) + delete [] splstr_queried_; + + if (NULL != splstr16_queried_) + delete [] splstr16_queried_; + + if (NULL != spl_ym_ids_) + delete [] spl_ym_ids_; + + if (NULL != root_) { + free_son_trie(root_); + delete root_; + } + + if (NULL != dumb_node_) { + delete [] dumb_node_; + } + + if (NULL != splitter_node_) { + delete [] splitter_node_; + } + + if (NULL != instance_) { + delete instance_; + instance_ = NULL; + } + + if 
(NULL != ym_buf_) + delete [] ym_buf_; + + if (NULL != f2h_) + delete [] f2h_; +} + +bool SpellingTrie::if_valid_id_update(uint16 *splid) const { + if (NULL == splid || 0 == *splid) + return false; + + if (*splid >= kFullSplIdStart) + return true; + if (*splid < kFullSplIdStart) { + char ch = kHalfId2Sc_[*splid]; + if (ch > 'Z') { + return true; + } else { + if (szm_is_enabled(ch)) { + return true; + } else if (is_yunmu_char(ch)) { + assert(h2f_num_[*splid] > 0); + *splid = h2f_start_[*splid]; + return true; + } + } + } + return false; +} + +bool SpellingTrie::is_half_id(uint16 splid) const { + if (0 == splid || splid >= kFullSplIdStart) + return false; + + return true; +} + +bool SpellingTrie::is_full_id(uint16 splid) const { + if (splid < kFullSplIdStart || splid >= kFullSplIdStart + spelling_num_) + return false; + return true; +} + +bool SpellingTrie::half_full_compatible(uint16 half_id, uint16 full_id) const { + uint16 half_fr_full = full_to_half(full_id); + + if (half_fr_full == half_id) + return true; + + // &~0x20 is used to conver the char to upper case. + // So that Zh/Ch/Sh(whose char is z/c/s) can be matched with Z/C/S. 
+ char ch_f = (kHalfId2Sc_[half_fr_full] & (~0x20)); + char ch_h = kHalfId2Sc_[half_id]; + if (ch_f == ch_h) + return true; + + return false; +} + +bool SpellingTrie::is_half_id_yunmu(uint16 splid) const { + if (0 == splid || splid >= kFullSplIdStart) + return false; + + char ch = kHalfId2Sc_[splid]; + // If ch >= 'a', that means the half id is one of Zh/Ch/Sh + if (ch >= 'a') { + return false; + } + + return char_flags_[ch - 'A'] & kHalfIdYunmuMask; +} + +bool SpellingTrie::is_shengmu_char(char ch) const { + return char_flags_[ch - 'A'] & kHalfIdShengmuMask; +} + +bool SpellingTrie::is_yunmu_char(char ch) const { + return char_flags_[ch - 'A'] & kHalfIdYunmuMask; +} + +bool SpellingTrie::is_szm_char(char ch) const { + return is_shengmu_char(ch) || is_yunmu_char(ch); +} + +bool SpellingTrie::szm_is_enabled(char ch) const { + return char_flags_[ch - 'A'] & kHalfIdSzmMask; +} + +void SpellingTrie::szm_enable_shm(bool enable) { + if (enable) { + for (char ch = 'A'; ch <= 'Z'; ch++) { + if (is_shengmu_char(ch)) + char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask; + } + } else { + for (char ch = 'A'; ch <= 'Z'; ch++) { + if (is_shengmu_char(ch)) + char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff); + } + } +} + +void SpellingTrie::szm_enable_ym(bool enable) { + if (enable) { + for (char ch = 'A'; ch <= 'Z'; ch++) { + if (is_yunmu_char(ch)) + char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask; + } + } else { + for (char ch = 'A'; ch <= 'Z'; ch++) { + if (is_yunmu_char(ch)) + char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff); + } + } +} + +bool SpellingTrie::is_szm_enabled(char ch) const { + return char_flags_[ch - 'A'] & kHalfIdSzmMask; +} + +const SpellingTrie* SpellingTrie::get_cpinstance() { + return &get_instance(); +} + +SpellingTrie& SpellingTrie::get_instance() { + if (NULL == instance_) + instance_ = new SpellingTrie(); + + return *instance_; +} + +uint16 SpellingTrie::half2full_num(uint16 
half_id) const { + if (NULL == root_ || half_id >= kFullSplIdStart) + return 0; + return h2f_num_[half_id]; +} + +uint16 SpellingTrie::half_to_full(uint16 half_id, uint16 *spl_id_start) const { + if (NULL == spl_id_start || NULL == root_ || half_id >= kFullSplIdStart) + return 0; + + *spl_id_start = h2f_start_[half_id]; + return h2f_num_[half_id]; +} + +uint16 SpellingTrie::full_to_half(uint16 full_id) const { + if (NULL == root_ || full_id < kFullSplIdStart || + full_id > spelling_num_ + kFullSplIdStart) + return 0; + + return f2h_[full_id - kFullSplIdStart]; +} + +void SpellingTrie::free_son_trie(SpellingNode* node) { + if (NULL == node) + return; + + for (size_t pos = 0; pos < node->num_of_son; pos++) { + free_son_trie(node->first_son + pos); + } + + if (NULL != node->first_son) + delete [] node->first_son; +} + +bool SpellingTrie::construct(const char* spelling_arr, size_t item_size, + size_t item_num, float score_amplifier, + unsigned char average_score) { + if (spelling_arr == NULL) + return false; + + memset(h2f_start_, 0, sizeof(uint16) * kFullSplIdStart); + memset(h2f_num_, 0, sizeof(uint16) * kFullSplIdStart); + + // If the arr is the same as the buf, means this function is called by + // load_table(), the table data are ready; otherwise the array should be + // saved. 
+ if (spelling_arr != spelling_buf_) { + if (NULL != spelling_buf_) + delete [] spelling_buf_; + spelling_buf_ = new char[item_size * item_num]; + if (NULL == spelling_buf_) + return false; + memcpy(spelling_buf_, spelling_arr, sizeof(char) * item_size * item_num); + } + + spelling_size_ = item_size; + spelling_num_ = item_num; + + score_amplifier_ = score_amplifier; + average_score_ = average_score; + + if (NULL != splstr_queried_) + delete [] splstr_queried_; + splstr_queried_ = new char[spelling_size_]; + if (NULL == splstr_queried_) + return false; + + if (NULL != splstr16_queried_) + delete [] splstr16_queried_; + splstr16_queried_ = new char16[spelling_size_]; + if (NULL == splstr16_queried_) + return false; + + // First, sort the buf to ensure they are in ascendant order + qsort(spelling_buf_, spelling_num_, spelling_size_, compare_spl); + +#ifdef ___BUILD_MODEL___ + node_num_ = 1; +#endif + + root_ = new SpellingNode(); + memset(root_, 0, sizeof(SpellingNode)); + + dumb_node_ = new SpellingNode(); + memset(dumb_node_, 0, sizeof(SpellingNode)); + dumb_node_->score = average_score_; + + splitter_node_ = new SpellingNode(); + memset(splitter_node_, 0, sizeof(SpellingNode)); + splitter_node_->score = average_score_; + + memset(level1_sons_, 0, sizeof(SpellingNode*) * kValidSplCharNum); + + root_->first_son = construct_spellings_subset(0, spelling_num_, 0, root_); + + // Root's score should be cleared. 
+ root_->score = 0; + + if (NULL == root_->first_son) + return false; + + h2f_start_[0] = h2f_num_[0] = 0; + + if (!build_f2h()) + return false; + +#ifdef ___BUILD_MODEL___ + if (kPrintDebug0) { + printf("---SpellingTrie Nodes: %d\n", (int)node_num_); + } + return build_ym_info(); +#else + return true; +#endif +} + +#ifdef ___BUILD_MODEL___ +const char* SpellingTrie::get_ym_str(const char *spl_str) { + bool start_ZCS = false; + if (is_shengmu_char(*spl_str)) { + if ('Z' == *spl_str || 'C' == *spl_str || 'S' == *spl_str) + start_ZCS = true; + spl_str += 1; + if (start_ZCS && 'h' == *spl_str) + spl_str += 1; + } + return spl_str; +} + +bool SpellingTrie::build_ym_info() { + bool sucess; + SpellingTable *spl_table = new SpellingTable(); + + sucess = spl_table->init_table(kMaxPinyinSize - 1, 2 * kMaxYmNum, false); + assert(sucess); + + for (uint16 pos = 0; pos < spelling_num_; pos++) { + const char *spl_str = spelling_buf_ + spelling_size_ * pos; + spl_str = get_ym_str(spl_str); + if ('\0' != spl_str[0]) { + sucess = spl_table->put_spelling(spl_str, 0); + assert(sucess); + } + } + + size_t ym_item_size; // '\0' is included + size_t ym_num; + const char* ym_buf; + ym_buf = spl_table->arrange(&ym_item_size, &ym_num); + + if (NULL != ym_buf_) + delete [] ym_buf_; + ym_buf_ = new char[ym_item_size * ym_num]; + if (NULL == ym_buf_) { + delete spl_table; + return false; + } + + memcpy(ym_buf_, ym_buf, sizeof(char) * ym_item_size * ym_num); + ym_size_ = ym_item_size; + ym_num_ = ym_num; + + delete spl_table; + + // Generate the maping from the spelling ids to the Yunmu ids. 
+ if (spl_ym_ids_) + delete spl_ym_ids_; + spl_ym_ids_ = new uint8[spelling_num_ + kFullSplIdStart]; + if (NULL == spl_ym_ids_) + return false; + + memset(spl_ym_ids_, 0, sizeof(uint8) * (spelling_num_ + kFullSplIdStart)); + + for (uint16 id = 1; id < spelling_num_ + kFullSplIdStart; id++) { + const char *str = get_spelling_str(id); + + str = get_ym_str(str); + if ('\0' != str[0]) { + uint8 ym_id = get_ym_id(str); + spl_ym_ids_[id] = ym_id; + assert(ym_id > 0); + } else { + spl_ym_ids_[id] = 0; + } + } + return true; +} +#endif + +SpellingNode* SpellingTrie::construct_spellings_subset( + size_t item_start, size_t item_end, size_t level, SpellingNode* parent) { + if (level >= spelling_size_ || item_end <= item_start || NULL == parent) + return NULL; + + SpellingNode *first_son = NULL; + uint16 num_of_son = 0; + unsigned char min_son_score = 255; + + const char *spelling_last_start = spelling_buf_ + spelling_size_ * item_start; + char char_for_node = spelling_last_start[level]; + assert((char_for_node >= 'A' && char_for_node <= 'Z') || + 'h' == char_for_node); + + // Scan the array to find how many sons + for (size_t i = item_start + 1; i < item_end; i++) { + const char *spelling_current = spelling_buf_ + spelling_size_ * i; + char char_current = spelling_current[level]; + if (char_current != char_for_node) { + num_of_son++; + char_for_node = char_current; + } + } + num_of_son++; + + // Allocate memory +#ifdef ___BUILD_MODEL___ + node_num_ += num_of_son; +#endif + first_son = new SpellingNode[num_of_son]; + memset(first_son, 0, sizeof(SpellingNode)*num_of_son); + + // Now begin construct tree + size_t son_pos = 0; + + spelling_last_start = spelling_buf_ + spelling_size_ * item_start; + char_for_node = spelling_last_start[level]; + + bool spelling_endable = true; + if (spelling_last_start[level + 1] != '\0') + spelling_endable = false; + + size_t item_start_next = item_start; + + for (size_t i = item_start + 1; i < item_end; i++) { + const char *spelling_current = 
spelling_buf_ + spelling_size_ * i; + char char_current = spelling_current[level]; + assert(is_valid_spl_char(char_current)); + + if (char_current != char_for_node) { + // Construct a node + SpellingNode *node_current = first_son + son_pos; + node_current->char_this_node = char_for_node; + + // For quick search in the first level + if (0 == level) + level1_sons_[char_for_node - 'A'] = node_current; + + if (spelling_endable) { + node_current->spelling_idx = kFullSplIdStart + item_start_next; + } + + if (spelling_last_start[level + 1] != '\0' || i - item_start_next > 1) { + size_t real_start = item_start_next; + if (spelling_last_start[level + 1] == '\0') + real_start++; + + node_current->first_son = + construct_spellings_subset(real_start, i, level + 1, + node_current); + + if (real_start == item_start_next + 1) { + uint16 score_this = static_cast( + spelling_last_start[spelling_size_ - 1]); + if (score_this < node_current->score) + node_current->score = score_this; + } + } else { + node_current->first_son = NULL; + node_current->score = static_cast( + spelling_last_start[spelling_size_ - 1]); + } + + if (node_current->score < min_son_score) + min_son_score = node_current->score; + + bool is_half = false; + if (level == 0 && is_szm_char(char_for_node)) { + node_current->spelling_idx = + static_cast(char_for_node - 'A' + 1); + + if (char_for_node > 'C') + node_current->spelling_idx++; + if (char_for_node > 'S') + node_current->spelling_idx++; + + h2f_num_[node_current->spelling_idx] = i - item_start_next; + is_half = true; + } else if (level == 1 && char_for_node == 'h') { + char ch_level0 = spelling_last_start[0]; + uint16 part_id = 0; + if (ch_level0 == 'C') + part_id = 'C' - 'A' + 1 + 1; + else if (ch_level0 == 'S') + part_id = 'S' - 'A' + 1 + 2; + else if (ch_level0 == 'Z') + part_id = 'Z' - 'A' + 1 + 3; + if (0 != part_id) { + node_current->spelling_idx = part_id; + h2f_num_[node_current->spelling_idx] = i - item_start_next; + is_half = true; + } + } + + if 
(is_half) { + if (h2f_num_[node_current->spelling_idx] > 0) + h2f_start_[node_current->spelling_idx] = + item_start_next + kFullSplIdStart; + else + h2f_start_[node_current->spelling_idx] = 0; + } + + // for next sibling + spelling_last_start = spelling_current; + char_for_node = char_current; + item_start_next = i; + spelling_endable = true; + if (spelling_current[level + 1] != '\0') + spelling_endable = false; + + son_pos++; + } + } + + // the last one + SpellingNode *node_current = first_son + son_pos; + node_current->char_this_node = char_for_node; + + // For quick search in the first level + if (0 == level) + level1_sons_[char_for_node - 'A'] = node_current; + + if (spelling_endable) { + node_current->spelling_idx = kFullSplIdStart + item_start_next; + } + + if (spelling_last_start[level + 1] != '\0' || + item_end - item_start_next > 1) { + size_t real_start = item_start_next; + if (spelling_last_start[level + 1] == '\0') + real_start++; + + node_current->first_son = + construct_spellings_subset(real_start, item_end, level + 1, + node_current); + + if (real_start == item_start_next + 1) { + uint16 score_this = static_cast( + spelling_last_start[spelling_size_ - 1]); + if (score_this < node_current->score) + node_current->score = score_this; + } + } else { + node_current->first_son = NULL; + node_current->score = static_cast( + spelling_last_start[spelling_size_ - 1]); + } + + if (node_current->score < min_son_score) + min_son_score = node_current->score; + + assert(son_pos + 1 == num_of_son); + + bool is_half = false; + if (level == 0 && szm_is_enabled(char_for_node)) { + node_current->spelling_idx = static_cast(char_for_node - 'A' + 1); + + if (char_for_node > 'C') + node_current->spelling_idx++; + if (char_for_node > 'S') + node_current->spelling_idx++; + + h2f_num_[node_current->spelling_idx] = item_end - item_start_next; + is_half = true; + } else if (level == 1 && char_for_node == 'h') { + char ch_level0 = spelling_last_start[0]; + uint16 part_id = 0; + 
if (ch_level0 == 'C') + part_id = 'C' - 'A' + 1 + 1; + else if (ch_level0 == 'S') + part_id = 'S' - 'A' + 1 + 2; + else if (ch_level0 == 'Z') + part_id = 'Z' - 'A' + 1 + 3; + if (0 != part_id) { + node_current->spelling_idx = part_id; + h2f_num_[node_current->spelling_idx] = item_end - item_start_next; + is_half = true; + } + } + if (is_half) { + if (h2f_num_[node_current->spelling_idx] > 0) + h2f_start_[node_current->spelling_idx] = + item_start_next + kFullSplIdStart; + else + h2f_start_[node_current->spelling_idx] = 0; + } + + parent->num_of_son = num_of_son; + parent->score = min_son_score; + return first_son; +} + +bool SpellingTrie::save_spl_trie(FILE *fp) { + if (NULL == fp || NULL == spelling_buf_) + return false; + + if (fwrite(&spelling_size_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(&spelling_num_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fwrite(&score_amplifier_, sizeof(float), 1, fp) != 1) + return false; + + if (fwrite(&average_score_, sizeof(unsigned char), 1, fp) != 1) + return false; + + if (fwrite(spelling_buf_, sizeof(char) * spelling_size_, + spelling_num_, fp) != spelling_num_) + return false; + + return true; +} + +bool SpellingTrie::load_spl_trie(FILE *fp) { + if (NULL == fp) + return false; + + if (fread(&spelling_size_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fread(&spelling_num_, sizeof(uint32), 1, fp) != 1) + return false; + + if (fread(&score_amplifier_, sizeof(float), 1, fp) != 1) + return false; + + if (fread(&average_score_, sizeof(unsigned char), 1, fp) != 1) + return false; + + if (NULL != spelling_buf_) + delete [] spelling_buf_; + + spelling_buf_ = new char[spelling_size_ * spelling_num_]; + if (NULL == spelling_buf_) + return false; + + if (fread(spelling_buf_, sizeof(char) * spelling_size_, + spelling_num_, fp) != spelling_num_) + return false; + + return construct(spelling_buf_, spelling_size_, spelling_num_, + score_amplifier_, average_score_); +} + +bool SpellingTrie::build_f2h() { 
+ if (NULL != f2h_) + delete [] f2h_; + f2h_ = new uint16[spelling_num_]; + if (NULL == f2h_) + return false; + + for (uint16 hid = 0; hid < kFullSplIdStart; hid++) { + for (uint16 fid = h2f_start_[hid]; + fid < h2f_start_[hid] + h2f_num_[hid]; fid++) + f2h_[fid - kFullSplIdStart] = hid; + } + + return true; +} + +size_t SpellingTrie::get_spelling_num() { + return spelling_num_; +} + +uint8 SpellingTrie::get_ym_id(const char *ym_str) { + if (NULL == ym_str || NULL == ym_buf_) + return 0; + + for (uint8 pos = 0; pos < ym_num_; pos++) + if (strcmp(ym_buf_ + ym_size_ * pos, ym_str) == 0) + return pos + 1; + + return 0; +} + +const char* SpellingTrie::get_spelling_str(uint16 splid) { + splstr_queried_[0] = '\0'; + + if (splid >= kFullSplIdStart) { + splid -= kFullSplIdStart; + snprintf(splstr_queried_, spelling_size_, "%s", + spelling_buf_ + splid * spelling_size_); + } else { + if (splid == 'C' - 'A' + 1 + 1) { + snprintf(splstr_queried_, spelling_size_, "%s", "Ch"); + } else if (splid == 'S' - 'A' + 1 + 2) { + snprintf(splstr_queried_, spelling_size_, "%s", "Sh"); + } else if (splid == 'Z' - 'A' + 1 + 3) { + snprintf(splstr_queried_, spelling_size_, "%s", "Zh"); + } else { + if (splid > 'C' - 'A' + 1) + splid--; + if (splid > 'S' - 'A' + 1) + splid--; + splstr_queried_[0] = 'A' + splid - 1; + splstr_queried_[1] = '\0'; + } + } + return splstr_queried_; +} + +const char16* SpellingTrie::get_spelling_str16(uint16 splid) { + splstr16_queried_[0] = '\0'; + + if (splid >= kFullSplIdStart) { + splid -= kFullSplIdStart; + for (size_t pos = 0; pos < spelling_size_; pos++) { + splstr16_queried_[pos] = static_cast + (spelling_buf_[splid * spelling_size_ + pos]); + } + } else { + if (splid == 'C' - 'A' + 1 + 1) { + splstr16_queried_[0] = static_cast('C'); + splstr16_queried_[1] = static_cast('h'); + splstr16_queried_[2] = static_cast('\0'); + } else if (splid == 'S' - 'A' + 1 + 2) { + splstr16_queried_[0] = static_cast('S'); + splstr16_queried_[1] = static_cast('h'); + 
splstr16_queried_[2] = static_cast('\0'); + } else if (splid == 'Z' - 'A' + 1 + 3) { + splstr16_queried_[0] = static_cast('Z'); + splstr16_queried_[1] = static_cast('h'); + splstr16_queried_[2] = static_cast('\0'); + } else { + if (splid > 'C' - 'A' + 1) + splid--; + if (splid > 'S' - 'A' + 1) + splid--; + splstr16_queried_[0] = 'A' + splid - 1; + splstr16_queried_[1] = '\0'; + } + } + return splstr16_queried_; +} + +size_t SpellingTrie::get_spelling_str16(uint16 splid, char16 *splstr16, + size_t splstr16_len) { + if (NULL == splstr16 || splstr16_len < kMaxPinyinSize + 1) return 0; + + if (splid >= kFullSplIdStart) { + splid -= kFullSplIdStart; + for (size_t pos = 0; pos <= kMaxPinyinSize; pos++) { + splstr16[pos] = static_cast + (spelling_buf_[splid * spelling_size_ + pos]); + if (static_cast('\0') == splstr16[pos]) { + return pos; + } + } + } else { + if (splid == 'C' - 'A' + 1 + 1) { + splstr16[0] = static_cast('C'); + splstr16[1] = static_cast('h'); + splstr16[2] = static_cast('\0'); + return 2; + } else if (splid == 'S' - 'A' + 1 + 2) { + splstr16[0] = static_cast('S'); + splstr16[1] = static_cast('h'); + splstr16[2] = static_cast('\0'); + return 2; + } else if (splid == 'Z' - 'A' + 1 + 3) { + splstr16[0] = static_cast('Z'); + splstr16[1] = static_cast('h'); + splstr16[2] = static_cast('\0'); + return 2; + } else { + if (splid > 'C' - 'A' + 1) + splid--; + if (splid > 'S' - 'A' + 1) + splid--; + splstr16[0] = 'A' + splid - 1; + splstr16[1] = '\0'; + return 1; + } + } + + // Not reachable. + return 0; +} + +} // namespace ime_pinyin diff --git a/googlepinyin/spellingtrie.h b/googlepinyin/spellingtrie.h new file mode 100644 index 0000000..03510ed --- /dev/null +++ b/googlepinyin/spellingtrie.h @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__ +#define PINYINIME_INCLUDE_SPELLINGTRIE_H__ + +#include +#include +#include "./dictdef.h" + +namespace ime_pinyin { + +static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1; + +// Node used for the trie of spellings +struct SpellingNode { + SpellingNode *first_son; + // The spelling id for each node. If you need more bits to store + // spelling id, please adjust this structure. + uint16 spelling_idx:11; + uint16 num_of_son:5; + char char_this_node; + unsigned char score; +}; + +class SpellingTrie { + private: + static const int kMaxYmNum = 64; + static const size_t kValidSplCharNum = 26; + + static const uint16 kHalfIdShengmuMask = 0x01; + static const uint16 kHalfIdYunmuMask = 0x02; + static const uint16 kHalfIdSzmMask = 0x04; + + // Map from half spelling id to single char. + // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively. + // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ..., + // 28 to 'Z', 29 to 'z'. + // [0] is not used to achieve better efficiency. + static const char kHalfId2Sc_[kFullSplIdStart + 1]; + + static unsigned char char_flags_[]; + static SpellingTrie* instance_; + + // The spelling table + char *spelling_buf_; + + // The size of longest spelling string, includes '\0' and an extra char to + // store score. For example, "zhuang" is the longgest item in Pinyin list, + // so spelling_size_ is 8. + // Structure: The string ended with '\0' + score char. + // An item with a lower score has a higher probability. 
+ uint32 spelling_size_; + + // Number of full spelling ids. + uint32 spelling_num_; + + float score_amplifier_; + unsigned char average_score_; + + // The Yunmu id list for the spelling ids (for half ids of Shengmu, + // the Yunmu id is 0). + // The length of the list is spelling_num_ + kFullSplIdStart, + // so that spl_ym_ids_[splid] is the Yunmu id of the splid. + uint8 *spl_ym_ids_; + + // The Yunmu table. + // Each Yunmu will be assigned with Yunmu id from 1. + char *ym_buf_; + size_t ym_size_; // The size of longest Yunmu string, '\0'included. + size_t ym_num_; + + // The spelling string just queried + char *splstr_queried_; + + // The spelling string just queried + char16 *splstr16_queried_; + + // The root node of the spelling tree + SpellingNode* root_; + + // If a none qwerty key such as a fnction key like ENTER is given, this node + // will be used to indicate that this is not a QWERTY node. + SpellingNode* dumb_node_; + + // If a splitter key is pressed, this node will be used to indicate that this + // is a splitter key. + SpellingNode* splitter_node_; + + // Used to get the first level sons. + SpellingNode* level1_sons_[kValidSplCharNum]; + + // The full spl_id range for specific half id. + // h2f means half to full. + // A half id can be a ShouZiMu id (id to represent the first char of a full + // spelling, including Shengmu and Yunmu), or id of zh/ch/sh. + // [1..kFullSplIdStart-1] is the arrange of half id. + uint16 h2f_start_[kFullSplIdStart]; + uint16 h2f_num_[kFullSplIdStart]; + + // Map from full id to half id. + uint16 *f2h_; + +#ifdef ___BUILD_MODEL___ + // How many node used to build the trie. + size_t node_num_; +#endif + + SpellingTrie(); + + void free_son_trie(SpellingNode* node); + + // Construct a subtree using a subset of the spelling array (from + // item_star to item_end). + // Member spelliing_buf_ and spelling_size_ should be valid. + // parent is used to update its num_of_son and score. 
+ SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end, + size_t level, SpellingNode *parent); + bool build_f2h(); + + // The caller should guarantee ch >= 'A' && ch <= 'Z' + bool is_shengmu_char(char ch) const; + + // The caller should guarantee ch >= 'A' && ch <= 'Z' + bool is_yunmu_char(char ch) const; + +#ifdef ___BUILD_MODEL___ + // Given a spelling string, return its Yunmu string. + // The caller guaratees spl_str is valid. + const char* get_ym_str(const char *spl_str); + + // Build the Yunmu list, and the mapping relation between the full ids and the + // Yunmu ids. This functin is called after the spelling trie is built. + bool build_ym_info(); +#endif + + friend class SpellingParser; + friend class SmartSplParser; + friend class SmartSplParser2; + + public: + ~SpellingTrie(); + + inline static bool is_valid_spl_char(char ch) { + return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); + } + + // The caller guarantees that the two chars are valid spelling chars. + inline static bool is_same_spl_char(char ch1, char ch2) { + return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A'; + } + + // Construct the tree from the input pinyin array + // The given string list should have been sorted. + // score_amplifier is used to convert a possibility value into score. + // average_score is the average_score of all spellings. The dumb node is + // assigned with this score. + bool construct(const char* spelling_arr, size_t item_size, size_t item_num, + float score_amplifier, unsigned char average_score); + + // Test if the given id is a valid spelling id. + // If function returns true, the given splid may be updated like this: + // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is + // first given as a half id 1, but because 'A' is a one-char Yunmu and + // it is a valid id, it needs to updated to its corresponding full id. 
+ bool if_valid_id_update(uint16 *splid) const; + + // Test if the given id is a half id. + bool is_half_id(uint16 splid) const; + + bool is_full_id(uint16 splid) const; + + // Test if the given id is a one-char Yunmu id (obviously, it is also a half + // id), such as 'A', 'E' and 'O'. + bool is_half_id_yunmu(uint16 splid) const; + + // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled. + // For Pinyin, only i/u/v is not a ShouZiMu char. + // The caller should guarantee that ch >= 'A' && ch <= 'Z' + bool is_szm_char(char ch) const; + + // Test If this char is enabled in ShouZiMu mode. + // The caller should guarantee that ch >= 'A' && ch <= 'Z' + bool szm_is_enabled(char ch) const; + + // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling + // to input). + void szm_enable_shm(bool enable); + + // Enable/disable Yunmus in ShouZiMu mode. + void szm_enable_ym(bool enable); + + // Test if this char is enabled in ShouZiMu mode. + // The caller should guarantee ch >= 'A' && ch <= 'Z' + bool is_szm_enabled(char ch) const; + + // Return the number of full ids for the given half id. + uint16 half2full_num(uint16 half_id) const; + + // Return the number of full ids for the given half id, and fill spl_id_start + // to return the first full id. + uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const; + + // Return the corresponding half id for the given full id. + // Not frequently used, low efficient. + // Return 0 if fails. + uint16 full_to_half(uint16 full_id) const; + + // To test whether a half id is compatible with a full id. + // Generally, when half_id == full_to_half(full_id), return true. + // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible + // with a full id like "Zhe". (Fussy mode is not ready). 
+ bool half_full_compatible(uint16 half_id, uint16 full_id) const; + + static const SpellingTrie* get_cpinstance(); + + static SpellingTrie& get_instance(); + + // Save to the file stream + bool save_spl_trie(FILE *fp); + + // Load from the file stream + bool load_spl_trie(FILE *fp); + + // Get the number of spellings + size_t get_spelling_num(); + + // Return the Yunmu id for the given Yunmu string. + // If the string is not valid, return 0; + uint8 get_ym_id(const char* ym_str); + + // Get the readonly Pinyin string for a given spelling id + const char* get_spelling_str(uint16 splid); + + // Get the readonly Pinyin string for a given spelling id + const char16* get_spelling_str16(uint16 splid); + + // Get Pinyin string for a given spelling id. Return the length of the + // string, and fill-in '\0' at the end. + size_t get_spelling_str16(uint16 splid, char16 *splstr16, + size_t splstr16_len); +}; +} + +#endif // PINYINIME_INCLUDE_SPELLINGTRIE_H__ diff --git a/googlepinyin/splparser.cpp b/googlepinyin/splparser.cpp new file mode 100644 index 0000000..a547ffd --- /dev/null +++ b/googlepinyin/splparser.cpp @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "splparser.h" + +namespace ime_pinyin { + +SpellingParser::SpellingParser() { + spl_trie_ = SpellingTrie::get_cpinstance(); +} + +bool SpellingParser::is_valid_to_parse(char ch) { + return SpellingTrie::is_valid_spl_char(ch); +} + +uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + if (NULL == splstr || 0 == max_size || 0 == str_len) + return 0; + + if (!SpellingTrie::is_valid_spl_char(splstr[0])) + return 0; + + last_is_pre = false; + + const SpellingNode *node_this = spl_trie_->root_; + + uint16 str_pos = 0; + uint16 idx_num = 0; + if (NULL != start_pos) + start_pos[0] = 0; + bool last_is_splitter = false; + + while (str_pos < str_len) { + char char_this = splstr[str_pos]; + // all characters outside of [a, z] are considered as splitters + if (!SpellingTrie::is_valid_spl_char(char_this)) { + // test if the current node is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + spl_idx[idx_num] = id_this; + + idx_num++; + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + + node_this = spl_trie_->root_; + last_is_splitter = true; + continue; + } else { + if (last_is_splitter) { + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + continue; + } else { + return idx_num; + } + } + } + + last_is_splitter = false; + + SpellingNode *found_son = NULL; + + if (0 == str_pos) { + if (char_this >= 'a') + found_son = spl_trie_->level1_sons_[char_this - 'a']; + else + found_son = spl_trie_->level1_sons_[char_this - 'A']; + } else { + SpellingNode *first_son = node_this->first_son; + // Because for Zh/Ch/Sh nodes, they are the last in the buffer and + // frequently used, so we scan from the end. 
+ for (int i = 0; i < node_this->num_of_son; i++) { + SpellingNode *this_son = first_son + i; + if (SpellingTrie::is_same_spl_char( + this_son->char_this_node, char_this)) { + found_son = this_son; + break; + } + } + } + + // found, just move the current node pointer to the the son + if (NULL != found_son) { + node_this = found_son; + } else { + // not found, test if it is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + node_this = spl_trie_->root_; + continue; + } else { + return idx_num; + } + } + + str_pos++; + } + + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + } + + last_is_pre = !last_is_splitter; + + return idx_num; +} + +uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos, + max_size, last_is_pre); + for (uint16 pos = 0; pos < idx_num; pos++) { + if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { + spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); + if (pos == idx_num - 1) { + last_is_pre = false; + } + } + } + return idx_num; +} + +uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + if (NULL == splstr || 0 == max_size || 0 == str_len) + return 0; + + if (!SpellingTrie::is_valid_spl_char(splstr[0])) + return 0; + + last_is_pre = false; + + const SpellingNode *node_this = spl_trie_->root_; + + uint16 str_pos = 0; + uint16 idx_num = 0; + if (NULL != start_pos) + 
start_pos[0] = 0; + bool last_is_splitter = false; + + while (str_pos < str_len) { + char16 char_this = splstr[str_pos]; + // all characters outside of [a, z] are considered as splitters + if (!SpellingTrie::is_valid_spl_char(char_this)) { + // test if the current node is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + spl_idx[idx_num] = id_this; + + idx_num++; + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + + node_this = spl_trie_->root_; + last_is_splitter = true; + continue; + } else { + if (last_is_splitter) { + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + continue; + } else { + return idx_num; + } + } + } + + last_is_splitter = false; + + SpellingNode *found_son = NULL; + + if (0 == str_pos) { + if (char_this >= 'a') + found_son = spl_trie_->level1_sons_[char_this - 'a']; + else + found_son = spl_trie_->level1_sons_[char_this - 'A']; + } else { + SpellingNode *first_son = node_this->first_son; + // Because for Zh/Ch/Sh nodes, they are the last in the buffer and + // frequently used, so we scan from the end. 
+ for (int i = 0; i < node_this->num_of_son; i++) { + SpellingNode *this_son = first_son + i; + if (SpellingTrie::is_same_spl_char( + this_son->char_this_node, char_this)) { + found_son = this_son; + break; + } + } + } + + // found, just move the current node pointer to the the son + if (NULL != found_son) { + node_this = found_son; + } else { + // not found, test if it is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + node_this = spl_trie_->root_; + continue; + } else { + return idx_num; + } + } + + str_pos++; + } + + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + } + + last_is_pre = !last_is_splitter; + + return idx_num; +} + +uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos, + max_size, last_is_pre); + for (uint16 pos = 0; pos < idx_num; pos++) { + if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { + spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); + if (pos == idx_num - 1) { + last_is_pre = false; + } + } + } + return idx_num; +} + +uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len, + bool *is_pre) { + if (NULL == is_pre) + return 0; + + uint16 spl_idx[2]; + uint16 start_pos[3]; + + if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) + return 0; + + if (start_pos[1] != str_len) + return 0; + return spl_idx[0]; +} + +uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len, + bool *is_pre) { + if (NULL == 
is_pre) + return 0; + + uint16 spl_idx[2]; + uint16 start_pos[3]; + + if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) + return 0; + + if (start_pos[1] != str_len) + return 0; + if (spl_trie_->is_half_id_yunmu(spl_idx[0])) { + spl_trie_->half_to_full(spl_idx[0], spl_idx); + *is_pre = false; + } + + return spl_idx[0]; +} + +uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len, + uint16 splidx[], uint16 max_size, + uint16 &full_id_num, bool &is_pre) { + if (max_size <= 0 || !is_valid_to_parse(splstr[0])) + return 0; + + splidx[0] = get_splid_by_str(splstr, str_len, &is_pre); + full_id_num = 0; + if (0 != splidx[0]) { + if (splidx[0] >= kFullSplIdStart) + full_id_num = 1; + return 1; + } + return 0; +} + +} // namespace ime_pinyin diff --git a/googlepinyin/splparser.h b/googlepinyin/splparser.h new file mode 100644 index 0000000..d783bd7 --- /dev/null +++ b/googlepinyin/splparser.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_SPLPARSER_H__ +#define PINYINIME_INCLUDE_SPLPARSER_H__ + +#include "./dictdef.h" +#include "./spellingtrie.h" + +namespace ime_pinyin { + +class SpellingParser { + protected: + const SpellingTrie *spl_trie_; + + public: + SpellingParser(); + + // Given a string, parse it into a spelling id stream. 
+ // If the whole string are sucessfully parsed, last_is_pre will be true; + // if the whole string is not fullly parsed, last_is_pre will return whether + // the last part of the string is a prefix of a full spelling string. For + // example, given string "zhengzhon", "zhon" is not a valid speling, but it is + // the prefix of "zhong". + // + // If splstr starts with a character not in ['a'-z'] (it is a split char), + // return 0. + // Split char can only appear in the middle of the string or at the end. + uint16 splstr_to_idxs(const char *splstr, uint16 str_len, uint16 splidx[], + uint16 start_pos[], uint16 max_size, bool &last_is_pre); + + // Similar to splstr_to_idxs(), the only difference is that splstr_to_idxs() + // convert single-character Yunmus into half ids, while this function converts + // them into full ids. + uint16 splstr_to_idxs_f(const char *splstr, uint16 str_len, uint16 splidx[], + uint16 start_pos[], uint16 max_size, bool &last_is_pre); + + // Similar to splstr_to_idxs(), the only difference is that this function + // uses char16 instead of char8. + uint16 splstr16_to_idxs(const char16 *splstr, uint16 str_len, uint16 splidx[], + uint16 start_pos[], uint16 max_size, bool &last_is_pre); + + // Similar to splstr_to_idxs_f(), the only difference is that this function + // uses char16 instead of char8. + uint16 splstr16_to_idxs_f(const char16 *splstr16, uint16 str_len, + uint16 splidx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre); + + // If the given string is a spelling, return the id, others, return 0. + // If the give string is a single char Yunmus like "A", and the char is + // enabled in ShouZiMu mode, the returned spelling id will be a half id. + // When the returned spelling id is a half id, *is_pre returns whether it + // is a prefix of a full spelling string. + uint16 get_splid_by_str(const char *splstr, uint16 str_len, bool *is_pre); + + // If the given string is a spelling, return the id, others, return 0. 
+ // If the give string is a single char Yunmus like "a", no matter the char + // is enabled in ShouZiMu mode or not, the returned spelling id will be + // a full id. + // When the returned spelling id is a half id, *p_is_pre returns whether it + // is a prefix of a full spelling string. + uint16 get_splid_by_str_f(const char *splstr, uint16 str_len, bool *is_pre); + + // Splitter chars are not included. + bool is_valid_to_parse(char ch); + + // When auto-correction is not enabled, get_splid_by_str() will be called to + // return the single result. When auto-correction is enabled, this function + // will be called to get the results. Auto-correction is not ready. + // full_id_num returns number of full spelling ids. + // is_pre returns whether the given string is the prefix of a full spelling + // string. + // If splstr starts with a character not in [a-zA-Z] (it is a split char), + // return 0. + // Split char can only appear in the middle of the string or at the end. + // The caller should guarantee NULL != splstr && str_len > 0 && NULL != splidx + uint16 get_splids_parallel(const char *splstr, uint16 str_len, + uint16 splidx[], uint16 max_size, + uint16 &full_id_num, bool &is_pre); +}; +} + +#endif // PINYINIME_INCLUDE_SPLPARSER_H__ diff --git a/googlepinyin/sync.cpp b/googlepinyin/sync.cpp new file mode 100644 index 0000000..db042ef --- /dev/null +++ b/googlepinyin/sync.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sync.h" +#include +#include + +#ifdef ___SYNC_ENABLED___ + +namespace ime_pinyin { + +Sync::Sync() + : userdict_(NULL), + dictfile_(NULL), + last_count_(0) { +}; + +Sync::~Sync() { +} + + +bool Sync::begin(const char * filename) { + if (userdict_) { + finish(); + } + + if (!filename) { + return false; + } + + dictfile_ = strdup(filename); + if (!dictfile_) { + return false; + } + + userdict_ = new UserDict(); + if (!userdict_) { + free(dictfile_); + dictfile_ = NULL; + return false; + } + + if (userdict_->load_dict((const char*)dictfile_, kUserDictIdStart, + kUserDictIdEnd) == false) { + delete userdict_; + userdict_ = NULL; + free(dictfile_); + dictfile_ = NULL; + return false; + } + + userdict_->set_limit(kUserDictMaxLemmaCount, kUserDictMaxLemmaSize, kUserDictRatio); + + return true; +} + +int Sync::put_lemmas(char16 * lemmas, int len) { + return userdict_->put_lemmas_no_sync_from_utf16le_string(lemmas, len); +} + +int Sync::get_lemmas(char16 * str, int size) { + return userdict_->get_sync_lemmas_in_utf16le_string_from_beginning(str, size, &last_count_); +} + +int Sync::get_last_got_count() { + return last_count_; +} + +int Sync::get_total_count() { + return userdict_->get_sync_count(); +} + +void Sync::clear_last_got() { + if (last_count_ < 0) { + return; + } + userdict_->clear_sync_lemmas(0, last_count_); + last_count_ = 0; +} + +void Sync::finish() { + if (userdict_) { + userdict_->close_dict(); + delete userdict_; + userdict_ = NULL; + free(dictfile_); + dictfile_ = NULL; + last_count_ = 0; + } +} + +int Sync::get_capacity() { + UserDict::UserDictStat stat; + userdict_->state(&stat); + return stat.limit_lemma_count - stat.lemma_count; +} + +} +#endif diff --git a/googlepinyin/sync.h b/googlepinyin/sync.h new file mode 100644 index 0000000..bf42d1f --- /dev/null +++ b/googlepinyin/sync.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2009 The 
Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_SYNC_H__ +#define PINYINIME_INCLUDE_SYNC_H__ + +#define ___SYNC_ENABLED___ + +#ifdef ___SYNC_ENABLED___ + +#include "userdict.h" + +namespace ime_pinyin { + +// Class for user dictionary synchronization +// This class is not thread safe +// Normal invoking flow will be +// begin() -> +// put_lemmas() x N -> +// { +// get_lemmas() -> +// [ get_last_got_count() ] -> +// clear_last_got() -> +// } x N -> +// finish() +class Sync { + public: + Sync(); + ~Sync(); + + static const int kUserDictMaxLemmaCount = 5000; + static const int kUserDictMaxLemmaSize = 200000; + static const int kUserDictRatio = 20; + + bool begin(const char * filename); + + // Merge lemmas downloaded from sync server into local dictionary + // lemmas, lemmas string encoded in UTF16LE + // len, length of lemmas string + // Return how many lemmas merged successfully + int put_lemmas(char16 * lemmas, int len); + + // Get local new user lemmas into UTF16LE string + // str, buffer ptr to store new user lemmas + // size, size of buffer + // Return length of returned buffer in measure of UTF16LE + int get_lemmas(char16 * str, int size); + + // Return lemmas count in last get_lemmas() + int get_last_got_count(); + + // Return total lemmas count need get_lemmas() + int get_total_count(); + + // Clear lemmas got by recent get_lemmas() + void clear_last_got(); + + void finish(); + + int 
get_capacity(); + + private: + UserDict * userdict_; + char * dictfile_; + int last_count_; +}; + +} + +#endif + +#endif // PINYINIME_INCLUDE_SYNC_H__ diff --git a/googlepinyin/userdict.cpp b/googlepinyin/userdict.cpp new file mode 100644 index 0000000..9dbb1e4 --- /dev/null +++ b/googlepinyin/userdict.cpp @@ -0,0 +1,2290 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "userdict.h" +#include "splparser.h" +#include "ngram.h" +#include +#include +#include +#ifdef ___DEBUG_PERF___ +#include +#endif +#ifdef _WIN32 +#include +#else +#include +#endif +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#endif +#include +#ifdef _WIN32 +#undef max +#undef min +#include +#include +#else +#include +#endif +#include + +namespace ime_pinyin { + +#ifdef _WIN32 +static int gettimeofday(struct timeval *tp, void *) { + const qint64 current_msecs_since_epoch = QDateTime::currentMSecsSinceEpoch(); + tp->tv_sec = (long)(current_msecs_since_epoch / 1000); + tp->tv_usec = (long)((current_msecs_since_epoch % 1000) * 1000); + return 0; +} +#endif + +#ifdef ___DEBUG_PERF___ +static uint64 _ellapse_ = 0; +static struct timeval _tv_start_, _tv_end_; +#define DEBUG_PERF_BEGIN \ + do { \ + gettimeofday(&_tv_start_, NULL); \ + } while (0) +#define DEBUG_PERF_END \ + do { \ + gettimeofday(&_tv_end_, NULL); \ + _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \ + 
(_tv_end_.tv_usec - _tv_start_.tv_usec); \ + } while (0) +#define LOGD_PERF(message) \ + ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_); +#else +#define DEBUG_PERF_BEGIN +#define DEBUG_PERF_END +#define LOGD_PERF(message) +#endif + +// XXX File load and write are thread-safe by g_mutex_ +#ifdef _WIN32 +static QMutex g_mutex_; +#define pthread_mutex_lock(MUTEX) ((MUTEX)->lock()) +#define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock()) +#define pthread_mutex_trylock(MUTEX) (!(MUTEX)->tryLock(0)) +#else +static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER; +#endif +static struct timeval g_last_update_ = {0, 0}; + +inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) { + return (4 + info->lemma_size + (info->lemma_count << 3) +#ifdef ___PREDICT_ENABLED___ + + (info->lemma_count << 2) +#endif +#ifdef ___SYNC_ENABLED___ + + (info->sync_count << 2) +#endif + + sizeof(*info)); +} + +inline LmaScoreType UserDict::translate_score(int raw_score) { + // 1) ori_freq: original user frequency + uint32 ori_freq = extract_score_freq(raw_score); + // 2) lmt_off: lmt index (week offset for example) + uint64 lmt_off = ((raw_score & 0xffff0000) >> 16); + if (kUserDictLMTBitWidth < 16) { + uint64 mask = ~(1 << kUserDictLMTBitWidth); + lmt_off &= mask; + } + // 3) now_off: current time index (current week offset for example) + // assuming load_time_ is around current time + uint64 now_off = load_time_.tv_sec; + now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity; + now_off = (now_off << (64 - kUserDictLMTBitWidth)); + now_off = (now_off >> (64 - kUserDictLMTBitWidth)); + // 4) factor: decide expand-factor + int delta = now_off - lmt_off; + if (delta > 4) + delta = 4; + int factor = 80 - (delta << 4); + + double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_); + return (LmaScoreType)(log((double)factor * (double)ori_freq / tf) + * NGram::kLogValueAmplifier); +} + +inline int UserDict::extract_score_freq(int raw_score) { + // Frequence 
stored in lowest 16 bits + int freq = (raw_score & 0x0000ffff); + return freq; +} + +inline uint64 UserDict::extract_score_lmt(int raw_score) { + uint64 lmt = ((raw_score & 0xffff0000) >> 16); + if (kUserDictLMTBitWidth < 16) { + uint64 mask = ~(1 << kUserDictLMTBitWidth); + lmt &= mask; + } + lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince; + return lmt; +} + +inline int UserDict::build_score(uint64 lmt, int freq) { + lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity; + lmt = (lmt << (64 - kUserDictLMTBitWidth)); + lmt = (lmt >> (64 - kUserDictLMTBitWidth)); + uint16 lmt16 = (uint16)lmt; + int s = freq; + s &= 0x0000ffff; + s = (lmt16 << 16) | s; + return s; +} + +inline int64 UserDict::utf16le_atoll(uint16 *s, int len) { + int64 ret = 0; + if (len <= 0) + return ret; + + int flag = 1; + const uint16 * endp = s + len; + if (*s == '-') { + flag = -1; + s++; + } else if (*s == '+') { + s++; + } + + while (*s >= '0' && *s <= '9' && s < endp) { + ret += ret * 10 + (*s) - '0'; + s++; + } + return ret * flag; +} + +inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) { + if (!s || size <= 0) + return 0; + uint16 *endp = s + size; + int ret_len = 0; + if (v < 0) { + *(s++) = '-'; + ++ret_len; + v *= -1; + } + + uint16 *b = s; + while (s < endp && v != 0) { + *(s++) = '0' + (v % 10); + v = v / 10; + ++ret_len; + } + + if (v != 0) + return 0; + + --s; + + while (b < s) { + *b = *s; + ++b, --s; + } + + return ret_len; +} + +inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) { + offset &= kUserDictOffsetMask; + lemmas_[offset] |= flag; +} + +inline char UserDict::get_lemma_flag(uint32 offset) { + offset &= kUserDictOffsetMask; + return (char)(lemmas_[offset]); +} + +inline char UserDict::get_lemma_nchar(uint32 offset) { + offset &= kUserDictOffsetMask; + return (char)(lemmas_[offset + 1]); +} + +inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) { + offset &= kUserDictOffsetMask; + return (uint16 *)(lemmas_ + offset + 2); +} 
+ +inline uint16 * UserDict::get_lemma_word(uint32 offset) { + offset &= kUserDictOffsetMask; + uint8 nchar = get_lemma_nchar(offset); + return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1)); +} + +inline LemmaIdType UserDict::get_max_lemma_id() { + // When a lemma is deleted, we don't not claim its id back for + // simplicity and performance + return start_id_ + dict_info_.lemma_count - 1; +} + +inline bool UserDict::is_valid_lemma_id(LemmaIdType id) { + if (id >= start_id_ && id <= get_max_lemma_id()) + return true; + return false; +} + +inline bool UserDict::is_valid_state() { + if (state_ == USER_DICT_NONE) + return false; + return true; +} + +UserDict::UserDict() + : start_id_(0), + version_(0), + lemmas_(NULL), + offsets_(NULL), + scores_(NULL), + ids_(NULL), +#ifdef ___PREDICT_ENABLED___ + predicts_(NULL), +#endif +#ifdef ___SYNC_ENABLED___ + syncs_(NULL), + sync_count_size_(0), +#endif + offsets_by_id_(NULL), + lemma_count_left_(0), + lemma_size_left_(0), + dict_file_(NULL), + state_(USER_DICT_NONE) { + memset(&dict_info_, 0, sizeof(dict_info_)); + memset(&load_time_, 0, sizeof(load_time_)); +#ifdef ___CACHE_ENABLED___ + cache_init(); +#endif +} + +UserDict::~UserDict() { + close_dict(); +} + +bool UserDict::load_dict(const char *file_name, LemmaIdType start_id, + LemmaIdType end_id) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + dict_file_ = strdup(file_name); + if (!dict_file_) + return false; + + start_id_ = start_id; + + if (false == validate(file_name) && false == reset(file_name)) { + goto error; + } + if (false == load(file_name, start_id)) { + goto error; + } + + state_ = USER_DICT_SYNC; + + gettimeofday(&load_time_, NULL); + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("load_dict"); +#endif + return true; + error: + free((void*)dict_file_); + dict_file_ = NULL; + start_id_ = 0; + return false; +} + +bool UserDict::close_dict() { + if (state_ == USER_DICT_NONE) + return true; + if (state_ == USER_DICT_SYNC) + goto out; + + // If 
dictionary is written back by others, + // we can not simply write back here + // To do a safe flush, we have to discard all newly added + // lemmas and try to reload dict file. + pthread_mutex_lock(&g_mutex_); + if (load_time_.tv_sec > g_last_update_.tv_sec || + (load_time_.tv_sec == g_last_update_.tv_sec && + load_time_.tv_usec > g_last_update_.tv_usec)) { + write_back(); + gettimeofday(&g_last_update_, NULL); + } + pthread_mutex_unlock(&g_mutex_); + + out: + free((void*)dict_file_); + free(lemmas_); + free(offsets_); + free(offsets_by_id_); + free(scores_); + free(ids_); +#ifdef ___PREDICT_ENABLED___ + free(predicts_); +#endif + + version_ = 0; + dict_file_ = NULL; + lemmas_ = NULL; +#ifdef ___SYNC_ENABLED___ + syncs_ = NULL; + sync_count_size_ = 0; +#endif + offsets_ = NULL; + offsets_by_id_ = NULL; + scores_ = NULL; + ids_ = NULL; +#ifdef ___PREDICT_ENABLED___ + predicts_ = NULL; +#endif + + memset(&dict_info_, 0, sizeof(dict_info_)); + lemma_count_left_ = 0; + lemma_size_left_ = 0; + state_ = USER_DICT_NONE; + + return true; +} + +size_t UserDict::number_of_lemmas() { + return dict_info_.lemma_count; +} + +void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { + return; +} + +MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle, + const DictExtPara *dep, + LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num) { + if (is_valid_state() == false) + return 0; + + bool need_extend = false; + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1, + lpi_items, lpi_max, &need_extend); +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("extend_dict"); +#endif + return ((*lpi_num > 0 || need_extend) ? 
1 : 0); +} + +int UserDict::is_fuzzy_prefix_spell_id( + const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { + if (len1 < searchable->splids_len) + return 0; + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + uint32 i = 0; + for (i = 0; i < searchable->splids_len; i++) { + const char py1 = *spl_trie.get_spelling_str(id1[i]); + uint16 off = 8 * (i % 4); + const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); + if (py1 == py2) + continue; + return 0; + } + return 1; +} + +int UserDict::fuzzy_compare_spell_id( + const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { + if (len1 < searchable->splids_len) + return -1; + if (len1 > searchable->splids_len) + return 1; + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + uint32 i = 0; + for (i = 0; i < len1; i++) { + const char py1 = *spl_trie.get_spelling_str(id1[i]); + uint16 off = 8 * (i % 4); + const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); + if (py1 == py2) + continue; + if (py1 > py2) + return 1; + return -1; + } + return 0; +} + +bool UserDict::is_prefix_spell_id( + const uint16 * fullids, uint16 fulllen, + const UserDictSearchable *searchable) { + if (fulllen < searchable->splids_len) + return false; + + uint32 i = 0; + for (; i < searchable->splids_len; i++) { + uint16 start_id = searchable->splid_start[i]; + uint16 count = searchable->splid_count[i]; + if (fullids[i] >= start_id && fullids[i] < start_id + count) + continue; + else + return false; + } + return true; +} + +bool UserDict::equal_spell_id( + const uint16 * fullids, uint16 fulllen, + const UserDictSearchable *searchable) { + if (fulllen != searchable->splids_len) + return false; + + uint32 i = 0; + for (; i < fulllen; i++) { + uint16 start_id = searchable->splid_start[i]; + uint16 count = searchable->splid_count[i]; + if (fullids[i] >= start_id && fullids[i] < start_id + count) + continue; + else + return false; + } + return true; +} + +int32 
UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) { + int32 begin = 0; + int32 end = dict_info_.lemma_count - 1; + int32 middle = -1; + + int32 first_prefix = middle; + int32 last_matched = middle; + + while (begin <= end) { + middle = (begin + end) >> 1; + uint32 offset = offsets_[middle]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 * splids = get_lemma_spell_ids(offset); + int cmp = fuzzy_compare_spell_id(splids, nchar, searchable); + int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable); + + if (pre) + first_prefix = middle; + + if (cmp < 0) { + begin = middle + 1; + } else if (cmp > 0) { + end = middle - 1; + } else { + end = middle - 1; + last_matched = middle; + } + } + + return first_prefix; +} + +void UserDict::prepare_locate(UserDictSearchable *searchable, + const uint16 *splid_str, + uint16 splid_str_len) { + searchable->splids_len = splid_str_len; + memset(searchable->signature, 0, sizeof(searchable->signature)); + + SpellingTrie &spl_trie = SpellingTrie::get_instance(); + uint32 i = 0; + for (; i < splid_str_len; i++) { + if (spl_trie.is_half_id(splid_str[i])) { + searchable->splid_count[i] = + spl_trie.half_to_full(splid_str[i], + &(searchable->splid_start[i])); + } else { + searchable->splid_count[i] = 1; + searchable->splid_start[i] = splid_str[i]; + } + const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]); + searchable->signature[i>>2] |= (py << (8 * (i % 4))); + } +} + +size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len, + LmaPsbItem *lpi_items, size_t lpi_max) { + return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL); +} + +size_t UserDict::_get_lpis(const uint16 *splid_str, + uint16 splid_str_len, LmaPsbItem *lpi_items, + size_t lpi_max, bool * need_extend) { + bool tmp_extend; + if (!need_extend) + need_extend = &tmp_extend; + + *need_extend = false; + + if (is_valid_state() == false) + return 0; + if (lpi_max <= 0) + return 0; + + if (0 == 
pthread_mutex_trylock(&g_mutex_)) { + if (load_time_.tv_sec < g_last_update_.tv_sec || + (load_time_.tv_sec == g_last_update_.tv_sec && + load_time_.tv_usec < g_last_update_.tv_usec)) { + // Others updated disk file, have to reload + pthread_mutex_unlock(&g_mutex_); + flush_cache(); + } else { + pthread_mutex_unlock(&g_mutex_); + } + } else { + } + + UserDictSearchable searchable; + prepare_locate(&searchable, splid_str, splid_str_len); + + uint32 max_off = dict_info_.lemma_count; +#ifdef ___CACHE_ENABLED___ + int32 middle; + uint32 start, count; + bool cached = cache_hit(&searchable, &start, &count); + if (cached) { + middle = start; + max_off = start + count; + } else { + middle = locate_first_in_offsets(&searchable); + start = middle; + } +#else + int32 middle = locate_first_in_offsets(&searchable); +#endif + + if (middle == -1) { +#ifdef ___CACHE_ENABLED___ + if (!cached) + cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0); +#endif + return 0; + } + + size_t lpi_current = 0; + + bool fuzzy_break = false; + bool prefix_break = false; + while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) { + if (lpi_current >= lpi_max) + break; + uint32 offset = offsets_[middle]; + // Ignore deleted lemmas + if (offset & kUserDictOffsetFlagRemove) { + middle++; + continue; + } + uint8 nchar = get_lemma_nchar(offset); + uint16 * splids = get_lemma_spell_ids(offset); +#ifdef ___CACHE_ENABLED___ + if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { +#else + if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { +#endif + fuzzy_break = true; + } + + if (prefix_break == false) { + if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) { + if (*need_extend == false && + is_prefix_spell_id(splids, nchar, &searchable)) { + *need_extend = true; + } + } else { + prefix_break = true; + } + } + + if (equal_spell_id(splids, nchar, &searchable) == true) { + lpi_items[lpi_current].psb = translate_score(scores_[middle]); + 
lpi_items[lpi_current].id = ids_[middle]; + lpi_items[lpi_current].lma_len = nchar; + lpi_current++; + } + middle++; + } + +#ifdef ___CACHE_ENABLED___ + if (!cached) { + count = middle - start; + cache_push(USER_DICT_CACHE, &searchable, start, count); + } +#endif + + return lpi_current; +} + +uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf, + uint16 str_max) { + if (is_valid_state() == false) + return 0; + if (is_valid_lemma_id(id_lemma) == false) + return 0; + uint32 offset = offsets_by_id_[id_lemma - start_id_]; + uint8 nchar = get_lemma_nchar(offset); + char16 * str = get_lemma_word(offset); + uint16 m = nchar < str_max -1 ? nchar : str_max - 1; + int i = 0; + for (; i < m; i++) { + str_buf[i] = str[i]; + } + str_buf[i] = 0; + return m; +} + +uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, + uint16 splids_max, bool arg_valid) { + if (is_valid_lemma_id(id_lemma) == false) + return 0; + uint32 offset = offsets_by_id_[id_lemma - start_id_]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 * ids = get_lemma_spell_ids(offset); + int i = 0; + for (; i < nchar && i < splids_max; i++) + splids[i] = ids[i]; + return i; +} + +size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len, + NPredictItem *npre_items, size_t npre_max, + size_t b4_used) { + uint32 new_added = 0; +#ifdef ___PREDICT_ENABLED___ + int32 end = dict_info_.lemma_count - 1; + int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len); + if (j == -1) + return 0; + + while (j <= end) { + uint32 offset = predicts_[j]; + // Ignore deleted lemmas + if (offset & kUserDictOffsetFlagRemove) { + j++; + continue; + } + uint32 nchar = get_lemma_nchar(offset); + uint16 * words = get_lemma_word(offset); + uint16 * splids = get_lemma_spell_ids(offset); + + if (nchar <= hzs_len) { + j++; + continue; + } + + if (memcmp(words, last_hzs, hzs_len << 1) == 0) { + if (new_added >= npre_max) { + return new_added; + } + uint32 cpy_len = + (nchar < 
kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1)) + - (hzs_len << 1); + npre_items[new_added].his_len = hzs_len; + npre_items[new_added].psb = get_lemma_score(words, splids, nchar); + memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len); + if ((cpy_len >> 1) < kMaxPredictSize) { + npre_items[new_added].pre_hzs[cpy_len >> 1] = 0; + } + new_added++; + } else { + break; + } + + j++; + } +#endif + return new_added; +} + +int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[], + uint16 lemma_len) { + int32 max_off = dict_info_.lemma_count; + + UserDictSearchable searchable; + prepare_locate(&searchable, splid_str, lemma_len); +#ifdef ___CACHE_ENABLED___ + int32 off; + uint32 start, count; + bool cached = load_cache(&searchable, &start, &count); + if (cached) { + off = start; + max_off = start + count; + } else { + off = locate_first_in_offsets(&searchable); + start = off; + } +#else + int32 off = locate_first_in_offsets(&searchable); +#endif + + if (off == -1) { + return off; + } + + while (off < max_off) { + uint32 offset = offsets_[off]; + if (offset & kUserDictOffsetFlagRemove) { + off++; + continue; + } + uint16 * splids = get_lemma_spell_ids(offset); +#ifdef ___CACHE_ENABLED___ + if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) + break; +#else + if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) + break; +#endif + if (equal_spell_id(splids, lemma_len, &searchable) == true) { + uint16 * str = get_lemma_word(offset); + uint32 i = 0; + for (i = 0; i < lemma_len; i++) { + if (str[i] == lemma_str[i]) + continue; + break; + } + if (i < lemma_len) { + off++; + continue; + } +#ifdef ___CACHE_ENABLED___ + // No need to save_cache here, since current function is invoked by + // put_lemma. It's rarely possible for a user input same lemma twice. + // That means first time user type a new lemma, it is newly added into + // user dictionary, then it's possible that user type the same lemma + // again. 
+ // Another reason save_cache can not be invoked here is this function + // aborts when lemma is found, and it never knows the count. +#endif + return off; + } + off++; + } + + return -1; +} + +#ifdef ___PREDICT_ENABLED___ +uint32 UserDict::locate_where_to_insert_in_predicts( + const uint16 * words, int lemma_len) { + int32 begin = 0; + int32 end = dict_info_.lemma_count - 1; + int32 middle = end; + + uint32 last_matched = middle; + + while (begin <= end) { + middle = (begin + end) >> 1; + uint32 offset = offsets_[middle]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 * ws = get_lemma_word(offset); + + uint32 minl = nchar < lemma_len ? nchar : lemma_len; + uint32 k = 0; + int cmp = 0; + + for (; k < minl; k++) { + if (ws[k] < words[k]) { + cmp = -1; + break; + } else if (ws[k] > words[k]) { + cmp = 1; + break; + } + } + if (cmp == 0) { + if (nchar < lemma_len) + cmp = -1; + else if (nchar > lemma_len) + cmp = 1; + } + + if (cmp < 0) { + begin = middle + 1; + last_matched = middle; + } else if (cmp > 0) { + end = middle - 1; + } else { + end = middle - 1; + last_matched = middle; + } + } + + return last_matched; +} + +int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) { + int32 begin = 0; + int32 end = dict_info_.lemma_count - 1; + int32 middle = -1; + + int32 last_matched = middle; + + while (begin <= end) { + middle = (begin + end) >> 1; + uint32 offset = offsets_[middle]; + uint8 nchar = get_lemma_nchar(offset); + const uint16 * ws = get_lemma_word(offset); + + uint32 minl = nchar < lemma_len ? 
nchar : lemma_len; + uint32 k = 0; + int cmp = 0; + + for (; k < minl; k++) { + if (ws[k] < words[k]) { + cmp = -1; + break; + } else if (ws[k] > words[k]) { + cmp = 1; + break; + } + } + if (cmp == 0) { + if (nchar >= lemma_len) + last_matched = middle; + if (nchar < lemma_len) + cmp = -1; + else if (nchar > lemma_len) + cmp = 1; + } + + if (cmp < 0) { + begin = middle + 1; + } else if (cmp > 0) { + end = middle - 1; + } else { + end = middle - 1; + } + } + + return last_matched; +} + +#endif + +LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[], + uint16 lemma_len) { + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off == -1) { + return 0; + } + + return ids_[off]; +} + +LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) { + if (is_valid_state() == false) + return 0; + if (is_valid_lemma_id(lemma_id) == false) + return 0; + + return translate_score(_get_lemma_score(lemma_id)); +} + +LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[], + uint16 lemma_len) { + if (is_valid_state() == false) + return 0; + return translate_score(_get_lemma_score(lemma_str, splids, lemma_len)); +} + +int UserDict::_get_lemma_score(LemmaIdType lemma_id) { + if (is_valid_state() == false) + return 0; + if (is_valid_lemma_id(lemma_id) == false) + return 0; + + uint32 offset = offsets_by_id_[lemma_id - start_id_]; + + uint32 nchar = get_lemma_nchar(offset); + uint16 * spl = get_lemma_spell_ids(offset); + uint16 * wrd = get_lemma_word(offset); + + int32 off = locate_in_offsets(wrd, spl, nchar); + if (off == -1) { + return 0; + } + + return scores_[off]; +} + +int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[], + uint16 lemma_len) { + if (is_valid_state() == false) + return 0; + + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off == -1) { + return 0; + } + + return scores_[off]; +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::remove_lemma_from_sync_list(uint32 offset) { + offset &= 
kUserDictOffsetMask; + uint32 i = 0; + for (; i < dict_info_.sync_count; i++) { + unsigned int off = (syncs_[i] & kUserDictOffsetMask); + if (off == offset) + break; + } + if (i < dict_info_.sync_count) { + syncs_[i] = syncs_[dict_info_.sync_count - 1]; + dict_info_.sync_count--; + } +} +#endif + +#ifdef ___PREDICT_ENABLED___ +void UserDict::remove_lemma_from_predict_list(uint32 offset) { + offset &= kUserDictOffsetMask; + uint32 i = 0; + for (; i < dict_info_.lemma_count; i++) { + unsigned int off = (predicts_[i] & kUserDictOffsetMask); + if (off == offset) { + predicts_[i] |= kUserDictOffsetFlagRemove; + break; + } + } +} +#endif + +bool UserDict::remove_lemma_by_offset_index(int offset_index) { + if (is_valid_state() == false) + return 0; + + int32 off = offset_index; + if (off == -1) { + return false; + } + + uint32 offset = offsets_[off]; + uint32 nchar = get_lemma_nchar(offset); + + offsets_[off] |= kUserDictOffsetFlagRemove; + +#ifdef ___SYNC_ENABLED___ + // Remove corresponding sync item + remove_lemma_from_sync_list(offset); +#endif + +#ifdef ___PREDICT_ENABLED___ + remove_lemma_from_predict_list(offset); +#endif + dict_info_.free_count++; + dict_info_.free_size += (2 + (nchar << 2)); + + if (state_ < USER_DICT_OFFSET_DIRTY) + state_ = USER_DICT_OFFSET_DIRTY; + return true; +} + +bool UserDict::remove_lemma(LemmaIdType lemma_id) { + if (is_valid_state() == false) + return 0; + if (is_valid_lemma_id(lemma_id) == false) + return false; + uint32 offset = offsets_by_id_[lemma_id - start_id_]; + + uint32 nchar = get_lemma_nchar(offset); + uint16 * spl = get_lemma_spell_ids(offset); + uint16 * wrd = get_lemma_word(offset); + + int32 off = locate_in_offsets(wrd, spl, nchar); + + return remove_lemma_by_offset_index(off); +} + +void UserDict::flush_cache() { + LemmaIdType start_id = start_id_; + if (!dict_file_) + return; + const char * file = strdup(dict_file_); + if (!file) + return; + close_dict(); + load_dict(file, start_id, kUserDictIdEnd); + 
free((void*)file); +#ifdef ___CACHE_ENABLED___ + cache_init(); +#endif + return; +} + +bool UserDict::reset(const char *file) { + FILE *fp = fopen(file, "w+"); + if (!fp) { + return false; + } + uint32 version = kUserDictVersion; + size_t wred = fwrite(&version, 1, 4, fp); + UserDictInfo info; + memset(&info, 0, sizeof(info)); + // By default, no limitation for lemma count and size + // thereby, reclaim_ratio is never used + wred += fwrite(&info, 1, sizeof(info), fp); + if (wred != sizeof(info) + sizeof(version)) { + fclose(fp); + unlink(file); + return false; + } + fclose(fp); + return true; +} + +bool UserDict::validate(const char *file) { + // b is ignored in POSIX compatible os including Linux + // while b is important flag for Windows to specify binary mode + FILE *fp = fopen(file, "rb"); + if (!fp) { + return false; + } + + size_t size; + size_t readed; + uint32 version; + UserDictInfo dict_info; + + // validate + int err = fseek(fp, 0, SEEK_END); + if (err) { + goto error; + } + + size = ftell(fp); + if (size < 4 + sizeof(dict_info)) { + goto error; + } + + err = fseek(fp, 0, SEEK_SET); + if (err) { + goto error; + } + + readed = fread(&version, 1, sizeof(version), fp); + if (readed < sizeof(version)) { + goto error; + } + if (version != kUserDictVersion) { + goto error; + } + + err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); + if (err) { + goto error; + } + + readed = fread(&dict_info, 1, sizeof(dict_info), fp); + if (readed != sizeof(dict_info)) { + goto error; + } + + if (size != get_dict_file_size(&dict_info)) { + goto error; + } + + fclose(fp); + return true; + + error: + fclose(fp); + return false; +} + +bool UserDict::load(const char *file, LemmaIdType start_id) { + if (0 != pthread_mutex_trylock(&g_mutex_)) { + return false; + } + // b is ignored in POSIX compatible os including Linux + // while b is important flag for Windows to specify binary mode + FILE *fp = fopen(file, "rb"); + if (!fp) { + pthread_mutex_unlock(&g_mutex_); + return false; + } 
+ + size_t readed, toread; + UserDictInfo dict_info; + uint8 *lemmas = NULL; + uint32 *offsets = NULL; +#ifdef ___SYNC_ENABLED___ + uint32 *syncs = NULL; +#endif + uint32 *scores = NULL; + uint32 *ids = NULL; + uint32 *offsets_by_id = NULL; +#ifdef ___PREDICT_ENABLED___ + uint32 *predicts = NULL; +#endif + size_t i; + int err; + + err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); + if (err) goto error; + + readed = fread(&dict_info, 1, sizeof(dict_info), fp); + if (readed != sizeof(dict_info)) goto error; + + lemmas = (uint8 *)malloc( + dict_info.lemma_size + + (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)))); + + if (!lemmas) goto error; + + offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!offsets) goto error; + +#ifdef ___PREDICT_ENABLED___ + predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!predicts) goto error; +#endif + +#ifdef ___SYNC_ENABLED___ + syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2); + if (!syncs) goto error; +#endif + + scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!scores) goto error; + + ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!ids) goto error; + + offsets_by_id = (uint32 *)malloc( + (dict_info.lemma_count + kUserDictPreAlloc) << 2); + if (!offsets_by_id) goto error; + + err = fseek(fp, 4, SEEK_SET); + if (err) goto error; + + readed = 0; + while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) { + readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp); + } + if (readed < dict_info.lemma_size) + goto error; + + toread = (dict_info.lemma_count << 2); + readed = 0; + while (readed < toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp); + } + if (readed < toread) + goto error; + +#ifdef ___PREDICT_ENABLED___ + toread = (dict_info.lemma_count << 2); + readed = 0; + while (readed < 
toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp); + } + if (readed < toread) + goto error; +#endif + + readed = 0; + while (readed < toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp); + } + if (readed < toread) + goto error; + +#ifdef ___SYNC_ENABLED___ + toread = (dict_info.sync_count << 2); + readed = 0; + while (readed < toread && !ferror(fp) && !feof(fp)) { + readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp); + } + if (readed < toread) + goto error; +#endif + + for (i = 0; i < dict_info.lemma_count; i++) { + ids[i] = start_id + i; + offsets_by_id[i] = offsets[i]; + } + + lemmas_ = lemmas; + offsets_ = offsets; +#ifdef ___SYNC_ENABLED___ + syncs_ = syncs; + sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc; +#endif + offsets_by_id_ = offsets_by_id; + scores_ = scores; + ids_ = ids; +#ifdef ___PREDICT_ENABLED___ + predicts_ = predicts; +#endif + lemma_count_left_ = kUserDictPreAlloc; + lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)); + memcpy(&dict_info_, &dict_info, sizeof(dict_info)); + state_ = USER_DICT_SYNC; + + fclose(fp); + + pthread_mutex_unlock(&g_mutex_); + return true; + + error: + if (lemmas) free(lemmas); + if (offsets) free(offsets); +#ifdef ___SYNC_ENABLED___ + if (syncs) free(syncs); +#endif + if (scores) free(scores); + if (ids) free(ids); + if (offsets_by_id) free(offsets_by_id); +#ifdef ___PREDICT_ENABLED___ + if (predicts) free(predicts); +#endif + fclose(fp); + pthread_mutex_unlock(&g_mutex_); + return false; +} + +void UserDict::write_back() { + // XXX write back is only allowed from close_dict due to thread-safe sake + if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC) + return; + int fd = open(dict_file_, O_WRONLY); + if (fd == -1) + return; + switch (state_) { + case USER_DICT_DEFRAGMENTED: + write_back_all(fd); + break; + case USER_DICT_LEMMA_DIRTY: + 
write_back_lemma(fd); + break; + case USER_DICT_OFFSET_DIRTY: + write_back_offset(fd); + break; + case USER_DICT_SCORE_DIRTY: + write_back_score(fd); + break; +#ifdef ___SYNC_ENABLED___ + case USER_DICT_SYNC_DIRTY: + write_back_sync(fd); + break; +#endif + default: + break; + } + // It seems truncate is not need on Linux, Windows except Mac + // I am doing it here anyway for safety. + off_t cur = lseek(fd, 0, SEEK_CUR); +#ifndef _WIN32 + ftruncate(fd, cur); +#endif + close(fd); + state_ = USER_DICT_SYNC; +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::write_back_sync(int fd) { + int err = lseek(fd, 4 + dict_info_.lemma_size + + (dict_info_.lemma_count << 3) +#ifdef ___PREDICT_ENABLED___ + + (dict_info_.lemma_count << 2) +#endif + , SEEK_SET); + if (err == -1) + return; + write(fd, syncs_, dict_info_.sync_count << 2); + write(fd, &dict_info_, sizeof(dict_info_)); +} +#endif + +void UserDict::write_back_offset(int fd) { + int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET); + if (err == -1) + return; + write(fd, offsets_, dict_info_.lemma_count << 2); +#ifdef ___PREDICT_ENABLED___ + write(fd, predicts_, dict_info_.lemma_count << 2); +#endif + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +void UserDict::write_back_score(int fd) { + int err = lseek(fd, 4 + dict_info_.lemma_size + + (dict_info_.lemma_count << 2) +#ifdef ___PREDICT_ENABLED___ + + (dict_info_.lemma_count << 2) +#endif + , SEEK_SET); + if (err == -1) + return; + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +void UserDict::write_back_lemma(int fd) { + int err = lseek(fd, 4, SEEK_SET); + if (err == -1) + return; + // New lemmas are always appended, no need to write whole lemma block + size_t need_write = kUserDictPreAlloc * + (2 + 
(kUserDictAverageNchar << 2)) - lemma_size_left_; + err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR); + if (err == -1) + return; + write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write); + + write(fd, offsets_, dict_info_.lemma_count << 2); +#ifdef ___PREDICT_ENABLED___ + write(fd, predicts_, dict_info_.lemma_count << 2); +#endif + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +void UserDict::write_back_all(int fd) { + // XXX lemma_size is handled differently in writeall + // and writelemma. I update lemma_size and lemma_count in different + // places for these two cases. Should fix it to make it consistent. + int err = lseek(fd, 4, SEEK_SET); + if (err == -1) + return; + write(fd, lemmas_, dict_info_.lemma_size); + write(fd, offsets_, dict_info_.lemma_count << 2); +#ifdef ___PREDICT_ENABLED___ + write(fd, predicts_, dict_info_.lemma_count << 2); +#endif + write(fd, scores_, dict_info_.lemma_count << 2); +#ifdef ___SYNC_ENABLED___ + write(fd, syncs_, dict_info_.sync_count << 2); +#endif + write(fd, &dict_info_, sizeof(dict_info_)); +} + +#ifdef ___CACHE_ENABLED___ +bool UserDict::load_cache(UserDictSearchable *searchable, + uint32 *offset, uint32 *length) { + UserDictCache *cache = &caches_[searchable->splids_len - 1]; + if (cache->head == cache->tail) + return false; + + uint16 j, sig_len = kMaxLemmaSize / 4; + uint16 i = cache->head; + while (1) { + j = 0; + for (; j < sig_len; j++) { + if (cache->signatures[i][j] != searchable->signature[j]) + break; + } + if (j < sig_len) { + i++; + if (i >= kUserDictCacheSize) + i -= kUserDictCacheSize; + if (i == cache->tail) + break; + continue; + } + *offset = cache->offsets[i]; + *length = cache->lengths[i]; + return true; + } + return false; +} + +void UserDict::save_cache(UserDictSearchable *searchable, + uint32 offset, uint32 length) { + UserDictCache *cache 
= &caches_[searchable->splids_len - 1]; + uint16 next = cache->tail; + + cache->offsets[next] = offset; + cache->lengths[next] = length; + uint16 sig_len = kMaxLemmaSize / 4; + uint16 j = 0; + for (; j < sig_len; j++) { + cache->signatures[next][j] = searchable->signature[j]; + } + + if (++next >= kUserDictCacheSize) { + next -= kUserDictCacheSize; + } + if (next == cache->head) { + cache->head++; + if (cache->head >= kUserDictCacheSize) { + cache->head -= kUserDictCacheSize; + } + } + cache->tail = next; +} + +void UserDict::reset_cache() { + memset(caches_, 0, sizeof(caches_)); +} + +bool UserDict::load_miss_cache(UserDictSearchable *searchable) { + UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; + if (cache->head == cache->tail) + return false; + + uint16 j, sig_len = kMaxLemmaSize / 4; + uint16 i = cache->head; + while (1) { + j = 0; + for (; j < sig_len; j++) { + if (cache->signatures[i][j] != searchable->signature[j]) + break; + } + if (j < sig_len) { + i++; + if (i >= kUserDictMissCacheSize) + i -= kUserDictMissCacheSize; + if (i == cache->tail) + break; + continue; + } + return true; + } + return false; +} + +void UserDict::save_miss_cache(UserDictSearchable *searchable) { + UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; + uint16 next = cache->tail; + + uint16 sig_len = kMaxLemmaSize / 4; + uint16 j = 0; + for (; j < sig_len; j++) { + cache->signatures[next][j] = searchable->signature[j]; + } + + if (++next >= kUserDictMissCacheSize) { + next -= kUserDictMissCacheSize; + } + if (next == cache->head) { + cache->head++; + if (cache->head >= kUserDictMissCacheSize) { + cache->head -= kUserDictMissCacheSize; + } + } + cache->tail = next; +} + +void UserDict::reset_miss_cache() { + memset(miss_caches_, 0, sizeof(miss_caches_)); +} + +void UserDict::cache_init() { + reset_cache(); + reset_miss_cache(); +} + +bool UserDict::cache_hit(UserDictSearchable *searchable, + uint32 *offset, uint32 *length) { + bool hit = 
load_miss_cache(searchable); + if (hit) { + *offset = 0; + *length = 0; + return true; + } + hit = load_cache(searchable, offset, length); + if (hit) { + return true; + } + return false; +} + +void UserDict::cache_push(UserDictCacheType type, + UserDictSearchable *searchable, + uint32 offset, uint32 length) { + switch (type) { + case USER_DICT_MISS_CACHE: + save_miss_cache(searchable); + break; + case USER_DICT_CACHE: + save_cache(searchable, offset, length); + break; + default: + break; + } +} + +#endif + +void UserDict::defragment(void) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + if (is_valid_state() == false) + return; + // Fixup offsets_, set REMOVE flag to lemma's flag if needed + size_t first_freed = 0; + size_t first_inuse = 0; + while (first_freed < dict_info_.lemma_count) { + // Find first freed offset + while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 && + first_freed < dict_info_.lemma_count) { + first_freed++; + } + if (first_freed < dict_info_.lemma_count) { + // Save REMOVE flag to lemma flag + int off = offsets_[first_freed]; + set_lemma_flag(off, kUserDictLemmaFlagRemove); + } else { + break; + } + // Find first inuse offse after first_freed + first_inuse = first_freed + 1; + while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) && + (first_inuse < dict_info_.lemma_count)) { + // Save REMOVE flag to lemma flag + int off = offsets_[first_inuse]; + set_lemma_flag(off, kUserDictLemmaFlagRemove); + first_inuse++; + } + if (first_inuse >= dict_info_.lemma_count) { + break; + } + // Swap offsets_ + int tmp = offsets_[first_inuse]; + offsets_[first_inuse] = offsets_[first_freed]; + offsets_[first_freed] = tmp; + // Move scores_, no need to swap + tmp = scores_[first_inuse]; + scores_[first_inuse] = scores_[first_freed]; + scores_[first_freed] = tmp; + // Swap ids_ + LemmaIdType tmpid = ids_[first_inuse]; + ids_[first_inuse] = ids_[first_freed]; + ids_[first_freed] = tmpid; + // Go on + first_freed++; + } +#ifdef 
___PREDICT_ENABLED___ + // Fixup predicts_ + first_freed = 0; + first_inuse = 0; + while (first_freed < dict_info_.lemma_count) { + // Find first freed offset + while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 && + first_freed < dict_info_.lemma_count) { + first_freed++; + } + if (first_freed >= dict_info_.lemma_count) + break; + // Find first inuse offse after first_freed + first_inuse = first_freed + 1; + while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove) + && (first_inuse < dict_info_.lemma_count)) { + first_inuse++; + } + if (first_inuse >= dict_info_.lemma_count) { + break; + } + // Swap offsets_ + int tmp = predicts_[first_inuse]; + predicts_[first_inuse] = predicts_[first_freed]; + predicts_[first_freed] = tmp; + // Go on + first_freed++; + } +#endif + dict_info_.lemma_count = first_freed; + // Fixup lemmas_ + size_t begin = 0; + size_t end = 0; + size_t dst = 0; + int total_size = dict_info_.lemma_size + lemma_size_left_; + int total_count = dict_info_.lemma_count + lemma_count_left_; + size_t real_size = total_size - lemma_size_left_; + while (dst < real_size) { + unsigned char flag = get_lemma_flag(dst); + unsigned char nchr = get_lemma_nchar(dst); + if ((flag & kUserDictLemmaFlagRemove) == 0) { + dst += nchr * 4 + 2; + continue; + } + break; + } + if (dst >= real_size) + return; + + end = dst; + while (end < real_size) { + begin = end + get_lemma_nchar(end) * 4 + 2; + repeat: + // not used any more + if (begin >= real_size) + break; + unsigned char flag = get_lemma_flag(begin); + unsigned char nchr = get_lemma_nchar(begin); + if (flag & kUserDictLemmaFlagRemove) { + begin += nchr * 4 + 2; + goto repeat; + } + end = begin + nchr * 4 + 2; + while (end < real_size) { + unsigned char eflag = get_lemma_flag(end); + unsigned char enchr = get_lemma_nchar(end); + if ((eflag & kUserDictLemmaFlagRemove) == 0) { + end += enchr * 4 + 2; + continue; + } + break; + } + memmove(lemmas_ + dst, lemmas_ + begin, end - begin); + for (size_t j = 0; 
j < dict_info_.lemma_count; j++) { + if (offsets_[j] >= begin && offsets_[j] < end) { + offsets_[j] -= (begin - dst); + offsets_by_id_[ids_[j] - start_id_] = offsets_[j]; + } +#ifdef ___PREDICT_ENABLED___ + if (predicts_[j] >= begin && predicts_[j] < end) { + predicts_[j] -= (begin - dst); + } +#endif + } +#ifdef ___SYNC_ENABLED___ + for (size_t j = 0; j < dict_info_.sync_count; j++) { + if (syncs_[j] >= begin && syncs_[j] < end) { + syncs_[j] -= (begin - dst); + } + } +#endif + dst += (end - begin); + } + + dict_info_.free_count = 0; + dict_info_.free_size = 0; + dict_info_.lemma_size = dst; + lemma_size_left_ = total_size - dict_info_.lemma_size; + lemma_count_left_ = total_count - dict_info_.lemma_count; + + // XXX Without following code, + // offsets_by_id_ is not reordered. + // That's to say, all removed lemmas' ids are not collected back. + // There may not be room for addition of new lemmas due to + // offsests_by_id_ reason, although lemma_size_left_ is fixed. + // By default, we do want defrag as fast as possible, because + // during defrag procedure, other peers can not write new lemmas + // to user dictionary file. + // XXX If write-back is invoked immediately after + // this defragment, no need to fix up following in-mem data. 
+ for (uint32 i = 0; i < dict_info_.lemma_count; i++) { + ids_[i] = start_id_ + i; + offsets_by_id_[i] = offsets_[i]; + } + + state_ = USER_DICT_DEFRAGMENTED; + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("defragment"); +#endif +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) { + if (is_valid_state() == false) + return; + if (end > dict_info_.sync_count) + end = dict_info_.sync_count; + memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2); + dict_info_.sync_count -= (end - start); + if (state_ < USER_DICT_SYNC_DIRTY) + state_ = USER_DICT_SYNC_DIRTY; +} + +int UserDict::get_sync_count() { + if (is_valid_state() == false) + return 0; + return dict_info_.sync_count; +} + +LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[], + uint16 lemma_len, uint16 count, uint64 lmt) { + int again = 0; + begin: + LemmaIdType id; + uint32 * syncs_bak = syncs_; + syncs_ = NULL; + id = _put_lemma(lemma_str, splids, lemma_len, count, lmt); + syncs_ = syncs_bak; + if (id == 0 && again == 0) { + if ((dict_info_.limit_lemma_count > 0 && + dict_info_.lemma_count >= dict_info_.limit_lemma_count) + || (dict_info_.limit_lemma_size > 0 && + dict_info_.lemma_size + (2 + (lemma_len << 2)) + > dict_info_.limit_lemma_size)) { + // XXX Always reclaim and defrag in sync code path + // sync thread is background thread and ok with heavy work + reclaim(); + defragment(); + flush_cache(); + again = 1; + goto begin; + } + } + return id; +} + +int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) { + int newly_added = 0; + + SpellingParser * spl_parser = new SpellingParser(); + if (!spl_parser) { + return 0; + } +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + char16 *ptr = lemmas; + + // Extract pinyin,words,frequence,last_mod_time + char16 * p = ptr, * py16 = ptr; + char16 * hz16 = NULL; + int py16_len = 0; + uint16 splid[kMaxLemmaSize]; + int splid_len = 0; + int 
hz16_len = 0; + char16 * fr16 = NULL; + int fr16_len = 0; + + while (p - ptr < len) { + // Pinyin + py16 = p; + splid_len = 0; + while (*p != 0x2c && (p - ptr) < len) { + if (*p == 0x20) + splid_len++; + p++; + } + splid_len++; + if (p - ptr == len) + break; + py16_len = p - py16; + if (kMaxLemmaSize < splid_len) { + break; + } + bool is_pre; + int splidl = spl_parser->splstr16_to_idxs_f( + py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre); + if (splidl != splid_len) + break; + // Phrase + hz16 = ++p; + while (*p != 0x2c && (p - ptr) < len) { + p++; + } + hz16_len = p - hz16; + if (hz16_len != splid_len) + break; + // Frequency + fr16 = ++p; + fr16_len = 0; + while (*p != 0x2c && (p - ptr) < len) { + p++; + } + fr16_len = p - fr16; + uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len); + // Last modified time + fr16 = ++p; + fr16_len = 0; + while (*p != 0x3b && (p - ptr) < len) { + p++; + } + fr16_len = p - fr16; + uint64 last_mod = utf16le_atoll(fr16, fr16_len); + + put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod); + newly_added++; + + p++; + } + +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("put_lemmas_no_sync_from_utf16le_string"); +#endif + return newly_added; +} + +int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning( + char16 * str, int size, int * count) { + int len = 0; + *count = 0; + + int left_len = size; + + if (is_valid_state() == false) + return len; + + SpellingTrie * spl_trie = &SpellingTrie::get_instance(); + if (!spl_trie) { + return 0; + } + + uint32 i; + for (i = 0; i < dict_info_.sync_count; i++) { + int offset = syncs_[i]; + uint32 nchar = get_lemma_nchar(offset); + uint16 *spl = get_lemma_spell_ids(offset); + uint16 *wrd = get_lemma_word(offset); + int score = _get_lemma_score(wrd, spl, nchar); + + static char score_temp[32], *pscore_temp = score_temp; + static char16 temp[256], *ptemp = temp; + + pscore_temp = score_temp; + ptemp = temp; + + uint32 j; + // Add pinyin + for (j = 0; j < nchar; j++) { + int ret_len 
= spl_trie->get_spelling_str16( + spl[j], ptemp, temp + sizeof(temp) - ptemp); + if (ret_len <= 0) + break; + ptemp += ret_len; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ' '; + } else { + j = 0; + break; + } + } + if (j < nchar) { + continue; + } + ptemp--; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ','; + } else { + continue; + } + // Add phrase + for (j = 0; j < nchar; j++) { + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = wrd[j]; + } else { + break; + } + } + if (j < nchar) { + continue; + } + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ','; + } else { + continue; + } + // Add frequency + uint32 intf = extract_score_freq(score); + int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp); + if (ret_len <= 0) + continue; + ptemp += ret_len; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ','; + } else { + continue; + } + // Add last modified time + uint64 last_mod = extract_score_lmt(score); + ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp); + if (ret_len <= 0) + continue; + ptemp += ret_len; + if (ptemp < temp + sizeof(temp) - 1) { + *(ptemp++) = ';'; + } else { + continue; + } + + // Write to string + int need_len = ptemp - temp; + if (need_len > left_len) + break; + memcpy(str + len, temp, need_len * 2); + left_len -= need_len; + + len += need_len; + (*count)++; + } + + if (len > 0) { + if (state_ < USER_DICT_SYNC_DIRTY) + state_ = USER_DICT_SYNC_DIRTY; + } + return len; +} + +#endif + +bool UserDict::state(UserDictStat * stat) { + if (is_valid_state() == false) + return false; + if (!stat) + return false; + stat->version = version_; + stat->file_name = dict_file_; + stat->load_time.tv_sec = load_time_.tv_sec; + stat->load_time.tv_usec = load_time_.tv_usec; + pthread_mutex_lock(&g_mutex_); + stat->last_update.tv_sec = g_last_update_.tv_sec; + stat->last_update.tv_usec = g_last_update_.tv_usec; + pthread_mutex_unlock(&g_mutex_); + stat->disk_size = 
get_dict_file_size(&dict_info_); + stat->lemma_count = dict_info_.lemma_count; + stat->lemma_size = dict_info_.lemma_size; + stat->delete_count = dict_info_.free_count; + stat->delete_size = dict_info_.free_size; +#ifdef ___SYNC_ENABLED___ + stat->sync_count = dict_info_.sync_count; +#endif + stat->limit_lemma_count = dict_info_.limit_lemma_count; + stat->limit_lemma_size = dict_info_.limit_lemma_size; + stat->reclaim_ratio = dict_info_.reclaim_ratio; + return true; +} + +void UserDict::set_limit(uint32 max_lemma_count, + uint32 max_lemma_size, uint32 reclaim_ratio) { + dict_info_.limit_lemma_count = max_lemma_count; + dict_info_.limit_lemma_size = max_lemma_size; + if (reclaim_ratio > 100) + reclaim_ratio = 100; + dict_info_.reclaim_ratio = reclaim_ratio; +} + +void UserDict::reclaim() { + if (is_valid_state() == false) + return; + + switch (dict_info_.reclaim_ratio) { + case 0: + return; + case 100: + // TODO: CLEAR to be implemented + assert(false); + return; + default: + break; + } + + // XXX Reclaim is only based on count, not size + uint32 count = dict_info_.lemma_count; + int rc = count * dict_info_.reclaim_ratio / 100; + + UserDictScoreOffsetPair * score_offset_pairs = NULL; + score_offset_pairs = (UserDictScoreOffsetPair *)malloc( + sizeof(UserDictScoreOffsetPair) * rc); + if (score_offset_pairs == NULL) { + return; + } + + for (int i = 0; i < rc; i++) { + int s = scores_[i]; + score_offset_pairs[i].score = s; + score_offset_pairs[i].offset_index = i; + } + + for (int i = (rc + 1) / 2; i >= 0; i--) + shift_down(score_offset_pairs, i, rc); + + for (uint32 i = rc; i < dict_info_.lemma_count; i++) { + int s = scores_[i]; + if (s < score_offset_pairs[0].score) { + score_offset_pairs[0].score = s; + score_offset_pairs[0].offset_index = i; + shift_down(score_offset_pairs, 0, rc); + } + } + + for (int i = 0; i < rc; i++) { + int off = score_offset_pairs[i].offset_index; + remove_lemma_by_offset_index(off); + } + if (rc > 0) { + if (state_ < 
USER_DICT_OFFSET_DIRTY) + state_ = USER_DICT_OFFSET_DIRTY; + } + + free(score_offset_pairs); +} + +inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) { + int s = sop[i].score; + int p = sop[i].offset_index; + sop[i].score = sop[j].score; + sop[i].offset_index = sop[j].offset_index; + sop[j].score = s; + sop[j].offset_index = p; +} + +void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) { + int par = i; + while (par < n) { + int left = par * 2 + 1; + int right = left + 1; + if (left >= n && right >= n) + break; + if (right >= n) { + if (sop[left].score > sop[par].score) { + swap(sop, left, par); + par = left; + continue; + } + } else if (sop[left].score > sop[right].score && + sop[left].score > sop[par].score) { + swap(sop, left, par); + par = left; + continue; + } else if (sop[right].score > sop[left].score && + sop[right].score > sop[par].score) { + swap(sop, right, par); + par = right; + continue; + } + break; + } +} + +LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[], + uint16 lemma_len, uint16 count) { + return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL)); +} + +LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[], + uint16 lemma_len, uint16 count, uint64 lmt) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + if (is_valid_state() == false) + return 0; + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off != -1) { + int delta_score = count - scores_[off]; + dict_info_.total_nfreq += delta_score; + scores_[off] = build_score(lmt, count); + if (state_ < USER_DICT_SCORE_DIRTY) + state_ = USER_DICT_SCORE_DIRTY; +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("_put_lemma(update)"); +#endif + return ids_[off]; + } else { + if ((dict_info_.limit_lemma_count > 0 && + dict_info_.lemma_count >= dict_info_.limit_lemma_count) + || (dict_info_.limit_lemma_size > 0 && + dict_info_.lemma_size + (2 + (lemma_len << 2)) + > dict_info_.limit_lemma_size)) { + // 
XXX Don't defragment here, it's too time-consuming. + return 0; + } + int flushed = 0; + if (lemma_count_left_ == 0 || + lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) { + + // XXX When there is no space for new lemma, we flush to disk + // flush_cache() may be called by upper user + // and better place shoule be found instead of here + flush_cache(); + flushed = 1; + // Or simply return and do nothing + // return 0; + } +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)"); +#endif + LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt); +#ifdef ___SYNC_ENABLED___ + if (syncs_ && id != 0) { + queue_lemma_for_sync(id); + } +#endif + return id; + } + return 0; +} + +#ifdef ___SYNC_ENABLED___ +void UserDict::queue_lemma_for_sync(LemmaIdType id) { + if (dict_info_.sync_count < sync_count_size_) { + syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; + } else { + uint32 * syncs = (uint32*)realloc( + syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2); + if (syncs) { + sync_count_size_ += kUserDictPreAlloc; + syncs_ = syncs; + syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; + } + } +} +#endif + +LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count, + bool selected) { +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_BEGIN; +#endif + if (is_valid_state() == false) + return 0; + if (is_valid_lemma_id(lemma_id) == false) + return 0; + uint32 offset = offsets_by_id_[lemma_id - start_id_]; + uint8 lemma_len = get_lemma_nchar(offset); + char16 * lemma_str = get_lemma_word(offset); + uint16 * splids = get_lemma_spell_ids(offset); + + int32 off = locate_in_offsets(lemma_str, splids, lemma_len); + if (off != -1) { + int score = scores_[off]; + int count = extract_score_freq(score); + uint64 lmt = extract_score_lmt(score); + if (count + delta_count > kUserDictMaxFrequency || + count + delta_count < count) { + delta_count = kUserDictMaxFrequency - count; + } + 
count += delta_count; + dict_info_.total_nfreq += delta_count; + if (selected) { + lmt = time(NULL); + } + scores_[off] = build_score(lmt, count); + if (state_ < USER_DICT_SCORE_DIRTY) + state_ = USER_DICT_SCORE_DIRTY; +#ifdef ___DEBUG_PERF___ + DEBUG_PERF_END; + LOGD_PERF("update_lemma"); +#endif +#ifdef ___SYNC_ENABLED___ + queue_lemma_for_sync(ids_[off]); +#endif + return ids_[off]; + } + return 0; +} + +size_t UserDict::get_total_lemma_count() { + return dict_info_.total_nfreq; +} + +void UserDict::set_total_lemma_count_of_others(size_t count) { + total_other_nfreq_ = count; +} + +LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[], + uint16 lemma_len, uint16 count, uint64 lmt) { + LemmaIdType id = get_max_lemma_id() + 1; + size_t offset = dict_info_.lemma_size; + if (offset > kUserDictOffsetMask) + return 0; + + lemmas_[offset] = 0; + lemmas_[offset + 1] = (uint8)lemma_len; + for (size_t i = 0; i < lemma_len; i++) { + *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i]; + *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)]) + = lemma_str[i]; + } + uint32 off = dict_info_.lemma_count; + offsets_[off] = offset; + scores_[off] = build_score(lmt, count); + ids_[off] = id; +#ifdef ___PREDICT_ENABLED___ + predicts_[off] = offset; +#endif + + offsets_by_id_[id - start_id_] = offset; + + dict_info_.lemma_count++; + dict_info_.lemma_size += (2 + (lemma_len << 2)); + lemma_count_left_--; + lemma_size_left_ -= (2 + (lemma_len << 2)); + + // Sort + + UserDictSearchable searchable; + prepare_locate(&searchable, splids, lemma_len); + + size_t i = 0; + while (i < off) { + offset = offsets_[i]; + uint32 nchar = get_lemma_nchar(offset); + uint16 * spl = get_lemma_spell_ids(offset); + + if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable)) + break; + i++; + } + if (i != off) { + uint32 temp = offsets_[off]; + memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2); + offsets_[i] = temp; + + temp = scores_[off]; + memmove(scores_ + i + 1, 
scores_ + i, (off - i) << 2); + scores_[i] = temp; + + temp = ids_[off]; + memmove(ids_ + i + 1, ids_ + i, (off - i) << 2); + ids_[i] = temp; + } + +#ifdef ___PREDICT_ENABLED___ + uint32 j = 0; + uint16 * words_new = get_lemma_word(predicts_[off]); + j = locate_where_to_insert_in_predicts(words_new, lemma_len); + if (j != off) { + uint32 temp = predicts_[off]; + memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2); + predicts_[j] = temp; + } +#endif + + if (state_ < USER_DICT_LEMMA_DIRTY) + state_ = USER_DICT_LEMMA_DIRTY; + +#ifdef ___CACHE_ENABLED___ + cache_init(); +#endif + + dict_info_.total_nfreq += count; + return id; +} +} diff --git a/googlepinyin/userdict.h b/googlepinyin/userdict.h new file mode 100644 index 0000000..1b9673f --- /dev/null +++ b/googlepinyin/userdict.h @@ -0,0 +1,432 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// User dictionary: an atom dictionary (see AtomDictBase) that stores lemmas
// added at run time together with a packed score per lemma.
class UserDict : public AtomDictBase {
 public:
  UserDict();
  ~UserDict();

  bool load_dict(const char *file_name, LemmaIdType start_id,
                 LemmaIdType end_id);

  bool close_dict();

  size_t number_of_lemmas();

  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);

  MileStoneHandle extend_dict(MileStoneHandle from_handle,
                              const DictExtPara *dep, LmaPsbItem *lpi_items,
                              size_t lpi_max, size_t *lpi_num);

  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                  LmaPsbItem *lpi_items, size_t lpi_max);

  uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
                       uint16 str_max);

  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                          uint16 splids_max, bool arg_valid);

  size_t predict(const char16 last_hzs[], uint16 hzs_len,
                 NPredictItem *npre_items, size_t npre_max,
                 size_t b4_used);

  // Add a lemma, or update it if it already exists, stamped with the
  // current time. Full spelling ids are required.
  LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len, uint16 count);

  // Adjust the frequency of an existing lemma by delta_count. When selected
  // is true the last-modified time is refreshed as well.
  LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
                           bool selected);

  LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
                           uint16 lemma_len);

  LmaScoreType get_lemma_score(LemmaIdType lemma_id);

  LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len);

  bool remove_lemma(LemmaIdType lemma_id);

  size_t get_total_lemma_count();
  void set_total_lemma_count_of_others(size_t count);

  void flush_cache();

  // Set the count and size limits (0 means no limitation) and the
  // percentage of lemmas that reclaim() removes (capped at 100).
  void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
                 uint32 reclaim_ratio);

  // Remove the lowest-scored lemmas according to the reclaim ratio.
  void reclaim();

  void defragment();

#ifdef ___SYNC_ENABLED___
  // Remove entries [start, end) from the pending-sync list.
  void clear_sync_lemmas(unsigned int start, unsigned int end);

  // Number of lemmas currently queued for sync.
  int get_sync_count();

  // Same as put_lemma() but with an explicit last-modified time and without
  // queueing the lemma for sync.
  LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len, uint16 count, uint64 lmt);
   /**
    * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
    *
    * @param lemmas records of the form
    *        'pinyin,phrase,frequency,last_mod_time;' where the pinyin
    *        syllables are separated by spaces
    * @param len length of lemmas string in UTF-16LE
    * @return newly added lemma count
    */
  int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);

  /**
   * Get lemmas need sync to a UTF-16LE string of above format.
   * Note: input buffer (str) must not be too small. If str is too small to
   *       contain single one lemma, there might be a dead loop.
   *
   * @param str buffer to write lemmas
   * @param size buffer size in UTF-16LE
   * @param count output value of lemma returned
   * @return UTF-16LE string length
   */
  int get_sync_lemmas_in_utf16le_string_from_beginning(
      char16 * str, int size, int * count);

#endif

  // Snapshot of the dictionary's bookkeeping values, filled in by state().
  struct UserDictStat {
    uint32 version;
    const char * file_name;
    struct timeval load_time;
    struct timeval last_update;
    uint32 disk_size;
    uint32 lemma_count;
    uint32 lemma_size;
    uint32 delete_count;
    uint32 delete_size;
#ifdef ___SYNC_ENABLED___
    uint32 sync_count;
#endif
    uint32 reclaim_ratio;
    uint32 limit_lemma_count;
    uint32 limit_lemma_size;
  };

  // Fill *stat. Returns false if the dictionary is not in a valid state or
  // stat is NULL.
  bool state(UserDictStat * stat);

 private:
  uint32 total_other_nfreq_;
  struct timeval load_time_;
  LemmaIdType start_id_;
  uint32 version_;
  // Raw lemma records; see the on-disk format at the end of this class.
  uint8 * lemmas_;

  // In-Memory-Only flag for each lemma
  static const uint8 kUserDictLemmaFlagRemove = 1;
  // Inuse lemmas' offset
  uint32 * offsets_;
  // Highest bit in offset tells whether corresponding lemma is removed
  static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
  // Maximum possible for the offset
  static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
  // Bit width for last modified time, from 1 to 16
  static const uint32 kUserDictLMTBitWidth = 16;
  // Granularity for last modified time in second
  static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
  // Maximum frequency count
  static const uint16 kUserDictMaxFrequency = 0xFFFF;

// Approximate seconds since 1970: every year counts as 365 days and every
// month as 30 days.
#define COARSE_UTC(year, month, day, hour, minute, second) \
  ( \
    (year - 1970) * 365 * 24 * 60 * 60 + \
    (month - 1) * 30 * 24 * 60 * 60 + \
    (day - 1) * 24 * 60 * 60 + \
    (hour - 0) * 60 * 60 + \
    (minute - 0) * 60 + \
    (second - 0) \
  )
  static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);

  // Correspond to offsets_
  uint32 * scores_;
  // Following two fields are only valid in memory
  uint32 * ids_;
#ifdef ___PREDICT_ENABLED___
  uint32 * predicts_;
#endif
#ifdef ___SYNC_ENABLED___
  // Offsets of lemmas waiting to be synced, and the capacity of that array
  // in entries.
  uint32 * syncs_;
  size_t sync_count_size_;
#endif
  uint32 * offsets_by_id_;

  size_t lemma_count_left_;
  size_t lemma_size_left_;

  const char * dict_file_;

  // Be sure size is 4xN
  struct UserDictInfo {
    // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
    uint32 reclaim_ratio;
    // maximum lemma count, 0 means no limitation
    uint32 limit_lemma_count;
    // Maximum lemma size, it's different from
    // whole disk file size or in-mem dict size
    // 0 means no limitation
    uint32 limit_lemma_size;
    // Total lemma count including deleted and inuse
    // Also indicate offsets_ size
    uint32 lemma_count;
    // Total size of lemmas including used and freed
    uint32 lemma_size;
    // Freed lemma count
    uint32 free_count;
    // Freed lemma size in byte
    uint32 free_size;
#ifdef ___SYNC_ENABLED___
    uint32 sync_count;
#endif
    // Sum of the frequency counts of the lemmas in this dictionary
    int32 total_nfreq;
  } dict_info_;

  static const uint32 kUserDictVersion = 0x0ABCDEF0;

  static const uint32 kUserDictPreAlloc = 32;
  static const uint32 kUserDictAverageNchar = 8;

  enum UserDictState {
    // Keep in order: code raises state_ with "if (state_ < X) state_ = X"
    USER_DICT_NONE = 0,
    USER_DICT_SYNC,
#ifdef ___SYNC_ENABLED___
    USER_DICT_SYNC_DIRTY,
#endif
    USER_DICT_SCORE_DIRTY,
    USER_DICT_OFFSET_DIRTY,
    USER_DICT_LEMMA_DIRTY,

    USER_DICT_DEFRAGMENTED,
  } state_;

  struct UserDictSearchable {
    uint16 splids_len;
    uint16 splid_start[kMaxLemmaSize];
    uint16 splid_count[kMaxLemmaSize];
    // Compact initial letters for both FuzzyCompareSpellId and cache system
    uint32 signature[kMaxLemmaSize / 4];
  };

#ifdef ___CACHE_ENABLED___
  enum UserDictCacheType {
    USER_DICT_CACHE,
    USER_DICT_MISS_CACHE,
  };

  static const int kUserDictCacheSize = 4;
  static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;

  struct UserDictMissCache {
    uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
    uint16 head, tail;
  } miss_caches_[kMaxLemmaSize];

  struct UserDictCache {
    uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
    uint32 offsets[kUserDictCacheSize];
    uint32 lengths[kUserDictCacheSize];
    // Ring buffer
    uint16 head, tail;
  } caches_[kMaxLemmaSize];

  void cache_init();

  void cache_push(UserDictCacheType type,
                 UserDictSearchable *searchable,
                 uint32 offset, uint32 length);

  bool cache_hit(UserDictSearchable *searchable,
                 uint32 *offset, uint32 *length);

  bool load_cache(UserDictSearchable *searchable,
                  uint32 *offset, uint32 *length);

  void save_cache(UserDictSearchable *searchable,
                  uint32 offset, uint32 length);

  void reset_cache();

  bool load_miss_cache(UserDictSearchable *searchable);

  void save_miss_cache(UserDictSearchable *searchable);

  void reset_miss_cache();
#endif

  LmaScoreType translate_score(int f);

  // A raw score packs a frequency count and a last-modified time; these
  // three helpers split and rebuild it.
  int extract_score_freq(int raw_score);

  uint64 extract_score_lmt(int raw_score);

  inline int build_score(uint64 lmt, int freq);

  inline int64 utf16le_atoll(uint16 *s, int len);

  inline int utf16le_lltoa(int64 v, uint16 *s, int size);

  LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
                         uint16 lemma_len, uint16 count, uint64 lmt);

  size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                   LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);

  int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);

  int _get_lemma_score(LemmaIdType lemma_id);

  int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
                               const UserDictSearchable *searchable);

  bool is_prefix_spell_id(const uint16 * fullids,
                          uint16 fulllen, const UserDictSearchable *searchable);

  uint32 get_dict_file_size(UserDictInfo * info);

  bool reset(const char *file);

  bool validate(const char *file);

  bool load(const char *file, LemmaIdType start_id);

  bool is_valid_state();

  bool is_valid_lemma_id(LemmaIdType id);

  LemmaIdType get_max_lemma_id();

  void set_lemma_flag(uint32 offset, uint8 flag);

  char get_lemma_flag(uint32 offset);

  char get_lemma_nchar(uint32 offset);

  uint16 * get_lemma_spell_ids(uint32 offset);

  uint16 * get_lemma_word(uint32 offset);

  // Prepare searchable to speed up the locate process
  void prepare_locate(UserDictSearchable *searchable,
                      const uint16 * splids, uint16 len);

  // Compare initial letters only
  int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
                               const UserDictSearchable *searchable);

  // Compare exactly two spell ids
  // First argument must be a full id spell id
  bool equal_spell_id(const uint16 * fullids,
                      uint16 fulllen, const UserDictSearchable *searchable);

  // Find first item by initial letters
  int32 locate_first_in_offsets(const UserDictSearchable *searchable);

  // Append a new lemma record and insert it into the sorted index arrays
  LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
                      uint16 lemma_len, uint16 count, uint64 lmt);

  // Check if a lemma is in dictionary
  int32 locate_in_offsets(char16 lemma_str[],
                          uint16 splid_str[], uint16 lemma_len);

  bool remove_lemma_by_offset_index(int offset_index);
#ifdef ___PREDICT_ENABLED___
  uint32 locate_where_to_insert_in_predicts(const uint16 * words,
                                            int lemma_len);

  int32 locate_first_in_predicts(const uint16 * words, int lemma_len);

  void remove_lemma_from_predict_list(uint32 offset);
#endif
#ifdef ___SYNC_ENABLED___
  // Append the lemma's offset to the pending-sync list
  void queue_lemma_for_sync(LemmaIdType id);

  void remove_lemma_from_sync_list(uint32 offset);

  void write_back_sync(int fd);
#endif
  void write_back_score(int fd);
  void write_back_offset(int fd);
  void write_back_lemma(int fd);
  void write_back_all(int fd);
  void write_back();

  // Heap entry used by reclaim(): a raw score and the index of the lemma in
  // the offset-ordered arrays.
  struct UserDictScoreOffsetPair {
    int score;
    uint32 offset_index;
  };

  inline void swap(UserDictScoreOffsetPair * sop, int i, int j);

  // Sift entry i down a max-heap of n entries keyed on score
  void shift_down(UserDictScoreOffsetPair * sop, int i, int n);

  // On-disk format for each lemma
  // +-------------+
  // | Version (4) |
  // +-------------+
  // +-----------+-----------+--------------------+-------------------+
  // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
  // +-----------+-----------+--------------------+-------------------+
  // ...
  // +-----------------------+     +-------------+      <---Offset of offset
  // | Offset1 by_splids (4) | ... | OffsetN (4) |
  // +-----------------------+     +-------------+
#ifdef ___PREDICT_ENABLED___
  // +----------------------+     +-------------+
  // | Offset1 by_lemma (4) | ... | OffsetN (4) |
  // +----------------------+     +-------------+
#endif
  // +------------+     +------------+
  // | Score1 (4) | ... | ScoreN (4) |
  // +------------+     +------------+
#ifdef ___SYNC_ENABLED___
  // +-------------+     +-------------+
  // | NewAdd1 (4) | ... | NewAddN (4) |
  // +-------------+     +-------------+
#endif
  // +----------------+
  // | Dict Info (4x) |
  // +----------------+
};
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "utf16char.h" + +namespace ime_pinyin { + +#ifdef __cplusplus +extern "C" { +#endif + + char16* utf16_strtok(char16 *utf16_str, size_t *token_size, + char16 **utf16_str_next) { + if (NULL == utf16_str || NULL == token_size || NULL == utf16_str_next) { + return NULL; + } + + // Skip the splitters + size_t pos = 0; + while ((char16)' ' == utf16_str[pos] || (char16)'\n' == utf16_str[pos] + || (char16)'\t' == utf16_str[pos]) + pos++; + + utf16_str += pos; + pos = 0; + + while ((char16)'\0' != utf16_str[pos] && (char16)' ' != utf16_str[pos] + && (char16)'\n' != utf16_str[pos] + && (char16)'\t' != utf16_str[pos]) { + pos++; + } + + char16 *ret_val = utf16_str; + if ((char16)'\0' == utf16_str[pos]) { + *utf16_str_next = NULL; + if (0 == pos) + return NULL; + } else { + *utf16_str_next = utf16_str + pos + 1; + } + + utf16_str[pos] = (char16)'\0'; + *token_size = pos; + + return ret_val; + } + + int utf16_atoi(const char16 *utf16_str) { + if (NULL == utf16_str) + return 0; + + int value = 0; + int sign = 1; + size_t pos = 0; + + if ((char16)'-' == utf16_str[pos]) { + sign = -1; + pos++; + } + + while ((char16)'0' <= utf16_str[pos] && + (char16)'9' >= utf16_str[pos]) { + value = value * 10 + static_cast(utf16_str[pos] - (char16)'0'); + pos++; + } + + return value*sign; + } + + float utf16_atof(const char16 *utf16_str) { + // A temporary implemetation. 
+ char char8[256]; + if (utf16_strlen(utf16_str) >= 256) return 0; + + utf16_strcpy_tochar(char8, utf16_str); + return atof(char8); + } + + size_t utf16_strlen(const char16 *utf16_str) { + if (NULL == utf16_str) + return 0; + + size_t size = 0; + while ((char16)'\0' != utf16_str[size]) + size++; + return size; + } + + int utf16_strcmp(const char16* str1, const char16* str2) { + size_t pos = 0; + while (str1[pos] == str2[pos] && (char16)'\0' != str1[pos]) + pos++; + + return static_cast(str1[pos]) - static_cast(str2[pos]); + } + + int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size) { + size_t pos = 0; + while (pos < size && str1[pos] == str2[pos] && (char16)'\0' != str1[pos]) + pos++; + + if (pos == size) + return 0; + + return static_cast(str1[pos]) - static_cast(str2[pos]); + } + + // we do not consider overlapping + char16* utf16_strcpy(char16 *dst, const char16 *src) { + if (NULL == src || NULL == dst) + return NULL; + + char16* cp = dst; + + while ((char16)'\0' != *src) { + *cp = *src; + cp++; + src++; + } + + *cp = *src; + + return dst; + } + + char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size) { + if (NULL == src || NULL == dst || 0 == size) + return NULL; + + if (src == dst) + return dst; + + char16* cp = dst; + + if (dst < src || (dst > src && dst >= src + size)) { + while (size-- && (*cp++ = *src++)) + ; + } else { + cp += size - 1; + src += size - 1; + while (size-- && (*cp-- == *src--)) + ; + } + return dst; + } + + // We do not handle complicated cases like overlapping, because in this + // codebase, it is not necessary. 
+ char* utf16_strcpy_tochar(char *dst, const char16 *src) { + if (NULL == src || NULL == dst) + return NULL; + + char* cp = dst; + + while ((char16)'\0' != *src) { + *cp = static_cast(*src); + cp++; + src++; + } + *cp = *src; + + return dst; + } + +#ifdef __cplusplus +} +#endif +} // namespace ime_pinyin diff --git a/googlepinyin/utf16char.h b/googlepinyin/utf16char.h new file mode 100644 index 0000000..7e957db --- /dev/null +++ b/googlepinyin/utf16char.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
#ifdef __cplusplus
extern "C" {
#endif

  // One UTF-16 code unit.
  typedef unsigned short char16;

  // Get a token from utf16_str,
  // Returned pointer is a '\0'-terminated utf16 string, or NULL
  // *utf16_str_next returns the next part of the string for further tokenizing
  char16* utf16_strtok(char16 *utf16_str, size_t *token_size,
                       char16 **utf16_str_next);

  // Parse a decimal integer with an optional leading '-'.
  int utf16_atoi(const char16 *utf16_str);

  // Parse a floating point number.
  float utf16_atof(const char16 *utf16_str);

  // Length in char16 units, not counting the terminator; 0 for NULL.
  size_t utf16_strlen(const char16 *utf16_str);

  // Unit-wise comparison; the n variant looks at no more than size units.
  int utf16_strcmp(const char16 *str1, const char16 *str2);
  int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size);

  // Copy src to dst; the n variant copies at most size units.
  char16* utf16_strcpy(char16 *dst, const char16 *src);
  char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size);


  // Copy src to dst, converting each unit to char.
  char* utf16_strcpy_tochar(char *dst, const char16 *src);

#ifdef __cplusplus
}
#endif
+ */
+
+#include "utf16reader.h"
+
+namespace ime_pinyin {
+
+#define MIN_BUF_LEN 128
+#define MAX_BUF_LEN 65535
+
+// Creates a reader with no file attached and no read buffer allocated.
+Utf16Reader::Utf16Reader() {
+  fp_ = NULL;
+  buffer_ = NULL;
+  buffer_total_len_ = 0;
+  buffer_next_pos_ = 0;
+  buffer_valid_len_ = 0;
+}
+
+// Closes the file if it is still open and frees the read buffer.
+Utf16Reader::~Utf16Reader() {
+  if (NULL != fp_)
+    fclose(fp_);
+
+  if (NULL != buffer_)
+    delete [] buffer_;
+}
+
+
+// Opens filename for reading and (re)allocates the internal read buffer.
+//
+// buffer_len is the requested buffer size in char16 units; it is clamped to
+// [MIN_BUF_LEN, MAX_BUF_LEN].
+// Returns false if filename is NULL, the file cannot be opened, or its first
+// 16-bit unit (read in host byte order) is not the 0xfeff byte-order mark.
+bool Utf16Reader::open(const char* filename, size_t buffer_len) {
+  if (filename == NULL)
+    return false;
+
+  // The reader may be opened more than once: release the previous file and
+  // forget any characters still buffered from it, otherwise the old FILE*
+  // leaks and readline() would return data belonging to the old file.
+  if (NULL != fp_) {
+    fclose(fp_);
+    fp_ = NULL;
+  }
+  buffer_next_pos_ = 0;
+  buffer_valid_len_ = 0;
+
+  if (buffer_len < MIN_BUF_LEN)
+    buffer_len = MIN_BUF_LEN;
+  else if (buffer_len > MAX_BUF_LEN)
+    buffer_len = MAX_BUF_LEN;
+
+  buffer_total_len_ = buffer_len;
+
+  if (NULL != buffer_)
+    delete [] buffer_;
+  buffer_ = new char16[buffer_total_len_];
+  if (NULL == buffer_)
+    return false;
+
+  if ((fp_ = fopen(filename, "rb")) == NULL)
+    return false;
+
+  // the UTF16 file header, skip
+  char16 header;
+  if (fread(&header, sizeof(header), 1, fp_) != 1 || header != 0xfeff) {
+    fclose(fp_);
+    fp_ = NULL;
+    return false;
+  }
+
+  return true;
+}
+
+// Reads the next line into read_buf as a '\0'-terminated UTF-16 string.
+//
+// At most max_len - 1 characters are stored. The terminating '\n' is dropped,
+// and so is a '\r' stored immediately before the cut position.
+// Returns read_buf on success, or NULL when no file is open, read_buf is
+// NULL, max_len is 0, or the end of the file is reached before any character
+// of a new line was read.
+char16* Utf16Reader::readline(char16* read_buf, size_t max_len) {
+  if (NULL == fp_ || NULL == read_buf || 0 == max_len)
+    return NULL;
+
+  // Number of characters already copied to read_buf from earlier refills of
+  // the internal buffer during this call.
+  size_t ret_len = 0;
+
+  do {
+    if (buffer_valid_len_ == 0) {
+      buffer_next_pos_ = 0;
+      buffer_valid_len_ = fread(buffer_, sizeof(char16),
+                                buffer_total_len_, fp_);
+      if (buffer_valid_len_ == 0) {
+        if (0 == ret_len)
+          return NULL;
+        read_buf[ret_len] = (char16)'\0';
+        return read_buf;
+      }
+    }
+
+    for (size_t i = 0; i < buffer_valid_len_; i++) {
+      // The capacity check has to count the characters copied before the
+      // last refill (ret_len) as well; testing i alone allowed a line that
+      // spans a refill to be written past the end of read_buf.
+      if (ret_len + i == max_len - 1 ||
+          buffer_[buffer_next_pos_ + i] == (char16)'\n') {
+        if (ret_len + i > 0 && read_buf[ret_len + i - 1] == (char16)'\r') {
+          read_buf[ret_len + i - 1] = (char16)'\0';
+        } else {
+          read_buf[ret_len + i] = (char16)'\0';
+        }
+
+        // The character at the cut position is consumed together with the
+        // i characters copied before it.
+        i++;
+        buffer_next_pos_ += i;
+        buffer_valid_len_ -= i;
+        if (buffer_next_pos_ == buffer_total_len_) {
+          buffer_next_pos_ = 0;
+          buffer_valid_len_ = 0;
+        }
+        return read_buf;
+      } else {
+        read_buf[ret_len + i] = buffer_[buffer_next_pos_ + i];
+      }
+    }
+
+    ret_len += buffer_valid_len_;
+    buffer_valid_len_ = 0;
+  } while (true);
+
+  // Never reach here
+  return NULL;
+}
+
+// Closes the file and frees the read buffer. Always returns true.
+bool Utf16Reader::close() {
+  if (NULL != fp_)
+    fclose(fp_);
+  fp_ = NULL;
+
+  if (NULL != buffer_)
+    delete [] buffer_;
+  buffer_ = NULL;
+
+  // No buffer is left, so nothing may be reported as buffered any more.
+  buffer_total_len_ = 0;
+  buffer_next_pos_ = 0;
+  buffer_valid_len_ = 0;
+  return true;
+}
+}  // namespace ime_pinyin
diff --git a/googlepinyin/utf16reader.h b/googlepinyin/utf16reader.h
new file mode 100644
index 0000000..b6d6719
--- /dev/null
+++ b/googlepinyin/utf16reader.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_UTF16READER_H__
+#define PINYINIME_INCLUDE_UTF16READER_H__
+
+#include <stdio.h>
+#include "./utf16char.h"
+
+namespace ime_pinyin {
+
+class Utf16Reader {
+ private:
+  FILE *fp_;
+  char16 *buffer_;
+  size_t buffer_total_len_;
+  size_t buffer_next_pos_;
+
+  // Always less than buffer_total_len_ - buffer_next_pos_
+  size_t buffer_valid_len_;
+
+ public:
+  Utf16Reader();
+  ~Utf16Reader();
+
+  // filename is the name of the file to open.
+ // buffer_len specifies how long buffer should be allocated to speed up the + // future reading + bool open(const char* filename, size_t buffer_len); + char16* readline(char16* read_buf, size_t max_len); + bool close(); +}; +} + +#endif // PINYINIME_INCLUDE_UTF16READER_H__ diff --git a/plugin/dict/dict_pinyin.dat b/plugin/dict/dict_pinyin.dat new file mode 100644 index 0000000..1be3f9c Binary files /dev/null and b/plugin/dict/dict_pinyin.dat differ diff --git a/plugin/googlepinyin/dictdef.h b/plugin/googlepinyin/dictdef.h new file mode 100644 index 0000000..5e1d781 --- /dev/null +++ b/plugin/googlepinyin/dictdef.h @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_DICTDEF_H__ +#define PINYINIME_INCLUDE_DICTDEF_H__ + +#include +#include "./utf16char.h" + +namespace ime_pinyin { + +// Enable the following line when building the binary dictionary model. +// #define ___BUILD_MODEL___ + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +typedef signed char int8; +typedef short int16; +typedef int int32; +typedef long long int64; +typedef unsigned long long uint64; + +const bool kPrintDebug0 = false; +const bool kPrintDebug1 = false; +const bool kPrintDebug2 = false; + +// The max length of a lemma. +const size_t kMaxLemmaSize = 8; + +// The max length of a Pinyin (spelling). 
+const size_t kMaxPinyinSize = 6; + +// The number of half spelling ids. For Chinese Pinyin, there 30 half ids. +// See SpellingTrie.h for details. +const size_t kHalfSpellingIdNum = 29; + +// The maximum number of full spellings. For Chinese Pinyin, there are only +// about 410 spellings. +// If change this value is bigger(needs more bits), please also update +// other structures like SpellingNode, to make sure than a spelling id can be +// stored. +// -1 is because that 0 is never used. +const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1; +const size_t kMaxSearchSteps = 40; + +// One character predicts its following characters. +const size_t kMaxPredictSize = (kMaxLemmaSize - 1); + +// LemmaIdType must always be size_t. +typedef size_t LemmaIdType; +const size_t kLemmaIdSize = 3; // Actually, a Id occupies 3 bytes in storage. +const size_t kLemmaIdComposing = 0xffffff; + +typedef uint16 LmaScoreType; +typedef uint16 KeyScoreType; + +// Number of items with highest score are kept for prediction purpose. +const size_t kTopScoreLemmaNum = 10; + +const size_t kMaxPredictNumByGt3 = 1; +const size_t kMaxPredictNumBy3 = 2; +const size_t kMaxPredictNumBy2 = 2; + +// The last lemma id (included) for the system dictionary. The system +// dictionary's ids always start from 1. +const LemmaIdType kSysDictIdEnd = 500000; + +// The first lemma id for the user dictionary. +const LemmaIdType kUserDictIdStart = 500001; + +// The last lemma id (included) for the user dictionary. 
+const LemmaIdType kUserDictIdEnd = 600000; + +typedef struct { + uint16 half_splid:5; + uint16 full_splid:11; +} SpellingId, *PSpellingId; + + +/** + * We use different node types for different layers + * Statistical data of the building result for a testing dictionary: + * root, level 0, level 1, level 2, level 3 + * max son num of one node: 406 280 41 2 - + * max homo num of one node: 0 90 23 2 2 + * total node num of a layer: 1 406 31766 13516 993 + * total homo num of a layer: 9 5674 44609 12667 995 + * + * The node number for root and level 0 won't be larger than 500 + * According to the information above, two kinds of nodes can be used; one for + * root and level 0, the other for these layers deeper than 0. + * + * LE = less and equal, + * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K + */ +struct LmaNodeLE0 { + uint32 son_1st_off; + uint32 homo_idx_buf_off; + uint16 spl_idx; + uint16 num_of_son; + uint16 num_of_homo; +}; + +/** + * GE = great and equal + * A node occupies 8 bytes. + */ +struct LmaNodeGE1 { + uint16 son_1st_off_l; // Low bits of the son_1st_off + uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1 + uint16 spl_idx; + unsigned char num_of_son; // number of son nodes + unsigned char num_of_homo; // number of homo words + unsigned char son_1st_off_h; // high bits of the son_1st_off + unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off +}; + +#ifdef ___BUILD_MODEL___ +struct SingleCharItem { + float freq; + char16 hz; + SpellingId splid; +}; + +struct LemmaEntry { + LemmaIdType idx_by_py; + LemmaIdType idx_by_hz; + char16 hanzi_str[kMaxLemmaSize + 1]; + + // The SingleCharItem id for each Hanzi. 
+ uint16 hanzi_scis_ids[kMaxLemmaSize]; + + uint16 spl_idx_arr[kMaxLemmaSize + 1]; + char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1]; + unsigned char hz_str_len; + float freq; +}; +#endif // ___BUILD_MODEL___ + +} // namespace ime_pinyin + +#endif // PINYINIME_INCLUDE_DICTDEF_H__ diff --git a/plugin/googlepinyin/libgooglepinyin.a b/plugin/googlepinyin/libgooglepinyin.a new file mode 100644 index 0000000..422216c Binary files /dev/null and b/plugin/googlepinyin/libgooglepinyin.a differ diff --git a/plugin/googlepinyin/pinyinime.h b/plugin/googlepinyin/pinyinime.h new file mode 100644 index 0000000..e376c20 --- /dev/null +++ b/plugin/googlepinyin/pinyinime.h @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_ANDPYIME_H__ +#define PINYINIME_INCLUDE_ANDPYIME_H__ + +#include +#include "./dictdef.h" + +#ifdef __cplusplus +extern "C" { +#endif + + namespace ime_pinyin { + + /** + * Open the decoder engine via the system and user dictionary file names. + * + * @param fn_sys_dict The file name of the system dictionary. + * @param fn_usr_dict The file name of the user dictionary. + * @return true if open the decoder engine successfully. + */ + bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict); + + /** + * Open the decoder engine via the system dictionary FD and user dictionary + * file name. 
Because on Android, the system dictionary is embedded in the + * whole application apk file. + * + * @param sys_fd The file in which the system dictionary is embedded. + * @param start_offset The starting position of the system dictionary in the + * file sys_fd. + * @param length The length of the system dictionary in the file sys_fd, + * counted in byte. + * @return true if succeed. + */ + bool im_open_decoder_fd(int sys_fd, long start_offset, long length, + const char *fn_usr_dict); + + /** + * Close the decoder engine. + */ + void im_close_decoder(); + + /** + * Set maximum limitations for decoding. If this function is not called, + * default values will be used. For example, due to screen size limitation, + * the UI engine of the IME can only show a certain number of letters(input) + * to decode, and a certain number of Chinese characters(output). If after + * user adds a new letter, the input or the output string is longer than the + * limitations, the engine will discard the recent letter. + * + * @param max_sps_len Maximum length of the spelling string(Pinyin string). + * @max_hzs_len Maximum length of the decoded Chinese character string. + */ + void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len); + + /** + * Flush cached data to persistent memory. Because at runtime, in order to + * achieve best performance, some data is only store in memory. + */ + void im_flush_cache(); + + /** + * Use a spelling string(Pinyin string) to search. The engine will try to do + * an incremental search based on its previous search result, so if the new + * string has the same prefix with the previous one stored in the decoder, + * the decoder will only continue the search from the end of the prefix. + * If the caller needs to do a brand new search, please call im_reset_search() + * first. Calling im_search() is equivalent to calling im_add_letter() one by + * one. + * + * @param sps_buf The spelling string buffer to decode. 
+ * @param sps_len The length of the spelling string buffer. + * @return The number of candidates. + */ + size_t im_search(const char* sps_buf, size_t sps_len); + + /** + * Make a delete operation in the current search result, and make research if + * necessary. + * + * @param pos The posistion of char in spelling string to delete, or the + * position of spelling id in result string to delete. + * @param is_pos_in_splid Indicate whether the pos parameter is the position + * in the spelling string, or the position in the result spelling id string. + * @return The number of candidates. + */ + size_t im_delsearch(size_t pos, bool is_pos_in_splid, + bool clear_fixed_this_step); + + /** + * Reset the previous search result. + */ + void im_reset_search(); + + /** + * Add a Pinyin letter to the current spelling string kept by decoder. If the + * decoder fails in adding the letter, it will do nothing. im_get_sps_str() + * can be used to get the spelling string kept by decoder currently. + * + * @param ch The letter to add. + * @return The number of candidates. + */ + size_t im_add_letter(char ch); + + /** + * Get the spelling string kept by the decoder. + * + * @param decoded_len Used to return how many characters in the spelling + * string is successfully parsed. + * @return The spelling string kept by the decoder. + */ + const char *im_get_sps_str(size_t *decoded_len); + + /** + * Get a candidate(or choice) string. + * + * @param cand_id The id to get a candidate. Started from 0. Usually, id 0 + * is a sentence-level candidate. + * @param cand_str The buffer to store the candidate. + * @param max_len The maximum length of the buffer. + * @return cand_str if succeeds, otherwise NULL. + */ + char16* im_get_candidate(size_t cand_id, char16* cand_str, + size_t max_len); + + /** + * Get the segmentation information(the starting positions) of the spelling + * string. + * + * @param spl_start Used to return the starting posistions. + * @return The number of spelling ids. 
If it is L, there will be L+1 valid + * elements in spl_start, and spl_start[L] is the posistion after the end of + * the last spelling id. + */ + size_t im_get_spl_start_pos(const uint16 *&spl_start); + + /** + * Choose a candidate and make it fixed. If the candidate does not match + * the end of all spelling ids, new candidates will be provided from the + * first unfixed position. If the candidate matches the end of the all + * spelling ids, there will be only one new candidates, or the whole fixed + * sentence. + * + * @param cand_id The id of candidate to select and make it fixed. + * @return The number of candidates. If after the selection, the whole result + * string has been fixed, there will be only one candidate. + */ + size_t im_choose(size_t cand_id); + + /** + * Cancel the last selection, or revert the last operation of im_choose(). + * + * @return The number of candidates. + */ + size_t im_cancel_last_choice(); + + /** + * Get the number of fixed spelling ids, or Chinese characters. + * + * @return The number of fixed spelling ids, of Chinese characters. + */ + size_t im_get_fixed_len(); + + /** + * Cancel the input state and reset the search workspace. + */ + bool im_cancel_input(); + + /** + * Get prediction candiates based on the given fixed Chinese string as the + * history. + * + * @param his_buf The history buffer to do the prediction. It should be ended + * with '\0'. + * @param pre_buf Used to return prediction result list. + * @return The number of predicted result string. + */ + size_t im_get_predicts(const char16 *his_buf, + char16 (*&pre_buf)[kMaxPredictSize + 1]); + + /** + * Enable Shengmus in ShouZiMu mode. + */ + void im_enable_shm_as_szm(bool enable); + + /** + * Enable Yunmus in ShouZiMu mode. + */ + void im_enable_ym_as_szm(bool enable); + + /** + * Initializes or uninitializes the user dictionary. + * + * @param fn_usr_dict The file name of the user dictionary. 
+ */ + void im_init_user_dictionary(const char *fn_usr_dict); + + /** + * Returns the current status of user dictinary. + */ + bool im_is_user_dictionary_enabled(void); +} + +#ifdef __cplusplus +} +#endif + +#endif // PINYINIME_INCLUDE_ANDPYIME_H__ diff --git a/plugin/googlepinyin/utf16char.h b/plugin/googlepinyin/utf16char.h new file mode 100644 index 0000000..7e957db --- /dev/null +++ b/plugin/googlepinyin/utf16char.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PINYINIME_INCLUDE_UTF16CHAR_H__ +#define PINYINIME_INCLUDE_UTF16CHAR_H__ + +#include + +namespace ime_pinyin { + +#ifdef __cplusplus +extern "C" { +#endif + + typedef unsigned short char16; + + // Get a token from utf16_str, + // Returned pointer is a '\0'-terminated utf16 string, or NULL + // *utf16_str_next returns the next part of the string for further tokenizing + char16* utf16_strtok(char16 *utf16_str, size_t *token_size, + char16 **utf16_str_next); + + int utf16_atoi(const char16 *utf16_str); + + float utf16_atof(const char16 *utf16_str); + + size_t utf16_strlen(const char16 *utf16_str); + + int utf16_strcmp(const char16 *str1, const char16 *str2); + int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size); + + char16* utf16_strcpy(char16 *dst, const char16 *src); + char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size); + + + char* utf16_strcpy_tochar(char *dst, const char16 *src); + +#ifdef __cplusplus +} +#endif +} + +#endif // PINYINIME_INCLUDE_UTF16CHAR_H__ diff --git a/plugin/keyboardform.cpp b/plugin/keyboardform.cpp new file mode 100644 index 0000000..4ac722b --- /dev/null +++ b/plugin/keyboardform.cpp @@ -0,0 +1,557 @@ +#include "keyboardform.h" +#include +#include +#include +#include +#include +#include +#include +#include "pinyinime.h" + +using namespace ime_pinyin; + +#define chinesecharacters_number 7 +const char *keyboard_characters = "qwertyuiopasdfghjklzxcvbnm,.?"; +const QString keyboard_symbols[] = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "0", + "@", "#", "_", "\"", "“", "”", ",", ",", ".", "。", + ";", ";", ":", ":", "'", "’", "、", "!", "!", + "~", "~", "+", "-", "*", "/", "=", "÷", "×", "√", + "`", "?", "^", "&&", "%", "|", "(", ")", "(", ")", + "[", "]", "【", "】", "{", "}", "<", ">", "《", + "》", "$", "€", "£", "¢", "¥", "§", "—", "/", "\", + "·", "……", "——", "→", "←", "↑", "↓", "■", "□", "●", + "○", "『", "』", "「", "」", "★", "☆", "◆", "◇"}; //29*3 + +KeyboardForm::KeyboardForm(QWidget *parent) 
+ : QWidget(parent) +{ + character_btns_list.clear(); + current_mode = InputMode::en; + upper_mode = false; + m_symbol_page = 0; + + this->setFixedSize(800,250); + int keyboard_btn_width = this->width()/11.5; + int keyboard_btn_height = this->height()/5.0; + + //设置主窗体样式 + this->setAttribute(Qt::WA_TranslucentBackground); + this->setWindowFlags(Qt::Tool | \ + Qt::FramelessWindowHint | \ + Qt::WindowStaysOnTopHint | \ + Qt::WindowDoesNotAcceptFocus); + + //加载QSS样式表 + QFile qss(":/styles/res/stylesheet.qss"); + if(false == qss.open(QFile::ReadOnly))return; + this->setStyleSheet(qss.readAll()); + qss.close(); + + //图标字体 + int fontId = QFontDatabase::addApplicationFont(":/font/res/FontAwesome.otf"); + QString fontName = QFontDatabase::applicationFontFamilies(fontId).at(0); + QFont btnicofont(fontName); + btnicofont.setPixelSize(10); + + //单行布局 + QHBoxLayout *hb[6]; + for(int i=0; i<6; i++) + { + hb[i] = new QHBoxLayout(); + hb[i]->setMargin(0); + i == 1 ? hb[i]->setSpacing(2) : hb[i]->setSpacing(0); + } + + widget_pinyin = new QWidget(this); + widget_pinyin->setFixedHeight(keyboard_btn_height); + //拼音缓存 + m_label_pinyin = new QLabel(this); + m_label_pinyin->setFixedHeight(keyboard_btn_height*0.4); + hb[0]->addWidget(m_label_pinyin); + hb[0]->addStretch(1); + + //汉子缓存 + for(int i=0; isetFixedHeight(keyboard_btn_height*0.6); + btn->setFixedWidth(keyboard_btn_width); /* 增加翻页按钮宽度20200731 */ + hb[1]->addWidget(btn); + if(i != chinesecharacters_number - 1) hb[1]->addStretch(1); + if (i == 0 || i == chinesecharacters_number-1) + { + change_chinese_characters_page_list.append(btn); + btn->setSizePolicy(QSizePolicy::Fixed, QSizePolicy::Fixed); + btn->setFont(btnicofont); + btn->setText(i == 0 ? QString(QChar(0xf0d9)) : QString(QChar(0xf0da))); + btn->setObjectName("hanzichangepage"); + i == 0 ? 
\ + connect(btn, &QPushButton::clicked, this, &KeyboardForm::chineseCharactersUpdatePrevious) : + connect(btn, &QPushButton::clicked, this, &KeyboardForm::chineseCharactersUpdateNext); + } + else + { + chinese_characters_list.append(btn); + btn->setObjectName("hanzicandidates"); + connect(btn, &QPushButton::clicked, this, &KeyboardForm::chineseCharactersSelected); + } + } + + QVBoxLayout *vb_pinyin = new QVBoxLayout(widget_pinyin); + vb_pinyin->addLayout(hb[0]); + vb_pinyin->addLayout(hb[1]); + vb_pinyin->setMargin(0); + vb_pinyin->setSpacing(0); + + widget_keyboard = new QWidget(this); + widget_keyboard->setFixedHeight(keyboard_btn_height*4.0); + //键盘 + for(int i=0; i<29; i++) + { + QPushButton *btn = new QPushButton(QChar(keyboard_characters[i]),this); + btn->setFixedSize(keyboard_btn_width, keyboard_btn_height); + character_btns_list.append(btn); + connect(btn, &QPushButton::clicked, this, &KeyboardForm::characterButtonClicked); + } + //第一排字母:0-9 + for(int i=0; i<10; i++) + { + hb[2]->addWidget(character_btns_list.at(i)); + } + QPushButton *btn_backspace = new QPushButton(QChar(0xf060)); + btn_backspace->setFont(btnicofont); + btn_backspace->setFixedSize(keyboard_btn_width*1.5, keyboard_btn_height); + btn_backspace->setObjectName("function_button"); + hb[2]->addWidget(btn_backspace); + connect(btn_backspace, &QPushButton::clicked, this, &KeyboardForm::btnBackspaceClicked); + //第二排字母:10-18 + hb[3]->addStretch(1); + for(int i=10; i<19; i++) + { + hb[3]->addWidget(character_btns_list.at(i)); + } + QPushButton *btn_enter = new QPushButton("Enter"); + btn_enter->setFixedSize(keyboard_btn_width*1.5, keyboard_btn_height); + btn_enter->setObjectName("function_button"); + hb[3]->addWidget(btn_enter); + hb[3]->addStretch(1); + connect(btn_enter, &QPushButton::clicked, this, &KeyboardForm::btnEnterClicked); + //第三排字母:20-26 + QPushButton *btn_upper = new QPushButton(QChar(0xf062)); + btn_upper->setFixedSize(keyboard_btn_width*1.5, keyboard_btn_height); + 
btn_upper->setFont(btnicofont); + btn_upper->setObjectName("function_button"); + hb[4]->addWidget(btn_upper); + connect(btn_upper, &QPushButton::clicked, this, &KeyboardForm::btnUpperClicked); + for(int i=19; i<29; i++) + { + hb[4]->addWidget(character_btns_list.at(i)); + } + character_btns_list.append(btn_upper); + //第四排功能键 + QPushButton *btn_symbols = new QPushButton(".?123"); + btn_symbols->setFixedSize(keyboard_btn_width*1.5, keyboard_btn_height); + btn_symbols->setObjectName("function_button"); + hb[5]->addWidget(btn_symbols); + connect(btn_symbols, &QPushButton::clicked, this, &KeyboardForm::btnSymbolsClicked); + QPushButton *btn_language = new QPushButton(QChar(0xf0ac)); + btn_language->setFixedSize(keyboard_btn_width, keyboard_btn_height); + btn_language->setFont(btnicofont); + btn_language->setObjectName("function_button"); + hb[5]->addWidget(btn_language); + connect(btn_language, &QPushButton::clicked, this, &KeyboardForm::btnLanguageClicked); + QPushButton *btn_blankspace = new QPushButton("English"); + btn_blankspace->setFixedHeight(keyboard_btn_height); + hb[5]->addWidget(btn_blankspace); + character_btns_list.append(btn_blankspace); + connect(btn_blankspace, &QPushButton::clicked, this, &KeyboardForm::btnBlankspaceClicked); + QPushButton *btn_emoji = new QPushButton(QChar(0xf118)); + btn_emoji->setFixedSize(keyboard_btn_width, keyboard_btn_height); + btn_emoji->setFont(btnicofont); + btn_emoji->setObjectName("emoji"); + hb[5]->addWidget(btn_emoji); + connect(btn_emoji, &QPushButton::clicked, this, &KeyboardForm::btnEmojiClicked); + QPushButton *btn_hidekeyboard = new QPushButton(QString(QChar(0xf11c)).append(QChar(0xf103))); + btn_hidekeyboard->setFixedSize(keyboard_btn_width*1.5, keyboard_btn_height); + btn_hidekeyboard->setFont(btnicofont); + btn_hidekeyboard->setObjectName("function_button"); + hb[5]->addWidget(btn_hidekeyboard); + connect(btn_hidekeyboard, &QPushButton::clicked, this, &KeyboardForm::hideKeyboard); + + QVBoxLayout *vb_keyboard = 
new QVBoxLayout(widget_keyboard); + vb_keyboard->setMargin(0); + vb_keyboard->setSpacing(0); + for(int i=2; i<6; i++) + { + vb_keyboard->addLayout(hb[i]); + } + + QVBoxLayout *vb_system = new QVBoxLayout(this); + vb_system->setMargin(0); + vb_system->setSpacing(0); + vb_system->addStretch(1); + vb_system->addWidget(widget_pinyin); + vb_system->addWidget(widget_keyboard); + widget_pinyin->hide(); + + updateButtonStateOfChineseCharacters(); +} + +void KeyboardForm::updateButtonStateOfChineseCharacters() +{ + if(m_label_pinyin->text().isEmpty()) + { + m_label_pinyin->setHidden(true); + change_chinese_characters_page_list.at(0)->setHidden(true); + change_chinese_characters_page_list.at(1)->setHidden(true); + } + else + { + m_label_pinyin->setHidden(false); + change_chinese_characters_page_list.at(0)->setHidden(false); + change_chinese_characters_page_list.at(1)->setHidden(false); + } +} + +void KeyboardForm::chineseCharactersUpdatePrevious() +{ + searchChineseCharacters(-1); +} + +void KeyboardForm::chineseCharactersUpdateNext() +{ + searchChineseCharacters(1); +} + +void KeyboardForm::chineseCharactersSelected() +{ + emit sendKeyToFocusItem(((QPushButton*)sender())->text()); + clearChineseCache(); +} + +void KeyboardForm::btnBackspaceClicked() +{ + if(current_mode != InputMode::zh || m_label_pinyin->text().isEmpty()) + { + emit sendKeyToFocusItem("\x7F"); + } + else + { + m_label_pinyin->setText(m_label_pinyin->text().left(m_label_pinyin->text().length()-1)); + if(m_label_pinyin->text().isEmpty()) + { + clearChineseCache(); + } + else + { + searchChineseCharacters(0); + } + } +} + +void KeyboardForm::btnEnterClicked() +{ + if(current_mode != InputMode::zh || m_label_pinyin->text().isEmpty()) + { + emit sendKeyToFocusItem("\n"); + } + else + { + emit sendKeyToFocusItem(m_label_pinyin->text()); + clearChineseCache(); + } +} + +void KeyboardForm::btnUpperClicked() +{ + if(current_mode == InputMode::en) + { + upper_mode = !upper_mode; + } + else if(current_mode == 
InputMode::zh) + { + if(!m_label_pinyin->text().isEmpty() && m_label_pinyin->text().right(1).compare("'")) + { + m_label_pinyin->setText(m_label_pinyin->text().append("'")); + } + } + else + { + if(m_symbol_page == 0) + { + m_symbol_page = 1; + character_btns_list.at(character_btns_list.length()-2)->setText("2/3"); + } + else if(m_symbol_page == 1) + { + m_symbol_page = 2; + character_btns_list.at(character_btns_list.length()-2)->setText("3/3"); + } + else + { + m_symbol_page = 0; + character_btns_list.at(character_btns_list.length()-2)->setText("1/3"); + } + } + updateKeyboard(); +} + +void KeyboardForm::btnSymbolsClicked() +{ + if(current_mode != InputMode::symb) + { + widget_pinyin->setHidden(true); + if(current_mode == InputMode::en) + { + character_btns_list.at(character_btns_list.length()-1)->setText("Symbols"); + } + else if(current_mode == InputMode::zh) + { + character_btns_list.at(character_btns_list.length()-1)->setText("符号"); + } + ((QPushButton*)sender())->setText("abc"); + last_mode = current_mode; + current_mode = InputMode::symb; + character_btns_list.at(character_btns_list.length()-2)->setText("1/3"); + } + else + { + ((QPushButton*)sender())->setText(".?123"); + current_mode = last_mode; + m_symbol_page = 0; + } + upper_mode = false; + updateKeyboard(); +} + +void KeyboardForm::btnLanguageClicked() +{ + upper_mode = false; + if(current_mode == InputMode::zh) + { + current_mode = InputMode::en; + } + else if(current_mode == InputMode::en) + { + current_mode = InputMode::zh; + } + if(current_mode != InputMode::symb) + { + last_mode = current_mode; + updateKeyboard(); + } +} + +void KeyboardForm::clearChineseCache() +{ + m_label_pinyin->setText(""); + for(int i=0; isetText(""); + } + updateButtonStateOfChineseCharacters(); +} + +void KeyboardForm::hideKeyboard() +{ + clearChineseCache(); + this->hide(); +} + +void KeyboardForm::updateKeyboard() +{ + if(current_mode != InputMode::zh) + { + clearChineseCache(); + } + if(current_mode == InputMode::symb) 
+ { + character_btns_list.at(character_btns_list.length()-2)->setCheckable(false); + for(int i=0; i<29; i++) + { + character_btns_list.at(i)->setText(keyboard_symbols[i + m_symbol_page*29]); + } + } + else + { + if(true == upper_mode && current_mode == InputMode::en) + { + character_btns_list.at(character_btns_list.length()-2)->setCheckable(true); + character_btns_list.at(character_btns_list.length()-2)->setChecked(true); + for(int i=0; i<26; i++) + { + character_btns_list.at(i)->setText(QChar(keyboard_characters[i]).toUpper()); + } + } + else + { + for(int i=0; i<26; i++) + { + character_btns_list.at(i)->setText(QChar(keyboard_characters[i])); + } + } + if(current_mode == InputMode::en) + { + widget_pinyin->setHidden(true); + character_btns_list.at(character_btns_list.length()-5)->setText(","); + character_btns_list.at(character_btns_list.length()-4)->setText("."); + character_btns_list.at(character_btns_list.length()-3)->setText("?"); + character_btns_list.at(character_btns_list.length()-2)->setText(QChar(0xf062)); + character_btns_list.at(character_btns_list.length()-1)->setText("English"); + } + else if(current_mode == InputMode::zh) + { + character_btns_list.at(character_btns_list.length()-2)->setCheckable(false); + widget_pinyin->setHidden(false); + character_btns_list.at(character_btns_list.length()-5)->setText(","); + character_btns_list.at(character_btns_list.length()-4)->setText("。"); + character_btns_list.at(character_btns_list.length()-3)->setText("?"); + character_btns_list.at(character_btns_list.length()-2)->setText("分词"); + character_btns_list.at(character_btns_list.length()-1)->setText("拼音"); + } + } +} + +void KeyboardForm::btnBlankspaceClicked() +{ + if(current_mode != InputMode::zh || m_label_pinyin->text().isEmpty()) + { + emit sendKeyToFocusItem(" "); + } + else + { + emit sendKeyToFocusItem(chinese_characters_list.at(0)->text()); + clearChineseCache(); + } +} + +void KeyboardForm::btnEmojiClicked() +{ + emit sendKeyToFocusItem("::)"); +} + 
+// Slot for the 29 character keys.
+// Outside Chinese mode the key text is sent to the focus item unchanged. In
+// Chinese mode the three punctuation keys are sent directly, while any other
+// key is appended to the pending pinyin (up to 15 characters) and the
+// candidate row is refreshed.
+void KeyboardForm::characterButtonClicked()
+{
+    // The slot is only meaningful when triggered by one of the key buttons.
+    QPushButton *key = qobject_cast<QPushButton*>(sender());
+    if(!key) return;
+    const QString key_text = key->text();
+
+    if(current_mode != InputMode::zh)
+    {
+        emit sendKeyToFocusItem(key_text);
+        return;
+    }
+
+    if(key_text == "," || key_text == "。" || key_text == "?")
+    {
+        emit sendKeyToFocusItem(key_text);
+        return;
+    }
+
+    if(m_label_pinyin->text().length()<15)
+    {
+        m_label_pinyin->setText(m_label_pinyin->text() + key_text);
+        searchChineseCharacters(0);
+        updateButtonStateOfChineseCharacters();
+    }
+}
+
+// Decodes the pending pinyin and fills the candidate buttons.
+//
+// currentpage selects how the visible window over the candidate list moves:
+// 1 shifts it forward by one candidate, -1 shifts it back by one candidate,
+// any other value resets it to the first candidate.
+void KeyboardForm::searchChineseCharacters(const int &currentpage)
+{
+    const int max_spelling_length = 32;
+    const int max_decoded_length = 32;
+    const size_t max_single_hanzi = 20;
+    // Index of the first visible candidate; kept across calls for paging.
+    static unsigned int page_change_times = 0;
+    // The decoder loads its dictionaries from disk, so it is opened once and
+    // reused instead of being re-opened on every keystroke. A failed open is
+    // retried on the next call.
+    static bool decoder_ready = false;
+
+    if(!decoder_ready)
+    {
+        const QString app_dir(qApp->applicationDirPath()+"/dict");
+        decoder_ready = im_open_decoder(QString("%1/dict_pinyin.dat").arg(app_dir).toLocal8Bit().data(),
+                                        QString("%1/dict_pinyin_user.dat").arg(app_dir).toLocal8Bit().data());
+    }
+
+    size_t cand_num = 0;
+    size_t decode_len = 0;
+    if(decoder_ready)
+    {
+        im_set_max_lens(max_spelling_length, max_decoded_length);
+        im_reset_search();
+
+        QByteArray bytearray(m_label_pinyin->text().toUtf8());
+        cand_num = im_search(bytearray.data(), bytearray.size());
+        im_get_sps_str(&decode_len);
+    }
+
+    // Scratch buffer for one candidate; a fixed-size local array needs no
+    // new[]/delete[] pairing.
+    char16 cand_buf[max_decoded_length];
+
+    if (decode_len == 1)
+    {
+        if (cand_num > 10) cand_num = 10;
+    }
+    else
+    {
+        size_t single = 0;
+        size_t multi = 0;
+        for(size_t i = 0; i < cand_num; i++)
+        {
+            const char16 *cand = im_get_candidate(i, cand_buf, max_decoded_length);
+            // A candidate counts as "multi" when it holds at least two UTF-16
+            // units. The units are inspected directly: a byte-wise strlen()
+            // stops at the first zero byte of any unit.
+            if (cand && cand[0] != 0 && cand[1] != 0)
+            {
+                multi++;
+            }
+            else
+            {
+                single++;
+                if (single > max_single_hanzi) break;
+            }
+        }
+        cand_num = multi + single;
+    }
+
+    // Largest valid window offset; 0 when every candidate fits on the row.
+    // Computed explicitly because cand_num - visible would wrap around for
+    // unsigned operands when there are fewer candidates than buttons.
+    const size_t visible = static_cast<size_t>(chinese_characters_list.length());
+    const size_t max_offset = cand_num > visible ? cand_num - visible : 0;
+
+    switch(currentpage)
+    {
+    case 1:
+        if(page_change_times < max_offset) page_change_times++;
+        break;
+    case -1:
+        if(page_change_times > 0) page_change_times--;
+        break;
+    default:
+        page_change_times = 0;
+        break;
+    }
+    if(page_change_times > max_offset) page_change_times = max_offset;
+
+    change_chinese_characters_page_list.at(0)->setEnabled(page_change_times > 0);
+    change_chinese_characters_page_list.at(1)->setEnabled(page_change_times < max_offset);
+
+    for (int slot = 0; slot < chinese_characters_list.length(); slot++)
+    {
+        const size_t cand_index = page_change_times + static_cast<size_t>(slot);
+        // Buttons without a candidate are blanked so that text from an
+        // earlier, longer result list does not stay on screen.
+        QString cand_str;
+        if (cand_index < cand_num)
+        {
+            const char16 *cand = im_get_candidate(cand_index, cand_buf, max_decoded_length);
+            if (cand)
+            {
+                cand_str = QString::fromUtf16(cand);
+                if (cand_index == 0) cand_str.remove(0, im_get_fixed_len());
+            }
+        }
+        chinese_characters_list.at(slot)->setText(cand_str);
+    }
+}
diff --git a/plugin/keyboardform.h b/plugin/keyboardform.h
new file mode 100644
index 0000000..d5f531c
--- /dev/null
+++ b/plugin/keyboardform.h
@@ -0,0 +1,45 @@
+#ifndef KEYBOARDFORM_H
+#define KEYBOARDFORM_H
+
+#include <QWidget>
+class QPushButton;
+class QLabel;
+
+class KeyboardForm : public QWidget
+{
+    Q_OBJECT
+
+public:
+    KeyboardForm(QWidget *parent = 0);
+    void clearChineseCache();
+
+private:
+    void chineseCharactersUpdatePrevious();
+    void chineseCharactersUpdateNext();
+    void chineseCharactersSelected();
+    void btnBackspaceClicked();
+    void btnEnterClicked();
+    void btnUpperClicked();
+    void btnSymbolsClicked();
+    void btnLanguageClicked();
+    void btnBlankspaceClicked();
+    void btnEmojiClicked();
+    void characterButtonClicked();
+    void updateKeyboard();
+    void updateButtonStateOfChineseCharacters();
+    void searchChineseCharacters(const int &currentpage);
+    void hideKeyboard();
+
+    QList<QPushButton *> character_btns_list, chinese_characters_list, change_chinese_characters_page_list;
+    enum InputMode{zh, en, symb};
+    InputMode current_mode, last_mode;
+    QWidget *widget_keyboard, *widget_pinyin;
+    bool upper_mode;
+    QLabel *m_label_pinyin;
+    int m_symbol_page;
+
+signals:
+    void sendKeyToFocusItem(const QString &keytext);
+};
+
+#endif // KEYBOARDFORM_H
diff --git a/plugin/plugin.pro b/plugin/plugin.pro
new file mode 100644
index 0000000..100b96a
--- /dev/null
+++ b/plugin/plugin.pro
@@ -0,0 +1,39 @@
+QT = core gui-private widgets
+
+TEMPLATE = lib
+TARGET = tgtsmlInputContextPlugin
+
+HEADERS += \
+    tgtsmlplatforminputcontextplugin.h \
+    tgtsmlplatforminputcontext.h \
+    keyboardform.h
+
+SOURCES += \
+    tgtsmlplatforminputcontextplugin.cpp \
+    tgtsmlplatforminputcontext.cpp \
+    keyboardform.cpp
+
+RESOURCES += \
+    res.qrc
+
+INCLUDEPATH += $$PWD/googlepinyin
+
+win32{
+    CONFIG += debug_and_release build_all
+
+    CONFIG(debug, debug|release){
+        TARGET = ../../testWindow/debug/platformInputContexts/$$join(TARGET,,,d)
+        LIBS += -L$$PWD/googlepinyin/ -lgooglepinyind
+    }CONFIG(release, debug|release){
+        TARGET = ../../testWindow/release/platformInputContexts/$$TARGET
+        LIBS += -L$$PWD/googlepinyin/ -lgooglepinyin
+    }
+}
+unix{
+    TARGET = ../target/$$TARGET
+    LIBS += -L$$PWD/googlepinyin/ -lgooglepinyin
+    MOC_DIR = ../tmpfiles
+    RCC_DIR = ../tmpfiles
+    UI_DIR = ../tmpfiles
+    OBJECTS_DIR = ../tmpfiles
+}
diff --git a/plugin/res.qrc b/plugin/res.qrc
new file mode 100644
index 0000000..3e2432d
--- /dev/null
+++ b/plugin/res.qrc
@@ -0,0 +1,8 @@
+<RCC>
+    <qresource prefix="/styles">
+        <file>res/stylesheet.qss</file>
+    </qresource>
+    <qresource prefix="/font">
+        <file>res/FontAwesome.otf</file>
+    </qresource>
+</RCC>
diff --git a/plugin/res/FontAwesome.otf b/plugin/res/FontAwesome.otf
new file mode 100644
index 0000000..401ec0f
Binary files /dev/null and b/plugin/res/FontAwesome.otf differ
diff --git a/plugin/res/stylesheet.qss b/plugin/res/stylesheet.qss
new file mode
100644
index 0000000..8625f77
--- /dev/null
+++ b/plugin/res/stylesheet.qss
@@ -0,0 +1,68 @@
+QWidget{
+    background-color:black;
+}
+
+QPushButton{
+    font:25px;
+    background-color:gray;
+    color:white;
+    border:2px solid black;
+    border-radius:10px;
+}
+
+QPushButton#function_button{
+    background-color:rgb(80,80,80);
+    color:rgb(200,200,200);
+    font:20px;
+}
+
+QPushButton:hover{
+    background-color:green;
+    color:white;
+}
+
+/* Single colon: ":pressed" is a pseudo-state; "::" would name a sub-control. */
+QPushButton:pressed,
+QPushButton#function_button:pressed{
+    background-color:red;
+    color:white;
+}
+
+QPushButton#function_button:checked{
+    background-color:rgb(200,100,50);
+    color:yellow;
+}
+
+QPushButton#hanzichangepage,
+QPushButton#hanzicandidates{
+    background-color:transparent;
+    color:white;
+}
+
+QPushButton#hanzichangepage:pressed{
+    color:red;
+}
+
+QPushButton#hanzichangepage:disabled{
+    color:gray;
+}
+
+QPushButton#emoji{
+    background-color:rgb(80,80,80);
+    color:yellow;
+}
+
+QPushButton#emoji:hover{
+    color:red;
+}
+
+QPushButton#emoji:pressed{
+    color:blue;
+}
+
+QLabel{
+    background-color:rgba(255,255,255,80);
+    border-radius:2px;
+    color:white;
+    font:15px;
+    margin-left: 5px;
+}
diff --git a/plugin/res/tgtsml.json b/plugin/res/tgtsml.json
new file mode 100644
index 0000000..9ff3862
--- /dev/null
+++ b/plugin/res/tgtsml.json
@@ -0,0 +1,3 @@
+{
+    "Keys": [ "tgtsml" ]
+}
diff --git a/plugin/tgtsmlplatforminputcontext.cpp b/plugin/tgtsmlplatforminputcontext.cpp
new file mode 100644
index 0000000..607a2af
--- /dev/null
+++ b/plugin/tgtsmlplatforminputcontext.cpp
@@ -0,0 +1,99 @@
+#include "tgtsmlplatforminputcontext.h"
+// NOTE(review): the four include targets were missing from this copy; these
+// are the headers the names used below require — confirm against the original.
+#include <QApplication>
+#include <QDesktopWidget>
+#include <QKeyEvent>
+#include <QWidget>
+
+// Delivers a key press followed by the matching release to receiver.
+// The events live on the stack because sendEvent() does not take ownership;
+// allocating them with new would leak both on every key.
+static void sendKeyPressAndRelease(QObject *receiver, int key, const QString &text = QString())
+{
+    QKeyEvent press(QEvent::KeyPress, key, Qt::NoModifier, text);
+    QKeyEvent release(QEvent::KeyRelease, key, Qt::NoModifier, text);
+    QCoreApplication::sendEvent(receiver, &press);
+    QCoreApplication::sendEvent(receiver, &release);
+}
+
+// The keyboard is created on the first showInputPanel() call, not here.
+TgtsmlPlatformInputContext::TgtsmlPlatformInputContext()
+    : m_keyboard(nullptr)
+    , m_focusitem(nullptr)
+{
+}
+
+TgtsmlPlatformInputContext::~TgtsmlPlatformInputContext()
+{
+    // m_keyboard may still be null here. Deleting null is a no-op, and
+    // destroying a QObject removes its connections, so no disconnect() call.
+    delete m_keyboard;
+}
+
+/**
+ * Turns the text emitted by the keyboard into key events for the focus object.
+ *
+ * "\x7F" is sent as Backspace, "\n" as Return, "&&" as a single "&"; any other
+ * text is sent as-is with key code 0. Does nothing without a focus object.
+ */
+void TgtsmlPlatformInputContext::sendKeyToFocusItem(const QString &keytext)
+{
+    if(!m_focusitem)return;
+
+    if(keytext == QString("\x7F")) //Backspace <--
+    {
+        sendKeyPressAndRelease(m_focusitem, Qt::Key_Backspace);
+    }
+    else if(keytext == QString("\n"))
+    {
+        sendKeyPressAndRelease(m_focusitem, Qt::Key_Return);
+    }
+    else if(keytext == QString("&&"))
+    {
+        // NOTE(review): presumably "&&" is the button label used to display a
+        // literal '&' — confirm against the keyboard layout.
+        sendKeyPressAndRelease(m_focusitem, 0, "&");
+    }
+    else
+    {
+        sendKeyPressAndRelease(m_focusitem, 0, keytext);
+    }
+}
+
+bool TgtsmlPlatformInputContext::isValid() const
+{
+    return true;
+}
+
+void TgtsmlPlatformInputContext::setFocusObject(QObject *object)
+{
+    m_focusitem = object;
+}
+
+/**
+ * Creates the keyboard on first use, shows it and, when the focus object is a
+ * widget, moves it next to that widget.
+ */
+void TgtsmlPlatformInputContext::showInputPanel()
+{
+    if(!m_keyboard){
+        m_keyboard = new KeyboardForm;
+        connect(m_keyboard, &KeyboardForm::sendKeyToFocusItem, this, &TgtsmlPlatformInputContext::sendKeyToFocusItem);
+    }
+    if(m_keyboard->isHidden())m_keyboard->show();
+
+    QWidget *focus_widget = qobject_cast<QWidget *>(m_focusitem);
+    if(!focus_widget){
+        return;
+    }
+
+    const int gap_below = 30;   // offset used when the keyboard goes under the widget
+    const int gap_above = 10;   // extra offset used when it has to go above instead
+    const int screen_width = qApp->desktop()->width();
+    const int screen_height = qApp->desktop()->height();
+
+    QPoint pos = focus_widget->mapToGlobal(QPoint(0, 0));
+    if(pos.x() < 0){
+        pos.setX(0);
+    }
+    if(pos.y() < 0){
+        pos.setY(0);
+    }
+    // Shift left when the keyboard would run past the right edge.
+    if(screen_width - pos.x() < m_keyboard->width()){
+        pos.setX(screen_width - m_keyboard->width());
+    }
+    // Place it above the widget when there is not enough room below.
+    if(screen_height - pos.y() - gap_below < m_keyboard->height()){
+        pos.setY(pos.y() - m_keyboard->height() - gap_above);
+    }
+    else{
+        pos += QPoint(0, gap_below);
+    }
+    m_keyboard->move(pos);
+}
+
+void TgtsmlPlatformInputContext::hideInputPanel()
+{
+    if(!m_keyboard){
+        return;
+    }
+    if(!m_keyboard->isHidden()){
+        m_keyboard->hide();
+    }
+    m_keyboard->clearChineseCache();
+}
+
+bool TgtsmlPlatformInputContext::isInputPanelVisible() const
+{
+    // The keyboard does not exist until showInputPanel() has run once.
+    return m_keyboard && m_keyboard->isVisible();
+}
+
diff --git a/plugin/tgtsmlplatforminputcontext.h b/plugin/tgtsmlplatforminputcontext.h
new file mode 100644
index 0000000..8de6efc
--- /dev/null
+++ b/plugin/tgtsmlplatforminputcontext.h
@@ -0,0 +1,27 @@
+#ifndef TGTSMLPLATFORMINPUTCONTEXT_H
+#define TGTSMLPLATFORMINPUTCONTEXT_H
+
+// NOTE(review): the include target was missing from this copy; this is the
+// header that declares QPlatformInputContext — confirm against the original.
+#include <qpa/qplatforminputcontext.h>
+#include "keyboardform.h"
+
+/**
+ * Platform input context that shows a KeyboardForm as the input panel and
+ * forwards its keys to the current focus object.
+ */
+class TgtsmlPlatformInputContext : public QPlatformInputContext
+{
+    Q_OBJECT
+public:
+    TgtsmlPlatformInputContext();
+    ~TgtsmlPlatformInputContext();
+
+    bool isValid() const Q_DECL_OVERRIDE;
+    void setFocusObject(QObject *object) Q_DECL_OVERRIDE;
+    void showInputPanel() Q_DECL_OVERRIDE;
+    void hideInputPanel() Q_DECL_OVERRIDE;
+    bool isInputPanelVisible() const Q_DECL_OVERRIDE;
+
+private:
+    // Connected to KeyboardForm::sendKeyToFocusItem.
+    void sendKeyToFocusItem(const QString &keytext);
+
+    KeyboardForm *m_keyboard;   // null until the panel is first shown
+    QObject *m_focusitem;       // last object passed to setFocusObject()
+};
+
+#endif // TGTSMLPLATFORMINPUTCONTEXT_H
diff --git a/plugin/tgtsmlplatforminputcontextplugin.cpp b/plugin/tgtsmlplatforminputcontextplugin.cpp
new file mode 100644
index 0000000..6daefae
--- /dev/null
+++ b/plugin/tgtsmlplatforminputcontextplugin.cpp
@@ -0,0 +1,12 @@
+#include "tgtsmlplatforminputcontextplugin.h"
+
+// Returns a new input context when key is "tgtsml" (case-insensitive),
+// otherwise nullptr. paramlist is not used.
+TgtsmlPlatformInputContext * TgtsmlPlatformInputContextPlugin::create(const QString &key, const QStringList &paramlist)
+{
+    Q_UNUSED(paramlist)
+
+    if(key.compare("tgtsml",Qt::CaseInsensitive) == 0)
+    {
+        return new TgtsmlPlatformInputContext;
+    }
+    return nullptr;
+}
diff --git
a/plugin/tgtsmlplatforminputcontextplugin.h b/plugin/tgtsmlplatforminputcontextplugin.h
new file mode 100644
index 0000000..21023b4
--- /dev/null
+++ b/plugin/tgtsmlplatforminputcontextplugin.h
@@ -0,0 +1,16 @@
+#ifndef TGTSMLPLATFORMINPUTCONTEXTPLUGIN_H
+#define TGTSMLPLATFORMINPUTCONTEXTPLUGIN_H
+
+// NOTE(review): the include target was missing from this copy; this is the
+// header that declares QPlatformInputContextPlugin — confirm against the original.
+#include <qpa/qplatforminputcontextplugin_p.h>
+#include "tgtsmlplatforminputcontext.h"
+
+/**
+ * Plugin entry point; its metadata is read from ./res/tgtsml.json.
+ */
+class TgtsmlPlatformInputContextPlugin : public QPlatformInputContextPlugin
+{
+    Q_OBJECT
+    Q_PLUGIN_METADATA(IID QPlatformInputContextFactoryInterface_iid FILE "./res/tgtsml.json")
+
+public:
+    // Factory called with the requested input-method key.
+    TgtsmlPlatformInputContext *create(const QString &key, const QStringList &paramlist) Q_DECL_OVERRIDE;
+};
+
+#endif // TGTSMLPLATFORMINPUTCONTEXTPLUGIN_H
diff --git a/testWindow/main.cpp b/testWindow/main.cpp
new file mode 100644
index 0000000..b0ce345
--- /dev/null
+++ b/testWindow/main.cpp
@@ -0,0 +1,13 @@
+#include "mainwindow.h"
+
+// NOTE(review): the include target was missing from this copy; QApplication is
+// the only Qt class used in this file — confirm against the original.
+#include <QApplication>
+
+// Test application: shows the main window and runs the event loop.
+int main(int argc, char *argv[])
+{
+    // Input-method selection is left disabled here; it has to be set before
+    // the QApplication object is constructed.
+    //qputenv("QT_IM_MODULE", QByteArray("tgtsml"));
+    //qputenv("QT_IM_MODULE", QByteArray("qtvirtualkeyboard"));
+    QApplication a(argc, argv);
+    MainWindow w;
+    w.show();
+    return a.exec();
+}
diff --git a/testWindow/mainwindow.cpp b/testWindow/mainwindow.cpp
new file mode 100644
index 0000000..41a26bd
--- /dev/null
+++ b/testWindow/mainwindow.cpp
@@ -0,0 +1,15 @@
+#include "mainwindow.h"
+#include "ui_mainwindow.h"
+
+// Builds the widgets described by the generated Ui::MainWindow class.
+MainWindow::MainWindow(QWidget *parent)
+    : QMainWindow(parent)
+    , ui(new Ui::MainWindow)
+{
+    ui->setupUi(this);
+}
+
+MainWindow::~MainWindow()
+{
+    delete ui;
+}
+
diff --git a/testWindow/mainwindow.h b/testWindow/mainwindow.h
new file mode 100644
index 0000000..4643e32
--- /dev/null
+++ b/testWindow/mainwindow.h
@@ -0,0 +1,21 @@
+#ifndef MAINWINDOW_H
+#define MAINWINDOW_H
+
+// NOTE(review): the include target was missing from this copy; QMainWindow is
+// the base class declared below — confirm against the original.
+#include <QMainWindow>
+
+QT_BEGIN_NAMESPACE
+namespace Ui { class MainWindow; }
+QT_END_NAMESPACE
+
+// Main window of the test application.
+class MainWindow : public QMainWindow
+{
+    Q_OBJECT
+
+public:
+    MainWindow(QWidget *parent = nullptr);
+    ~MainWindow();
+
+private:
+    Ui::MainWindow *ui;   // owned; released in the destructor
+};
+#endif // MAINWINDOW_H
diff
--git a/testWindow/mainwindow.ui b/testWindow/mainwindow.ui new file mode 100644 index 0000000..b2a6ad7 --- /dev/null +++ b/testWindow/mainwindow.ui @@ -0,0 +1,158 @@ + + + MainWindow + + + + 0 + 0 + 800 + 600 + + + + MainWindow + + + + + + + + + + + + + + + + + + + PushButton + + + + + + + + + + + + + + + + PushButton + + + + + + + PushButton + + + + + + + + + + + + + + + + + + + + + + + + + PushButton + + + + + + + + + + + + + PushButton + + + + + + + + + + + + + + + + + + + + + + + + + PushButton + + + + + + + + + + PushButton + + + + + + + + + + + + + + + + + + 0 + 0 + 800 + 23 + + + + + + + + diff --git a/testWindow/testWindow.pro b/testWindow/testWindow.pro new file mode 100644 index 0000000..c7c3bd0 --- /dev/null +++ b/testWindow/testWindow.pro @@ -0,0 +1,34 @@ +QT += core gui + +greaterThan(QT_MAJOR_VERSION, 4): QT += widgets + +CONFIG += c++11 + +# The following define makes your compiler emit warnings if you use +# any Qt feature that has been marked deprecated (the exact warnings +# depend on your compiler). Please consult the documentation of the +# deprecated API in order to know how to port your code away from it. +DEFINES += QT_DEPRECATED_WARNINGS + +# You can also make your code fail to compile if it uses deprecated APIs. +# In order to do so, uncomment the following line. +# You can also select to disable deprecated APIs only up to a certain version of Qt. 
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0 + +SOURCES += \ + main.cpp \ + mainwindow.cpp + +HEADERS += \ + mainwindow.h + +FORMS += \ + mainwindow.ui + +unix{ + TARGET = ../target/window + MOC_DIR = ../tmpfiles + RCC_DIR = ../tmpfiles + UI_DIR = ../tmpfiles + OBJECTS_DIR = ../tmpfiles +} diff --git a/test_exe/Qt5Core.dll b/test_exe/Qt5Core.dll new file mode 100644 index 0000000..a3b195d Binary files /dev/null and b/test_exe/Qt5Core.dll differ diff --git a/test_exe/Qt5Gui.dll b/test_exe/Qt5Gui.dll new file mode 100644 index 0000000..4c14e83 Binary files /dev/null and b/test_exe/Qt5Gui.dll differ diff --git a/test_exe/Qt5Widgets.dll b/test_exe/Qt5Widgets.dll new file mode 100644 index 0000000..099a9f4 Binary files /dev/null and b/test_exe/Qt5Widgets.dll differ diff --git a/test_exe/dict/dict_pinyin.dat b/test_exe/dict/dict_pinyin.dat new file mode 100644 index 0000000..1be3f9c Binary files /dev/null and b/test_exe/dict/dict_pinyin.dat differ diff --git a/test_exe/dict/dict_pinyin_user.dat b/test_exe/dict/dict_pinyin_user.dat new file mode 100644 index 0000000..52009bc Binary files /dev/null and b/test_exe/dict/dict_pinyin_user.dat differ diff --git a/test_exe/libgcc_s_seh-1.dll b/test_exe/libgcc_s_seh-1.dll new file mode 100644 index 0000000..4ec945b Binary files /dev/null and b/test_exe/libgcc_s_seh-1.dll differ diff --git a/test_exe/libstdc++-6.dll b/test_exe/libstdc++-6.dll new file mode 100644 index 0000000..8e55acc Binary files /dev/null and b/test_exe/libstdc++-6.dll differ diff --git a/test_exe/libwinpthread-1.dll b/test_exe/libwinpthread-1.dll new file mode 100644 index 0000000..d9f4e1a Binary files /dev/null and b/test_exe/libwinpthread-1.dll differ diff --git a/test_exe/platformInputContexts/tgtsmlInputContextPlugin.dll b/test_exe/platformInputContexts/tgtsmlInputContextPlugin.dll new file mode 100644 index 0000000..ee531d0 Binary files /dev/null and 
b/test_exe/platformInputContexts/tgtsmlInputContextPlugin.dll differ diff --git a/test_exe/platforms/qwindows.dll b/test_exe/platforms/qwindows.dll new file mode 100644 index 0000000..103e4b0 Binary files /dev/null and b/test_exe/platforms/qwindows.dll differ diff --git a/test_exe/styles/qwindowsvistastyle.dll b/test_exe/styles/qwindowsvistastyle.dll new file mode 100644 index 0000000..5b2f0a5 Binary files /dev/null and b/test_exe/styles/qwindowsvistastyle.dll differ diff --git a/test_exe/test.png b/test_exe/test.png new file mode 100644 index 0000000..040c99d Binary files /dev/null and b/test_exe/test.png differ diff --git a/test_exe/testWindow.exe b/test_exe/testWindow.exe new file mode 100644 index 0000000..d05a893 Binary files /dev/null and b/test_exe/testWindow.exe differ diff --git a/tgtsmlInputContextPlugin.pro b/tgtsmlInputContextPlugin.pro new file mode 100755 index 0000000..d1f2701 --- /dev/null +++ b/tgtsmlInputContextPlugin.pro @@ -0,0 +1,7 @@ +TEMPLATE = subdirs +SUBDIRS = googlepinyin plugin testWindow + +CONFIG += ordered + +testWindow.depends = plugin +plugin.depends = googlepinyin diff --git a/tgtsmlInputContextPlugin.pro.user b/tgtsmlInputContextPlugin.pro.user new file mode 100644 index 0000000..d985e90 --- /dev/null +++ b/tgtsmlInputContextPlugin.pro.user @@ -0,0 +1,866 @@ + + + + + + EnvironmentId + {9ebfe731-b774-4413-aeaf-2512ba22bbcf} + + + ProjectExplorer.Project.ActiveTarget + 0 + + + ProjectExplorer.Project.EditorSettings + + true + false + true + + Cpp + + CppGlobal + + + + QmlJS + + QmlJSGlobal + + + 2 + UTF-8 + false + 4 + false + 80 + true + true + 1 + true + false + 0 + true + true + 0 + 8 + true + 1 + true + true + true + false + + + + ProjectExplorer.Project.PluginSettings + + + + ProjectExplorer.Project.Target.0 + + Desktop Qt 5.6.1 GCC 64bit + Desktop Qt 5.6.1 GCC 64bit + qt.56.gcc_64_kit + 0 + 0 + 0 + + /home/han/qtinput/build-tgtsmlInputContextPlugin-Desktop_Qt_5_6_1_GCC_64bit-Debug + + + true + qmake + + 
QtProjectManager.QMakeBuildStep + true + + false + false + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Debug + + Qt4ProjectManager.Qt4BuildConfiguration + 2 + true + + + /home/han/qtinput/build-tgtsmlInputContextPlugin-Desktop_Qt_5_6_1_GCC_64bit-Release + + + true + qmake + + QtProjectManager.QMakeBuildStep + false + + false + false + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Release + + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + + /home/han/qtinput/build-tgtsmlInputContextPlugin-Desktop_Qt_5_6_1_GCC_64bit-Profile + + + true + qmake + + QtProjectManager.QMakeBuildStep + true + + false + true + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Profile + + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + 3 + + + 0 + 部署 + + ProjectExplorer.BuildSteps.Deploy + + 1 + 在本地部署 + + ProjectExplorer.DefaultDeployConfiguration + + 1 + + + false + false + 1000 + + true + + false + false + false + false + true + 0.01 + 10 + true + 1 + 25 + + 1 + true + false + true + valgrind + + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + + 2 + + testWindow + + Qt4ProjectManager.Qt4RunConfiguration:/home/han/qtinput/QtInputMethod_GooglePinyin/testWindow/testWindow.pro + true + + testWindow/testWindow.pro + false + + /../target + 3768 + false + true + false + false + true + + 
1 + + + + ProjectExplorer.Project.Target.1 + + imx6ull + imx6ull + {76d84a76-b517-431b-8e29-c381d34bd8b9} + 0 + 0 + 0 + + /home/han/qtinput/build-tgtsmlInputContextPlugin-imx6ull-Debug + + + true + qmake + + QtProjectManager.QMakeBuildStep + true + + false + false + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Debug + + Qt4ProjectManager.Qt4BuildConfiguration + 2 + true + + + /home/han/qtinput/build-tgtsmlInputContextPlugin-imx6ull-Release + + + true + qmake + + QtProjectManager.QMakeBuildStep + false + + false + false + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Release + + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + + /home/han/qtinput/build-tgtsmlInputContextPlugin-imx6ull-Profile + + + true + qmake + + QtProjectManager.QMakeBuildStep + true + + false + true + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Profile + + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + 3 + + + 0 + 部署 + + ProjectExplorer.BuildSteps.Deploy + + 1 + 在本地部署 + + ProjectExplorer.DefaultDeployConfiguration + + 1 + + + false + false + 1000 + + true + + false + false + false + false + true + 0.01 + 10 + true + 1 + 25 + + 1 + true + false + true + valgrind + + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + + -1 + + testWindow + + 
Qt4ProjectManager.Qt4RunConfiguration:/home/han/qtinput/QtInputMethod_GooglePinyin/testWindow/testWindow.pro + true + + testWindow/testWindow.pro + false + + + 3768 + false + true + false + false + true + + 1 + + + + ProjectExplorer.Project.Target.2 + + ARM-A64 + ARM-A64 + {8e99dd39-dea9-494c-aff4-641bf74d0113} + 0 + 0 + 0 + + /home/han/qtinput/build-tgtsmlInputContextPlugin-ARM_A64-Debug + + + true + qmake + + QtProjectManager.QMakeBuildStep + true + + false + false + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Debug + + Qt4ProjectManager.Qt4BuildConfiguration + 2 + true + + + /home/han/qtinput/build-tgtsmlInputContextPlugin-ARM_A64-Release + + + true + qmake + + QtProjectManager.QMakeBuildStep + false + + false + false + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Release + + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + + /home/han/qtinput/build-tgtsmlInputContextPlugin-ARM_A64-Profile + + + true + qmake + + QtProjectManager.QMakeBuildStep + true + + false + true + false + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + false + + + + 2 + 构建 + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + -w + -r + + true + clean + + + 1 + 清理 + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Profile + + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + 3 + + + 0 + 部署 + + ProjectExplorer.BuildSteps.Deploy + + 1 + 在本地部署 + + ProjectExplorer.DefaultDeployConfiguration + + 1 + + + false + false + 1000 + + true + + false + false + false + false + true 
+ 0.01 + 10 + true + 1 + 25 + + 1 + true + false + true + valgrind + + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + + -1 + + testWindow + + Qt4ProjectManager.Qt4RunConfiguration:/home/han/qtinput/QtInputMethod_GooglePinyin/testWindow/testWindow.pro + true + + testWindow/testWindow.pro + false + + + 3768 + false + true + false + false + true + + 1 + + + + ProjectExplorer.Project.TargetCount + 3 + + + ProjectExplorer.Project.Updater.FileVersion + 18 + + + Version + 18 + +