/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef PINYINIME_INCLUDE_DICTTRIE_H__ #define PINYINIME_INCLUDE_DICTTRIE_H__ #include #include "./atomdictbase.h" #include "./dictdef.h" #include "./dictlist.h" #include "./searchutility.h" namespace ime_pinyin { class DictTrie : AtomDictBase { private: struct ParsingMark { size_t node_offset:24; size_t node_num:8; // Number of nodes with this spelling id given // by spl_id. If spl_id is a Shengmu, for nodes // in the first layer of DictTrie, it equals to // SpellingTrie::shm2full_num(); but for those // nodes which are not in the first layer, // node_num < SpellingTrie::shm2full_num(). // For a full spelling id, node_num = 1; }; // Used to indicate an extended mile stone. // An extended mile stone is used to mark a partial match in the dictionary // trie to speed up further potential extending. // For example, when the user inputs "w", a mile stone is created to mark the // partial match status, so that when user inputs another char 'm', it will be // faster to extend search space based on this mile stone. // // For partial match status of "wm", there can be more than one sub mile // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so // there may be more one parsing mark used to mark these partial matchings. // A mile stone records the starting position in the mark list and number of // marks. struct MileStone { uint16 mark_start; uint16 mark_num; }; DictList* dict_list_; const SpellingTrie *spl_trie_; LmaNodeLE0* root_; // Nodes for root and the first layer. LmaNodeGE1* nodes_ge1_; // Nodes for other layers. // An quick index from spelling id to the LmaNodeLE0 node buffer, or // to the root_ buffer. // Index length: // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used // to get the end. // All Shengmu ids are not indexed because they will be converted into // corresponding full ids. // So, given an id splid, the son is: // root_[splid_le0_index_[splid - kFullSplIdStart]] uint16 *splid_le0_index_; uint32 lma_node_num_le0_; uint32 lma_node_num_ge1_; // The first part is for homophnies, and the last top_lma_num_ items are // lemmas with highest scores. unsigned char *lma_idx_buf_; uint32 lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte. uint32 total_lma_num_; // Total number of lemmas in this dictionary. uint32 top_lmas_num_; // Number of lemma with highest scores. // Parsing mark list used to mark the detailed extended statuses. ParsingMark *parsing_marks_; // The position for next available mark. uint16 parsing_marks_pos_; // Mile stone list used to mark the extended status. MileStone *mile_stones_; // The position for the next available mile stone. We use positions (except 0) // as handles. MileStoneHandle mile_stones_pos_; // Get the offset of sons for a node. inline size_t get_son_offset(const LmaNodeGE1 *node); // Get the offset of homonious ids for a node. inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node); // Get the lemma id by the offset. inline LemmaIdType get_lemma_id(size_t id_offset); void free_resource(bool free_dict_list); bool load_dict(FILE *fp); // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill // them into the lpi_items buffer. // This function is called by the search engine. size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, LmaNodeLE0 *node); // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill // them into the lpi_items buffer. // This function is called by inner functions extend_dict0(), extend_dict1() // and extend_dict2(). size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, size_t homo_buf_off, LmaNodeGE1 *node, uint16 lma_len); // Extend in the trie from level 0. MileStoneHandle extend_dict0(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num); // Extend in the trie from level 1. MileStoneHandle extend_dict1(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num); // Extend in the trie from level 2. MileStoneHandle extend_dict2(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num); // Try to extend the given spelling id buffer, and if the given id_lemma can // be successfully gotten, return true; // The given spelling ids are all valid full ids. bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma); #ifdef ___BUILD_MODEL___ bool save_dict(FILE *fp); #endif // ___BUILD_MODEL___ static const int kMaxMileStone = 100; static const int kMaxParsingMark = 600; static const MileStoneHandle kFirstValidMileStoneHandle = 1; friend class DictParser; friend class DictBuilder; public: DictTrie(); ~DictTrie(); #ifdef ___BUILD_MODEL___ // Construct the tree from the file fn_raw. // fn_validhzs provide the valid hanzi list. If fn_validhzs is // NULL, only chars in GB2312 will be included. bool build_dict(const char *fn_raw, const char *fn_validhzs); // Save the binary dictionary // Actually, the SpellingTrie/DictList instance will be also saved. bool save_dict(const char *filename); #endif // ___BUILD_MODEL___ void convert_to_hanzis(char16 *str, uint16 str_len); void convert_to_scis_ids(char16 *str, uint16 str_len); // Load a binary dictionary // The SpellingTrie instance/DictList will be also loaded bool load_dict(const char *filename, LemmaIdType start_id, LemmaIdType end_id); bool load_dict_fd(int sys_fd, long start_offset, long length, LemmaIdType start_id, LemmaIdType end_id); bool close_dict() {return true;} size_t number_of_lemmas() {return 0;} void reset_milestones(uint16 from_step, MileStoneHandle from_handle); MileStoneHandle extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num); size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max); uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid); size_t predict(const char16 *last_hzs, uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used); LemmaIdType put_lemma(char16 /*lemma_str*/[], uint16 /*splids*/[], uint16 /*lemma_len*/, uint16 /*count*/) {return 0;} LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/, bool /*selected*/) {return 0;} LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[], uint16 /*lemma_len*/) {return 0;} LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;} LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[], uint16 /*lemma_len*/) {return 0;} bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;} size_t get_total_lemma_count() {return 0;} void set_total_lemma_count_of_others(size_t count); void flush_cache() {} LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len); // Fill the lemmas with highest scores to the prediction buffer. // his_len is the history length to fill in the prediction buffer. size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used); }; } #endif // PINYINIME_INCLUDE_DICTTRIE_H__