QtInputMethod_GooglePinyin/googlepinyin/dicttrie.h

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_DICTTRIE_H__
#define PINYINIME_INCLUDE_DICTTRIE_H__

#include <stdlib.h>
#include "./atomdictbase.h"
#include "./dictdef.h"
#include "./dictlist.h"
#include "./searchutility.h"

namespace ime_pinyin {

class DictTrie : AtomDictBase {
 private:
  struct ParsingMark {
    size_t node_offset:24;
    size_t node_num:8;           // Number of nodes with this spelling id given
                                 // by spl_id. If spl_id is a Shengmu, for nodes
                                 // in the first layer of DictTrie, it equals to
                                 // SpellingTrie::shm2full_num(); but for those
                                 // nodes which are not in the first layer,
                                 // node_num < SpellingTrie::shm2full_num().
                                 // For a full spelling id, node_num = 1;
  };

  // Used to indicate an extended mile stone.
  // An extended mile stone is used to mark a partial match in the dictionary
  // trie to speed up further potential extending.
  // For example, when the user inputs "w", a mile stone is created to mark the
  // partial match status, so that when user inputs another char 'm', it will be
  // faster to extend search space based on this mile stone.
  //
  // For partial match status of "wm", there can be more than one sub mile
  // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
  // there may be more one parsing mark used to mark these partial matchings.
  // A mile stone records the starting position in the mark list and number of
  // marks.
  struct MileStone {
    uint16 mark_start;
    uint16 mark_num;
  };

  DictList* dict_list_;

  const SpellingTrie *spl_trie_;

  LmaNodeLE0* root_;        // Nodes for root and the first layer.
  LmaNodeGE1* nodes_ge1_;   // Nodes for other layers.

  // An quick index from spelling id to the LmaNodeLE0 node buffer, or
  // to the root_ buffer.
  // Index length:
  // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
  // to get the end.
  // All Shengmu ids are not indexed because they will be converted into
  // corresponding full ids.
  // So, given an id splid, the son is:
  // root_[splid_le0_index_[splid - kFullSplIdStart]]
  uint16 *splid_le0_index_;

  uint32 lma_node_num_le0_;
  uint32 lma_node_num_ge1_;

  // The first part is for homophnies, and the last  top_lma_num_ items are
  // lemmas with highest scores.
  unsigned char *lma_idx_buf_;
  uint32 lma_idx_buf_len_;  // The total size of lma_idx_buf_ in byte.
  uint32 total_lma_num_;    // Total number of lemmas in this dictionary.
  uint32 top_lmas_num_;     // Number of lemma with highest scores.

  // Parsing mark list used to mark the detailed extended statuses.
  ParsingMark *parsing_marks_;
  // The position for next available mark.
  uint16 parsing_marks_pos_;

  // Mile stone list used to mark the extended status.
  MileStone *mile_stones_;
  // The position for the next available mile stone. We use positions (except 0)
  // as handles.
  MileStoneHandle mile_stones_pos_;

  // Get the offset of sons for a node.
  inline size_t get_son_offset(const LmaNodeGE1 *node);

  // Get the offset of homonious ids for a node.
  inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);

  // Get the lemma id by the offset.
  inline LemmaIdType get_lemma_id(size_t id_offset);

  void free_resource(bool free_dict_list);

  bool load_dict(FILE *fp);

  // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by the search engine.
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         LmaNodeLE0 *node);

  // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by inner functions extend_dict0(), extend_dict1()
  // and extend_dict2().
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         size_t homo_buf_off, LmaNodeGE1 *node,
                         uint16 lma_len);

  // Extend in the trie from level 0.
  MileStoneHandle extend_dict0(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 1.
  MileStoneHandle extend_dict1(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 2.
  MileStoneHandle extend_dict2(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Try to extend the given spelling id buffer, and if the given id_lemma can
  // be successfully gotten, return true;
  // The given spelling ids are all valid full ids.
  bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);

#ifdef ___BUILD_MODEL___
  bool save_dict(FILE *fp);
#endif  // ___BUILD_MODEL___

  static const int kMaxMileStone = 100;
  static const int kMaxParsingMark = 600;
  static const MileStoneHandle kFirstValidMileStoneHandle = 1;

  friend class DictParser;
  friend class DictBuilder;

 public:

  DictTrie();
  ~DictTrie();

#ifdef ___BUILD_MODEL___
  // Construct the tree from the file fn_raw.
  // fn_validhzs provide the valid hanzi list. If fn_validhzs is
  // NULL, only chars in GB2312 will be included.
  bool build_dict(const char *fn_raw, const char *fn_validhzs);

  // Save the binary dictionary
  // Actually, the SpellingTrie/DictList instance will be also saved.
  bool save_dict(const char *filename);
#endif  // ___BUILD_MODEL___

  void convert_to_hanzis(char16 *str, uint16 str_len);

  void convert_to_scis_ids(char16 *str, uint16 str_len);

  // Load a binary dictionary
  // The SpellingTrie instance/DictList will be also loaded
  bool load_dict(const char *filename, LemmaIdType start_id,
                 LemmaIdType end_id);
  bool load_dict_fd(int sys_fd, long start_offset, long length,
                    LemmaIdType start_id, LemmaIdType end_id);
  bool close_dict() {return true;}
  size_t number_of_lemmas() {return 0;}

  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);

  MileStoneHandle extend_dict(MileStoneHandle from_handle,
                              const DictExtPara *dep,
                              LmaPsbItem *lpi_items,
                              size_t lpi_max, size_t *lpi_num);

  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                  LmaPsbItem *lpi_items, size_t lpi_max);

  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                          uint16 splids_max, bool arg_valid);

  size_t predict(const char16 *last_hzs, uint16 hzs_len,
                 NPredictItem *npre_items, size_t npre_max,
                 size_t b4_used);

  LemmaIdType put_lemma(char16 /*lemma_str*/[], uint16 /*splids*/[],
                        uint16 /*lemma_len*/, uint16 /*count*/) {return 0;}

  LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/,
                           bool /*selected*/) {return 0;}

  LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[],
                           uint16 /*lemma_len*/) {return 0;}

  LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;}

  LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[],
                        uint16 /*lemma_len*/) {return 0;}

  bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;}

  size_t get_total_lemma_count() {return 0;}
  void set_total_lemma_count_of_others(size_t count);

  void flush_cache() {}

  LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);

  // Fill the lemmas with highest scores to the prediction buffer.
  // his_len is the history length to fill in the prediction buffer.
  size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
                          size_t npre_max, size_t b4_used);
};
}

#endif  // PINYINIME_INCLUDE_DICTTRIE_H__
first commit 2020-07-31 06:50:46 +00:00			`/*`
			`* Copyright (C) 2009 The Android Open Source Project`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`#ifndef PINYINIME_INCLUDE_DICTTRIE_H__`
			`#define PINYINIME_INCLUDE_DICTTRIE_H__`

			`#include <stdlib.h>`
			`#include "./atomdictbase.h"`
			`#include "./dictdef.h"`
			`#include "./dictlist.h"`
			`#include "./searchutility.h"`

			`namespace ime_pinyin {`

			`class DictTrie : AtomDictBase {`
			`private:`
			`struct ParsingMark {`
			`size_t node_offset:24;`
			`size_t node_num:8; // Number of nodes with this spelling id given`
			`// by spl_id. If spl_id is a Shengmu, for nodes`
			`// in the first layer of DictTrie, it equals to`
			`// SpellingTrie::shm2full_num(); but for those`
			`// nodes which are not in the first layer,`
			`// node_num < SpellingTrie::shm2full_num().`
			`// For a full spelling id, node_num = 1;`
			`};`

			`// Used to indicate an extended mile stone.`
			`// An extended mile stone is used to mark a partial match in the dictionary`
			`// trie to speed up further potential extending.`
			`// For example, when the user inputs "w", a mile stone is created to mark the`
			`// partial match status, so that when user inputs another char 'm', it will be`
			`// faster to extend search space based on this mile stone.`
			`//`
			`// For partial match status of "wm", there can be more than one sub mile`
			`// stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so`
			`// there may be more one parsing mark used to mark these partial matchings.`
			`// A mile stone records the starting position in the mark list and number of`
			`// marks.`
			`struct MileStone {`
			`uint16 mark_start;`
			`uint16 mark_num;`
			`};`

			`DictList* dict_list_;`

			`const SpellingTrie *spl_trie_;`

			`LmaNodeLE0* root_; // Nodes for root and the first layer.`
			`LmaNodeGE1* nodes_ge1_; // Nodes for other layers.`

			`// An quick index from spelling id to the LmaNodeLE0 node buffer, or`
			`// to the root_ buffer.`
			`// Index length:`
			`// SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used`
			`// to get the end.`
			`// All Shengmu ids are not indexed because they will be converted into`
			`// corresponding full ids.`
			`// So, given an id splid, the son is:`
			`// root_[splid_le0_index_[splid - kFullSplIdStart]]`
			`uint16 *splid_le0_index_;`

			`uint32 lma_node_num_le0_;`
			`uint32 lma_node_num_ge1_;`

			`// The first part is for homophnies, and the last top_lma_num_ items are`
			`// lemmas with highest scores.`
			`unsigned char *lma_idx_buf_;`
			`uint32 lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte.`
			`uint32 total_lma_num_; // Total number of lemmas in this dictionary.`
			`uint32 top_lmas_num_; // Number of lemma with highest scores.`

			`// Parsing mark list used to mark the detailed extended statuses.`
			`ParsingMark *parsing_marks_;`
			`// The position for next available mark.`
			`uint16 parsing_marks_pos_;`

			`// Mile stone list used to mark the extended status.`
			`MileStone *mile_stones_;`
			`// The position for the next available mile stone. We use positions (except 0)`
			`// as handles.`
			`MileStoneHandle mile_stones_pos_;`

			`// Get the offset of sons for a node.`
			`inline size_t get_son_offset(const LmaNodeGE1 *node);`

			`// Get the offset of homonious ids for a node.`
			`inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);`

			`// Get the lemma id by the offset.`
			`inline LemmaIdType get_lemma_id(size_t id_offset);`

			`void free_resource(bool free_dict_list);`

			`bool load_dict(FILE *fp);`

			`// Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill`
			`// them into the lpi_items buffer.`
			`// This function is called by the search engine.`
			`size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,`
			`LmaNodeLE0 *node);`

			`// Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill`
			`// them into the lpi_items buffer.`
			`// This function is called by inner functions extend_dict0(), extend_dict1()`
			`// and extend_dict2().`
			`size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,`
			`size_t homo_buf_off, LmaNodeGE1 *node,`
			`uint16 lma_len);`

			`// Extend in the trie from level 0.`
			`MileStoneHandle extend_dict0(MileStoneHandle from_handle,`
			`const DictExtPara dep, LmaPsbItem lpi_items,`
			`size_t lpi_max, size_t *lpi_num);`

			`// Extend in the trie from level 1.`
			`MileStoneHandle extend_dict1(MileStoneHandle from_handle,`
			`const DictExtPara dep, LmaPsbItem lpi_items,`
			`size_t lpi_max, size_t *lpi_num);`

			`// Extend in the trie from level 2.`
			`MileStoneHandle extend_dict2(MileStoneHandle from_handle,`
			`const DictExtPara dep, LmaPsbItem lpi_items,`
			`size_t lpi_max, size_t *lpi_num);`

			`// Try to extend the given spelling id buffer, and if the given id_lemma can`
			`// be successfully gotten, return true;`
			`// The given spelling ids are all valid full ids.`
			`bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);`

			`#ifdef ___BUILD_MODEL___`
			`bool save_dict(FILE *fp);`
			`#endif // ___BUILD_MODEL___`

			`static const int kMaxMileStone = 100;`
			`static const int kMaxParsingMark = 600;`
			`static const MileStoneHandle kFirstValidMileStoneHandle = 1;`

			`friend class DictParser;`
			`friend class DictBuilder;`

			`public:`

			`DictTrie();`
			`~DictTrie();`

			`#ifdef ___BUILD_MODEL___`
			`// Construct the tree from the file fn_raw.`
			`// fn_validhzs provide the valid hanzi list. If fn_validhzs is`
			`// NULL, only chars in GB2312 will be included.`
			`bool build_dict(const char fn_raw, const char fn_validhzs);`

			`// Save the binary dictionary`
			`// Actually, the SpellingTrie/DictList instance will be also saved.`
			`bool save_dict(const char *filename);`
			`#endif // ___BUILD_MODEL___`

			`void convert_to_hanzis(char16 *str, uint16 str_len);`

			`void convert_to_scis_ids(char16 *str, uint16 str_len);`

			`// Load a binary dictionary`
			`// The SpellingTrie instance/DictList will be also loaded`
			`bool load_dict(const char *filename, LemmaIdType start_id,`
			`LemmaIdType end_id);`
			`bool load_dict_fd(int sys_fd, long start_offset, long length,`
			`LemmaIdType start_id, LemmaIdType end_id);`
			`bool close_dict() {return true;}`
			`size_t number_of_lemmas() {return 0;}`

			`void reset_milestones(uint16 from_step, MileStoneHandle from_handle);`

			`MileStoneHandle extend_dict(MileStoneHandle from_handle,`
			`const DictExtPara *dep,`
			`LmaPsbItem *lpi_items,`
			`size_t lpi_max, size_t *lpi_num);`

			`size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,`
			`LmaPsbItem *lpi_items, size_t lpi_max);`

			`uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);`

			`uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,`
			`uint16 splids_max, bool arg_valid);`

			`size_t predict(const char16 *last_hzs, uint16 hzs_len,`
			`NPredictItem *npre_items, size_t npre_max,`
			`size_t b4_used);`

			`LemmaIdType put_lemma(char16 /lemma_str/[], uint16 /splids/[],`
			`uint16 /lemma_len/, uint16 /count/) {return 0;}`

			`LemmaIdType update_lemma(LemmaIdType /lemma_id/, int16 /delta_count/,`
			`bool /selected/) {return 0;}`

			`LemmaIdType get_lemma_id(char16 /lemma_str/[], uint16 /splids/[],`
			`uint16 /lemma_len/) {return 0;}`

			`LmaScoreType get_lemma_score(LemmaIdType /lemma_id/) {return 0;}`

			`LmaScoreType get_lemma_score(char16 /lemma_str/[], uint16 /splids/[],`
			`uint16 /lemma_len/) {return 0;}`

			`bool remove_lemma(LemmaIdType /lemma_id/) {return false;}`

			`size_t get_total_lemma_count() {return 0;}`
			`void set_total_lemma_count_of_others(size_t count);`

			`void flush_cache() {}`

			`LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);`

			`// Fill the lemmas with highest scores to the prediction buffer.`
			`// his_len is the history length to fill in the prediction buffer.`
			`size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,`
			`size_t npre_max, size_t b4_used);`
			`};`
			`}`

			`#endif // PINYINIME_INCLUDE_DICTTRIE_H__`