QtInputMethod_GooglePinyin/googlepinyin/dictbuilder.h

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
#define PINYINIME_INCLUDE_DICTBUILDER_H__

#include <stdlib.h>
#include "./utf16char.h"
#include "./dictdef.h"
#include "./dictlist.h"
#include "./spellingtable.h"
#include "./spellingtrie.h"
#include "./splparser.h"

namespace ime_pinyin {

#ifdef ___BUILD_MODEL___

#define ___DO_STATISTICS___

class DictTrie;

// Builds a dictionary trie (DictTrie) from a raw lemma file. The class is
// only compiled when ___BUILD_MODEL___ is defined.
//
// The builder keeps several raw buffers (see the pointer members below).
// Instances are therefore non-copyable: a member-wise copy would leave two
// objects pointing at the same buffers.
class DictBuilder {
 private:
  // The raw lemma array buffer.
  LemmaEntry *lemma_arr_;
  size_t lemma_num_;

  // Used to store all possible single char items.
  // Two items may have the same Hanzi while their spelling ids are different.
  SingleCharItem *scis_;
  size_t scis_num_;

  // In the tree, root's level is -1.
  // Lemma nodes for root, and level 0
  LmaNodeLE0 *lma_nodes_le0_;

  // Lemma nodes for layers whose levels are deeper than 0
  LmaNodeGE1 *lma_nodes_ge1_;

  // Number of used lemma nodes
  size_t lma_nds_used_num_le0_;
  size_t lma_nds_used_num_ge1_;

  // Used to store homophonies' ids.
  LemmaIdType *homo_idx_buf_;
  // Number of homophonies each of which only contains one Chinese character.
  size_t homo_idx_num_eq1_;
  // Number of homophonies each of which contains more than one character.
  size_t homo_idx_num_gt1_;

  // The items with highest scores.
  LemmaEntry *top_lmas_;
  size_t top_lmas_num_;

  SpellingTable *spl_table_;
  SpellingParser *spl_parser_;

#ifdef ___DO_STATISTICS___
  // Statistics counters; each array has kMaxLemmaSize entries.
  // NOTE(review): presumably indexed by tree level -- confirm against the
  // implementation file.
  size_t max_sonbuf_len_[kMaxLemmaSize];
  size_t max_homobuf_len_[kMaxLemmaSize];

  size_t total_son_num_[kMaxLemmaSize];
  size_t total_node_hasson_[kMaxLemmaSize];
  size_t total_sonbuf_num_[kMaxLemmaSize];
  size_t total_sonbuf_allnoson_[kMaxLemmaSize];
  size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
  size_t total_homo_num_[kMaxLemmaSize];

  size_t sonbufs_num1_;     // Number of son buffers with only 1 son.
  size_t sonbufs_numgt1_;   // Number of son buffers with more than 1 son.

  size_t total_lma_node_num_;

  void stat_init();
  void stat_print();
#endif

 public:

  DictBuilder();
  ~DictBuilder();

  // Build dictionary trie from the file fn_raw. File fn_validhzs provides
  // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
  // included.
  bool build_dict(const char* fn_raw, const char* fn_validhzs,
                  DictTrie *dict_trie);

 private:
  // Copying is disallowed. The class holds raw buffer pointers, so the
  // compiler-generated member-wise copy would alias them between two objects.
  // Both operations are declared but intentionally left undefined, so an
  // accidental copy fails at compile time (or at link time from inside the
  // class).
  DictBuilder(const DictBuilder&);
  DictBuilder& operator=(const DictBuilder&);

  // Fill in the buffer with id. The caller guarantees that the parameters are
  // valid.
  void id_to_charbuf(unsigned char *buf, LemmaIdType id);

  // Update the offset of sons for a node.
  void set_son_offset(LmaNodeGE1 *node, size_t offset);

  // Update the offset of homophonies' ids for a node.
  void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);

  // Format a spelling string.
  void format_spelling_str(char *spl_str);

  // Sort the lemma_arr by the hanzi string, and give each unique item an id.
  // The lemma list is sorted by Hanzi string so that items starting with a
  // given prefix string can be found to do prediction.
  // Actually, the single char items are in a different order, for example,
  // in spelling id order, etc.
  // Return value is the next un-allocated idx available.
  LemmaIdType sort_lemmas_by_hz();

  // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
  // lemma buffer lemma_arr_.
  // This function should be called after the lemma array is ready.
  // Return the number of unique SingleCharItem elements.
  size_t build_scis();

  // Construct a subtree using a subset of the spelling array (from
  // item_start to item_end).
  // parent is the parent node to update with the necessary information;
  // parent can be a member of LmaNodeLE0 or LmaNodeGE1.
  bool construct_subset(void* parent, LemmaEntry* lemma_arr,
                        size_t item_start, size_t item_end, size_t level);


  // Read valid Chinese Hanzis from the given file.
  // num is used to return the number of chars.
  // The returned buffer is sorted, and the caller needs to free it.
  char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);


  // Read a raw dictionary. max_item is the maximum number of items. If there
  // are more items in the dictionary, only the first max_item will be read.
  // Returned value is the number of items successfully read from the file.
  size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
                       size_t max_item);

  // Try to find if a character is in hzs buffer.
  bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);

  // Try to find if all characters in str are in hzs buffer.
  bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
                          const char16 *str, size_t str_len);

  // Get the lemmas with the highest scores.
  void get_top_lemmas();

  // Allocate resource to build dictionary.
  // lma_num is the number of items to be loaded
  bool alloc_resource(size_t lma_num);

  // Free resource.
  void free_resource();
};
#endif  // ___BUILD_MODEL___
}

#endif  // PINYINIME_INCLUDE_DICTBUILDER_H__
first commit 2020-07-31 06:50:46 +00:00			`/*`
			`* Copyright (C) 2009 The Android Open Source Project`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__`
			`#define PINYINIME_INCLUDE_DICTBUILDER_H__`

			`#include <stdlib.h>`
			`#include "./utf16char.h"`
			`#include "./dictdef.h"`
			`#include "./dictlist.h"`
			`#include "./spellingtable.h"`
			`#include "./spellingtrie.h"`
			`#include "./splparser.h"`

			`namespace ime_pinyin {`

			`#ifdef ___BUILD_MODEL___`

			`#define ___DO_STATISTICS___`

			`class DictTrie;`

			`class DictBuilder {`
			`private:`
			`// The raw lemma array buffer.`
			`LemmaEntry *lemma_arr_;`
			`size_t lemma_num_;`

			`// Used to store all possible single char items.`
			`// Two items may have the same Hanzi while their spelling ids are different.`
			`SingleCharItem *scis_;`
			`size_t scis_num_;`

			`// In the tree, root's level is -1.`
			`// Lemma nodes for root, and level 0`
			`LmaNodeLE0 *lma_nodes_le0_;`

			`// Lemma nodes for layers whose levels are deeper than 0`
			`LmaNodeGE1 *lma_nodes_ge1_;`

			`// Number of used lemma nodes`
			`size_t lma_nds_used_num_le0_;`
			`size_t lma_nds_used_num_ge1_;`

			`// Used to store homophonies' ids.`
			`LemmaIdType *homo_idx_buf_;`
			`// Number of homophonies each of which only contains one Chinese character.`
			`size_t homo_idx_num_eq1_;`
			`// Number of homophonies each of which contains more than one character.`
			`size_t homo_idx_num_gt1_;`

			`// The items with highest scores.`
			`LemmaEntry *top_lmas_;`
			`size_t top_lmas_num_;`

			`SpellingTable *spl_table_;`
			`SpellingParser *spl_parser_;`

			`#ifdef ___DO_STATISTICS___`
			`size_t max_sonbuf_len_[kMaxLemmaSize];`
			`size_t max_homobuf_len_[kMaxLemmaSize];`

			`size_t total_son_num_[kMaxLemmaSize];`
			`size_t total_node_hasson_[kMaxLemmaSize];`
			`size_t total_sonbuf_num_[kMaxLemmaSize];`
			`size_t total_sonbuf_allnoson_[kMaxLemmaSize];`
			`size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];`
			`size_t total_homo_num_[kMaxLemmaSize];`

			`size_t sonbufs_num1_; // Number of son buffer with only 1 son`
			`size_t sonbufs_numgt1_; // Number of son buffers with more than 1 son.`

			`size_t total_lma_node_num_;`

			`void stat_init();`
			`void stat_print();`
			`#endif`

			`public:`

			`DictBuilder();`
			`~DictBuilder();`

			`// Build dictionary trie from the file fn_raw. File fn_validhzs provides`
			`// valid chars. If fn_validhzs is NULL, only chars in GB2312 will be`
			`// included.`
			`bool build_dict(const char* fn_raw, const char* fn_validhzs,`
			`DictTrie *dict_trie);`

			`private:`
			`// Fill in the buffer with id. The caller guarantees that the parameters are`
			`// valid.`
			`void id_to_charbuf(unsigned char *buf, LemmaIdType id);`

			`// Update the offset of sons for a node.`
			`void set_son_offset(LmaNodeGE1 *node, size_t offset);`

			`// Update the offset of homophonies' ids for a node.`
			`void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);`

			`// Format a spelling string.`
			`void format_spelling_str(char *spl_str);`

			`// Sort the lemma_arr by the hanzi string, and give each of unique items`
			`// a id. Why we need to sort the lemma list according to their Hanzi string`
			`// is to find items started by a given prefix string to do prediction.`
			`// Actually, the single char items are in a different order, for example,`
			`// in spelling id order, etc.`
			`// Return value is next un-allocated idx available.`
			`LemmaIdType sort_lemmas_by_hz();`

			`// Build the SingleCharItem list, and fill the hanzi_scis_ids in the`
			`// lemma buffer lemma_arr_.`
			`// This function should be called after the lemma array is ready.`
			`// Return the number of unique SingleCharItem elements.`
			`size_t build_scis();`

			`// Construct a subtree using a subset of the spelling array (from`
			`// item_start to item_end)`
			`// parent is the parent node to update the necessary information`
			`// parent can be a member of LmaNodeLE0 or LmaNodeGE1`
			`bool construct_subset(void* parent, LemmaEntry* lemma_arr,`
			`size_t item_start, size_t item_end, size_t level);`


			`// Read valid Chinese Hanzis from the given file.`
			`// num is used to return number of chars.`
			`// The return buffer is sorted and caller needs to free the returned buffer.`
			`char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);`


			`// Read a raw dictionary. max_item is the maximum number of items. If there`
			`// are more items in the dictionary, only the first max_item will be read.`
			`// Returned value is the number of items successfully read from the file.`
			`size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,`
			`size_t max_item);`

			`// Try to find if a character is in hzs buffer.`
			`bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);`

			`// Try to find if all characters in str are in hzs buffer.`
			`bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,`
			`const char16 *str, size_t str_len);`

			`// Get the lemmas with the highest scores.`
			`void get_top_lemmas();`

			`// Allocate resource to build dictionary.`
			`// lma_num is the number of items to be loaded`
			`bool alloc_resource(size_t lma_num);`

			`// Free resource.`
			`void free_resource();`
			`};`
			`#endif // ___BUILD_MODEL___`
			`}`

			`#endif // PINYINIME_INCLUDE_DICTBUILDER_H__`