QtInputMethod_GooglePinyin/plugin/googlepinyin/dictdef.h

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_DICTDEF_H__
#define PINYINIME_INCLUDE_DICTDEF_H__

#include <stdlib.h>
#include "./utf16char.h"

namespace ime_pinyin {

// Enable the following line when building the binary dictionary model.
// #define ___BUILD_MODEL___

// Project-local integer aliases, grouped by nominal width. The names assume
// the usual 8/16/32/64-bit sizes of char/short/int/long long; the aliases
// themselves do not enforce those sizes.

// 8-bit.
typedef signed char        int8;
typedef unsigned char      uint8;

// 16-bit.
typedef short              int16;
typedef unsigned short     uint16;

// 32-bit.
typedef int                int32;
typedef unsigned int       uint32;

// 64-bit.
typedef long long          int64;
typedef unsigned long long uint64;

// Debug-output switches; all three are off.
const bool kPrintDebug0 = false;
const bool kPrintDebug1 = false;
const bool kPrintDebug2 = false;

// Upper bound on the length of a lemma.
const size_t kMaxLemmaSize = 8;

// One character predicts the characters that follow it, so the prediction
// limit is one less than the lemma limit.
const size_t kMaxPredictSize = kMaxLemmaSize - 1;

// Upper bound on the length of one Pinyin spelling.
const size_t kMaxPinyinSize = 6;

// Count of half spelling ids; see SpellingTrie.h for details.
// NOTE(review): the earlier comment here spoke of 30 half ids for Chinese
// Pinyin while the value is 29 -- presumably the never-used id 0 was counted
// there; confirm against SpellingTrie.h.
const size_t kHalfSpellingIdNum = 29;

// Upper bound on the number of full spellings (Chinese Pinyin has only about
// 410). The value is 512 reduced by the half ids and by id 0, which is never
// used. If this bound is raised so that more bits are needed, structures
// that store a spelling id (e.g. SpellingNode) must be updated as well.
const size_t kMaxSpellingNum = 512 - (kHalfSpellingIdNum + 1);

// Upper bound on the number of search steps.
const size_t kMaxSearchSteps = 40;

// Lemma identifier type. It must always remain size_t.
typedef size_t LemmaIdType;

// Although LemmaIdType is size_t, an id occupies only 3 bytes in storage.
const size_t kLemmaIdSize = 3;

// 0xFFFFFF is the largest value that fits in those 3 bytes.
// NOTE(review): presumably a reserved id marking a lemma that is still being
// composed -- confirm at the use sites.
const size_t kLemmaIdComposing = 0xFFFFFF;

// Score types; both are aliases of uint16.
// NOTE(review): presumably LmaScoreType is a lemma score and KeyScoreType a
// key score -- confirm at the use sites.
typedef uint16 LmaScoreType;
typedef uint16 KeyScoreType;

// How many of the highest-scoring items are retained for prediction.
const size_t kTopScoreLemmaNum = 10;

// Per-case prediction count limits.
// NOTE(review): presumably "By2" / "By3" / "ByGt3" refer to predicting from
// 2, 3, or more than 3 preceding characters -- confirm at the use sites.
const size_t kMaxPredictNumBy2 = 2;
const size_t kMaxPredictNumBy3 = 2;
const size_t kMaxPredictNumByGt3 = 1;

// System dictionary: ids always start from 1 and run up to and including
// kSysDictIdEnd.
const LemmaIdType kSysDictIdEnd = 500000;

// User dictionary: ids begin immediately after the system range ...
const LemmaIdType kUserDictIdStart = kSysDictIdEnd + 1;

// ... and run up to and including kUserDictIdEnd.
const LemmaIdType kUserDictIdEnd = 600000;

// A pair of spelling ids declared as two uint16 bit-fields of 5 and 11 bits
// (16 bits in total). PSpellingId is the matching pointer type.
// NOTE(review): presumably the half/full spelling ids described in
// SpellingTrie.h -- confirm there.
typedef struct {
  // 5 bits (values 0..31); wide enough for kHalfSpellingIdNum (29).
  uint16 half_splid:5;
  // 11 bits (values 0..2047); wide enough for any id below 512, the bound
  // from which kMaxSpellingNum is derived.
  uint16 full_splid:11;
} SpellingId, *PSpellingId;


/**
 * We use different node types for different layers.
 * Statistical data of the building result for a testing dictionary:
 *                              root,   level 0,   level 1,   level 2,   level 3
 * max son num of one node:     406        280         41          2          -
 * max homo num of one node:      0         90         23          2          2
 * total node num of a layer:     1        406      31766      13516        993
 * total homo num of a layer:     9       5674      44609      12667        995
 *
 * The node number for root and level 0 won't be larger than 500.
 * According to the information above, two kinds of nodes can be used: one for
 * root and level 0, the other for the layers deeper than 0.
 *
 * LE = less than or equal to, i.e. this node type is for root and level 0.
 * A node occupies 16 bytes, so in total less than 16 * 500 = 8K.
 * (The members below sum to 14 bytes; the 16-byte figure presumably relies on
 * padding to the 4-byte alignment of uint32 -- compiler dependent.)
 */
struct LmaNodeLE0 {
  // NOTE(review): by analogy with the comments on LmaNodeGE1, presumably the
  // offset of the first son node -- confirm at the use sites.
  uint32 son_1st_off;
  // NOTE(review): presumably the offset into the homo index buffer -- confirm.
  uint32 homo_idx_buf_off;
  // NOTE(review): presumably the spelling id of this node -- confirm.
  uint16 spl_idx;
  // Wider than in LmaNodeGE1 (uint16 vs. unsigned char); the table above
  // lists up to 406 sons for the root, which would not fit in 8 bits.
  uint16 num_of_son;
  uint16 num_of_homo;
};

/**
 * GE = greater than or equal to, i.e. this node type is for level 1 and the
 * layers deeper than it.
 * The original note here said a node occupies 8 bytes, but the members as
 * declared sum to 10 bytes (three uint16 plus four unsigned char, assuming a
 * 2-byte unsigned short).
 * The two offsets are each split into a 16-bit low part and an 8-bit high
 * part. NOTE(review): presumably recombined as (high << 16) | low -- confirm
 * in the code that reads these fields.
 */
struct LmaNodeGE1 {
  uint16 son_1st_off_l;        // low bits of the son_1st_off
  uint16 homo_idx_buf_off_l;   // low bits of the homo_idx_buf_off
  uint16 spl_idx;
  unsigned char num_of_son;            // number of son nodes
  unsigned char num_of_homo;           // number of homo words
  unsigned char son_1st_off_h;         // high bits of the son_1st_off
  unsigned char homo_idx_buf_off_h;    // high bits of the homo_idx_buf_off
};

#ifdef ___BUILD_MODEL___
// Only compiled when ___BUILD_MODEL___ is defined.
// Groups a frequency, one char16 value and a SpellingId.
// NOTE(review): presumably one Hanzi character together with one of its
// spellings and the frequency of that pairing -- confirm in the builder code.
struct SingleCharItem {
  float freq;
  char16 hz;
  SpellingId splid;
};

// Only compiled when ___BUILD_MODEL___ is defined.
// One lemma record; every array is sized from kMaxLemmaSize.
struct LemmaEntry {
  // NOTE(review): presumably this lemma's index when ordered by Pinyin and by
  // Hanzi respectively -- confirm in the builder code.
  LemmaIdType idx_by_py;
  LemmaIdType idx_by_hz;
  // kMaxLemmaSize elements plus one extra (presumably for a terminator).
  char16 hanzi_str[kMaxLemmaSize + 1];

  // The SingleCharItem id for each Hanzi.
  uint16 hanzi_scis_ids[kMaxLemmaSize];

  // kMaxLemmaSize elements plus one extra (presumably for a terminator).
  uint16 spl_idx_arr[kMaxLemmaSize + 1];
  // One buffer per lemma position, each holding up to kMaxPinyinSize chars
  // plus one extra (presumably the terminating NUL).
  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
  // NOTE(review): presumably the number of elements used in hanzi_str.
  unsigned char hz_str_len;
  float freq;
};
#endif  // ___BUILD_MODEL___

}  //  namespace ime_pinyin

#endif  // PINYINIME_INCLUDE_DICTDEF_H__
first commit 2020-07-31 06:50:46 +00:00			`/*`
			`* Copyright (C) 2009 The Android Open Source Project`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`#ifndef PINYINIME_INCLUDE_DICTDEF_H__`
			`#define PINYINIME_INCLUDE_DICTDEF_H__`

			`#include <stdlib.h>`
			`#include "./utf16char.h"`

			`namespace ime_pinyin {`

			`// Enable the following line when building the binary dictionary model.`
			`// #define ___BUILD_MODEL___`

			`typedef unsigned char uint8;`
			`typedef unsigned short uint16;`
			`typedef unsigned int uint32;`

			`typedef signed char int8;`
			`typedef short int16;`
			`typedef int int32;`
			`typedef long long int64;`
			`typedef unsigned long long uint64;`

			`const bool kPrintDebug0 = false;`
			`const bool kPrintDebug1 = false;`
			`const bool kPrintDebug2 = false;`

			`// The max length of a lemma.`
			`const size_t kMaxLemmaSize = 8;`

			`// The max length of a Pinyin (spelling).`
			`const size_t kMaxPinyinSize = 6;`

			`// The number of half spelling ids. For Chinese Pinyin, there are 30 half ids.`
			`// See SpellingTrie.h for details.`
			`const size_t kHalfSpellingIdNum = 29;`

			`// The maximum number of full spellings. For Chinese Pinyin, there are only`
			`// about 410 spellings.`
			`// If this value is made bigger (needing more bits), please also update`
			`// other structures like SpellingNode, to make sure that a spelling id can be`
			`// stored.`
			`// -1 is because that 0 is never used.`
			`const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;`
			`const size_t kMaxSearchSteps = 40;`

			`// One character predicts its following characters.`
			`const size_t kMaxPredictSize = (kMaxLemmaSize - 1);`

			`// LemmaIdType must always be size_t.`
			`typedef size_t LemmaIdType;`
			`const size_t kLemmaIdSize = 3; // Actually, a Id occupies 3 bytes in storage.`
			`const size_t kLemmaIdComposing = 0xffffff;`

			`typedef uint16 LmaScoreType;`
			`typedef uint16 KeyScoreType;`

			`// Number of items with highest score are kept for prediction purpose.`
			`const size_t kTopScoreLemmaNum = 10;`

			`const size_t kMaxPredictNumByGt3 = 1;`
			`const size_t kMaxPredictNumBy3 = 2;`
			`const size_t kMaxPredictNumBy2 = 2;`

			`// The last lemma id (included) for the system dictionary. The system`
			`// dictionary's ids always start from 1.`
			`const LemmaIdType kSysDictIdEnd = 500000;`

			`// The first lemma id for the user dictionary.`
			`const LemmaIdType kUserDictIdStart = 500001;`

			`// The last lemma id (included) for the user dictionary.`
			`const LemmaIdType kUserDictIdEnd = 600000;`

			`typedef struct {`
			`uint16 half_splid:5;`
			`uint16 full_splid:11;`
			`} SpellingId, *PSpellingId;`


			`/**`
			`* We use different node types for different layers`
			`* Statistical data of the building result for a testing dictionary:`
			`* root, level 0, level 1, level 2, level 3`
			`* max son num of one node: 406 280 41 2 -`
			`* max homo num of one node: 0 90 23 2 2`
			`* total node num of a layer: 1 406 31766 13516 993`
			`* total homo num of a layer: 9 5674 44609 12667 995`
			`*`
			`* The node number for root and level 0 won't be larger than 500`
			`* According to the information above, two kinds of nodes can be used; one for`
			`* root and level 0, the other for these layers deeper than 0.`
			`*`
			`* LE = less than or equal to,`
			`* A node occupies 16 bytes, so in total less than 16 * 500 = 8K`
			`*/`
			`struct LmaNodeLE0 {`
			`uint32 son_1st_off;`
			`uint32 homo_idx_buf_off;`
			`uint16 spl_idx;`
			`uint16 num_of_son;`
			`uint16 num_of_homo;`
			`};`

			`/**`
			`* GE = greater than or equal to`
			`* A node occupies 8 bytes.`
			`*/`
			`struct LmaNodeGE1 {`
			`uint16 son_1st_off_l; // Low bits of the son_1st_off`
			`uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1`
			`uint16 spl_idx;`
			`unsigned char num_of_son; // number of son nodes`
			`unsigned char num_of_homo; // number of homo words`
			`unsigned char son_1st_off_h; // high bits of the son_1st_off`
			`unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off`
			`};`

			`#ifdef ___BUILD_MODEL___`
			`struct SingleCharItem {`
			`float freq;`
			`char16 hz;`
			`SpellingId splid;`
			`};`

			`struct LemmaEntry {`
			`LemmaIdType idx_by_py;`
			`LemmaIdType idx_by_hz;`
			`char16 hanzi_str[kMaxLemmaSize + 1];`

			`// The SingleCharItem id for each Hanzi.`
			`uint16 hanzi_scis_ids[kMaxLemmaSize];`

			`uint16 spl_idx_arr[kMaxLemmaSize + 1];`
			`char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];`
			`unsigned char hz_str_len;`
			`float freq;`
			`};`
			`#endif // ___BUILD_MODEL___`

			`} // namespace ime_pinyin`

			`#endif // PINYINIME_INCLUDE_DICTDEF_H__`