/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_NGRAM_H__
#define PINYINIME_INCLUDE_NGRAM_H__

#include <stdio.h>
#include <stdlib.h>
#include "./dictdef.h"

namespace ime_pinyin {

// Element type of the per-lemma index array (see NGram::lma_freq_idx_).
// NOTE(review): presumably each value is an index into a code book of
// kCodeBookSize entries -- confirm against the implementation.
typedef unsigned char CODEBOOK_TYPE;

// Number of entries in the code book.
static const size_t kCodeBookSize = 256;

// Declares the unigram language-model interface: score constants, the
// save/load entry points, and the unigram lookup for a lemma id.
// All member functions are defined outside this header.
class NGram {
 public:
  // The maximum score of a lemma item.
  static const LmaScoreType kMaxScore = 0x3fff;

  // In order to reduce the storage size, the original log value is amplified
  // by kLogValueAmplifier, and we use LmaScoreType to store the result.
  // After this process, an item with a lower score has a higher frequency.
  static const int kLogValueAmplifier = -800;

  // System words' total frequency. It is not the real total frequency;
  // instead, it is only used to adjust system lemmas' scores when the user
  // dictionary's total frequency changes.
  // In this version, frequencies of system lemmas are fixed. We are
  // considering making them changeable in the next version.
  static const size_t kSysDictTotalFreq = 100000000;

 private:
  // NOTE(review): presumably the object handed out by get_instance();
  // the definition is not visible in this header -- confirm.
  static NGram* instance_;

  bool initialized_;
  uint32 idx_num_;

  // NOTE(review): presumably the value stored by set_total_freq_none_sys()
  // -- confirm against the implementation.
  size_t total_freq_none_sys_;

  // Score compensation for system dictionary lemmas.
  // After the user adds some user lemmas, the total frequency changes, and
  // we use this value to normalize the score.
  float sys_score_compensation_;

#ifdef ___BUILD_MODEL___
  // Only present in model-building builds.
  double *freq_codes_df_;
#endif
  LmaScoreType *freq_codes_;
  CODEBOOK_TYPE *lma_freq_idx_;

 public:
  // NOTE(review): the constructor is public even though get_instance()
  // exists, so callers are not prevented from creating further objects.
  NGram();
  ~NGram();

  static NGram& get_instance();

  // NOTE(review): presumably write/read the model to/from fp, with the
  // bool result reporting success -- confirm against the implementation.
  bool save_ngram(FILE *fp);
  bool load_ngram(FILE *fp);

  // Set the total frequency of all non-system dictionaries.
  void set_total_freq_none_sys(size_t freq_none_sys);

  // Look up the unigram value of the lemma lma_id.
  // NOTE(review): whether the returned float is a probability or a score
  // cannot be told from this header -- confirm against the implementation.
  float get_uni_psb(LemmaIdType lma_id);

  // Convert a probability to score. Actually, the score will be limited to
  // kMaxScore, but at runtime, we also need float expression to get accurate
  // value of the score.
  // After the conversion, a lower score indicates a higher probability of the
  // item.
  static float convert_psb_to_score(double psb);

#ifdef ___BUILD_MODEL___
  // For constructing the unigram model. Only available in model-building
  // builds.
  bool build_unigram(LemmaEntry *lemma_arr, size_t num,
                     LemmaIdType next_idx_unused);
#endif
};

}  // namespace ime_pinyin

#endif  // PINYINIME_INCLUDE_NGRAM_H__