/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef PINYINIME_INCLUDE_USERDICT_H__ #define PINYINIME_INCLUDE_USERDICT_H__ #define ___CACHE_ENABLED___ #define ___SYNC_ENABLED___ #define ___PREDICT_ENABLED___ // Debug performance for operations // #define ___DEBUG_PERF___ #ifdef _WIN32 #include // timeval #else #include #endif #include "atomdictbase.h" namespace ime_pinyin { class UserDict : public AtomDictBase { public: UserDict(); ~UserDict(); bool load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id); bool close_dict(); size_t number_of_lemmas(); void reset_milestones(uint16 from_step, MileStoneHandle from_handle); MileStoneHandle extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num); size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max); uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf, uint16 str_max); uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid); size_t predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used); // Full spelling ids are required LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count); LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected); LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len); LmaScoreType get_lemma_score(LemmaIdType lemma_id); LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); bool remove_lemma(LemmaIdType lemma_id); size_t get_total_lemma_count(); void set_total_lemma_count_of_others(size_t count); void flush_cache(); void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio); void reclaim(); void defragment(); #ifdef ___SYNC_ENABLED___ void clear_sync_lemmas(unsigned int start, unsigned int end); int get_sync_count(); LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); /** * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. * * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' * @param len length of lemmas string in UTF-16LE * @return newly added lemma count */ int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len); /** * Get lemmas need sync to a UTF-16LE string of above format. * Note: input buffer (str) must not be too small. If str is too small to * contain single one lemma, there might be a dead loop. * * @param str buffer to write lemmas * @param size buffer size in UTF-16LE * @param count output value of lemma returned * @return UTF-16LE string length */ int get_sync_lemmas_in_utf16le_string_from_beginning( char16 * str, int size, int * count); #endif struct UserDictStat { uint32 version; const char * file_name; struct timeval load_time; struct timeval last_update; uint32 disk_size; uint32 lemma_count; uint32 lemma_size; uint32 delete_count; uint32 delete_size; #ifdef ___SYNC_ENABLED___ uint32 sync_count; #endif uint32 reclaim_ratio; uint32 limit_lemma_count; uint32 limit_lemma_size; }; bool state(UserDictStat * stat); private: uint32 total_other_nfreq_; struct timeval load_time_; LemmaIdType start_id_; uint32 version_; uint8 * lemmas_; // In-Memory-Only flag for each lemma static const uint8 kUserDictLemmaFlagRemove = 1; // Inuse lemmas' offset uint32 * offsets_; // Highest bit in offset tells whether corresponding lemma is removed static const uint32 kUserDictOffsetFlagRemove = (1 << 31); // Maximum possible for the offset static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); // Bit width for last modified time, from 1 to 16 static const uint32 kUserDictLMTBitWidth = 16; // Granularity for last modified time in second static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; // Maximum frequency count static const uint16 kUserDictMaxFrequency = 0xFFFF; #define COARSE_UTC(year, month, day, hour, minute, second) \ ( \ (year - 1970) * 365 * 24 * 60 * 60 + \ (month - 1) * 30 * 24 * 60 * 60 + \ (day - 1) * 24 * 60 * 60 + \ (hour - 0) * 60 * 60 + \ (minute - 0) * 60 + \ (second - 0) \ ) static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); // Correspond to offsets_ uint32 * scores_; // Following two fields are only valid in memory uint32 * ids_; #ifdef ___PREDICT_ENABLED___ uint32 * predicts_; #endif #ifdef ___SYNC_ENABLED___ uint32 * syncs_; size_t sync_count_size_; #endif uint32 * offsets_by_id_; size_t lemma_count_left_; size_t lemma_size_left_; const char * dict_file_; // Be sure size is 4xN struct UserDictInfo { // When limitation reached, how much percentage will be reclaimed (1 ~ 100) uint32 reclaim_ratio; // maximum lemma count, 0 means no limitation uint32 limit_lemma_count; // Maximum lemma size, it's different from // whole disk file size or in-mem dict size // 0 means no limitation uint32 limit_lemma_size; // Total lemma count including deleted and inuse // Also indicate offsets_ size uint32 lemma_count; // Total size of lemmas including used and freed uint32 lemma_size; // Freed lemma count uint32 free_count; // Freed lemma size in byte uint32 free_size; #ifdef ___SYNC_ENABLED___ uint32 sync_count; #endif int32 total_nfreq; } dict_info_; static const uint32 kUserDictVersion = 0x0ABCDEF0; static const uint32 kUserDictPreAlloc = 32; static const uint32 kUserDictAverageNchar = 8; enum UserDictState { // Keep in order USER_DICT_NONE = 0, USER_DICT_SYNC, #ifdef ___SYNC_ENABLED___ USER_DICT_SYNC_DIRTY, #endif USER_DICT_SCORE_DIRTY, USER_DICT_OFFSET_DIRTY, USER_DICT_LEMMA_DIRTY, USER_DICT_DEFRAGMENTED, } state_; struct UserDictSearchable { uint16 splids_len; uint16 splid_start[kMaxLemmaSize]; uint16 splid_count[kMaxLemmaSize]; // Compact inital letters for both FuzzyCompareSpellId and cache system uint32 signature[kMaxLemmaSize / 4]; }; #ifdef ___CACHE_ENABLED___ enum UserDictCacheType { USER_DICT_CACHE, USER_DICT_MISS_CACHE, }; static const int kUserDictCacheSize = 4; static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; struct UserDictMissCache { uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; uint16 head, tail; } miss_caches_[kMaxLemmaSize]; struct UserDictCache { uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; uint32 offsets[kUserDictCacheSize]; uint32 lengths[kUserDictCacheSize]; // Ring buffer uint16 head, tail; } caches_[kMaxLemmaSize]; void cache_init(); void cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length); bool cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length); bool load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length); void save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length); void reset_cache(); bool load_miss_cache(UserDictSearchable *searchable); void save_miss_cache(UserDictSearchable *searchable); void reset_miss_cache(); #endif LmaScoreType translate_score(int f); int extract_score_freq(int raw_score); uint64 extract_score_lmt(int raw_score); inline int build_score(uint64 lmt, int freq); inline int64 utf16le_atoll(uint16 *s, int len); inline int utf16le_lltoa(int64 v, uint16 *s, int size); LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend); int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); int _get_lemma_score(LemmaIdType lemma_id); int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1, const UserDictSearchable *searchable); bool is_prefix_spell_id(const uint16 * fullids, uint16 fulllen, const UserDictSearchable *searchable); uint32 get_dict_file_size(UserDictInfo * info); bool reset(const char *file); bool validate(const char *file); bool load(const char *file, LemmaIdType start_id); bool is_valid_state(); bool is_valid_lemma_id(LemmaIdType id); LemmaIdType get_max_lemma_id(); void set_lemma_flag(uint32 offset, uint8 flag); char get_lemma_flag(uint32 offset); char get_lemma_nchar(uint32 offset); uint16 * get_lemma_spell_ids(uint32 offset); uint16 * get_lemma_word(uint32 offset); // Prepare searchable to fasten locate process void prepare_locate(UserDictSearchable *searchable, const uint16 * splids, uint16 len); // Compare initial letters only int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1, const UserDictSearchable *searchable); // Compare exactly two spell ids // First argument must be a full id spell id bool equal_spell_id(const uint16 * fullids, uint16 fulllen, const UserDictSearchable *searchable); // Find first item by initial letters int32 locate_first_in_offsets(const UserDictSearchable *searchable); LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt); // Check if a lemma is in dictionary int32 locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len); bool remove_lemma_by_offset_index(int offset_index); #ifdef ___PREDICT_ENABLED___ uint32 locate_where_to_insert_in_predicts(const uint16 * words, int lemma_len); int32 locate_first_in_predicts(const uint16 * words, int lemma_len); void remove_lemma_from_predict_list(uint32 offset); #endif #ifdef ___SYNC_ENABLED___ void queue_lemma_for_sync(LemmaIdType id); void remove_lemma_from_sync_list(uint32 offset); void write_back_sync(int fd); #endif void write_back_score(int fd); void write_back_offset(int fd); void write_back_lemma(int fd); void write_back_all(int fd); void write_back(); struct UserDictScoreOffsetPair { int score; uint32 offset_index; }; inline void swap(UserDictScoreOffsetPair * sop, int i, int j); void shift_down(UserDictScoreOffsetPair * sop, int i, int n); // On-disk format for each lemma // +-------------+ // | Version (4) | // +-------------+ // +-----------+-----------+--------------------+-------------------+ // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | // +-----------+-----------+--------------------+-------------------+ // ... // +-----------------------+ +-------------+ <---Offset of offset // | Offset1 by_splids (4) | ... | OffsetN (4) | // +-----------------------+ +-------------+ #ifdef ___PREDICT_ENABLED___ // +----------------------+ +-------------+ // | Offset1 by_lemma (4) | ... | OffsetN (4) | // +----------------------+ +-------------+ #endif // +------------+ +------------+ // | Score1 (4) | ... | ScoreN (4) | // +------------+ +------------+ #ifdef ___SYNC_ENABLED___ // +-------------+ +-------------+ // | NewAdd1 (4) | ... | NewAddN (4) | // +-------------+ +-------------+ #endif // +----------------+ // | Dict Info (4x) | // +----------------+ }; } #endif