/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "dictdef.h" #ifdef _WIN32 #define snprintf _snprintf #endif #ifdef ___BUILD_MODEL___ #include "spellingtable.h" #endif #include "spellingtrie.h" namespace ime_pinyin { SpellingTrie* SpellingTrie::instance_ = NULL; // z/c/s is for Zh/Ch/Sh const char SpellingTrie::kHalfId2Sc_[kFullSplIdStart + 1] = "0ABCcDEFGHIJKLMNOPQRSsTUVWXYZz"; // Bit 0 : is it a Shengmu char? // Bit 1 : is it a Yunmu char? (one char is a Yunmu) // Bit 2 : is it enabled in ShouZiMu(first char) mode? unsigned char SpellingTrie::char_flags_[] = { // a b c d e f g 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, // h i j k l m n 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, // o p q r s t 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, // u v w x y z 0x00, 0x00, 0x01, 0x01, 0x01, 0x01 }; int compare_spl(const void* p1, const void* p2) { return strcmp((const char*)(p1), (const char*)(p2)); } SpellingTrie::SpellingTrie() { spelling_buf_ = NULL; spelling_size_ = 0; spelling_num_ = 0; spl_ym_ids_ = NULL; splstr_queried_ = NULL; splstr16_queried_ = NULL; root_ = NULL; dumb_node_ = NULL; splitter_node_ = NULL; instance_ = NULL; ym_buf_ = NULL; f2h_ = NULL; szm_enable_shm(true); szm_enable_ym(true); #ifdef ___BUILD_MODEL___ node_num_ = 0; #endif } SpellingTrie::~SpellingTrie() { if (NULL != spelling_buf_) delete [] spelling_buf_; if (NULL != splstr_queried_) delete [] splstr_queried_; if (NULL != splstr16_queried_) delete [] splstr16_queried_; if (NULL != spl_ym_ids_) delete [] spl_ym_ids_; if (NULL != root_) { free_son_trie(root_); delete root_; } if (NULL != dumb_node_) { delete [] dumb_node_; } if (NULL != splitter_node_) { delete [] splitter_node_; } if (NULL != instance_) { delete instance_; instance_ = NULL; } if (NULL != ym_buf_) delete [] ym_buf_; if (NULL != f2h_) delete [] f2h_; } bool SpellingTrie::if_valid_id_update(uint16 *splid) const { if (NULL == splid || 0 == *splid) return false; if (*splid >= kFullSplIdStart) return true; if (*splid < kFullSplIdStart) { char ch = kHalfId2Sc_[*splid]; if (ch > 'Z') { return true; } else { if (szm_is_enabled(ch)) { return true; } else if (is_yunmu_char(ch)) { assert(h2f_num_[*splid] > 0); *splid = h2f_start_[*splid]; return true; } } } return false; } bool SpellingTrie::is_half_id(uint16 splid) const { if (0 == splid || splid >= kFullSplIdStart) return false; return true; } bool SpellingTrie::is_full_id(uint16 splid) const { if (splid < kFullSplIdStart || splid >= kFullSplIdStart + spelling_num_) return false; return true; } bool SpellingTrie::half_full_compatible(uint16 half_id, uint16 full_id) const { uint16 half_fr_full = full_to_half(full_id); if (half_fr_full == half_id) return true; // &~0x20 is used to conver the char to upper case. // So that Zh/Ch/Sh(whose char is z/c/s) can be matched with Z/C/S. char ch_f = (kHalfId2Sc_[half_fr_full] & (~0x20)); char ch_h = kHalfId2Sc_[half_id]; if (ch_f == ch_h) return true; return false; } bool SpellingTrie::is_half_id_yunmu(uint16 splid) const { if (0 == splid || splid >= kFullSplIdStart) return false; char ch = kHalfId2Sc_[splid]; // If ch >= 'a', that means the half id is one of Zh/Ch/Sh if (ch >= 'a') { return false; } return char_flags_[ch - 'A'] & kHalfIdYunmuMask; } bool SpellingTrie::is_shengmu_char(char ch) const { return char_flags_[ch - 'A'] & kHalfIdShengmuMask; } bool SpellingTrie::is_yunmu_char(char ch) const { return char_flags_[ch - 'A'] & kHalfIdYunmuMask; } bool SpellingTrie::is_szm_char(char ch) const { return is_shengmu_char(ch) || is_yunmu_char(ch); } bool SpellingTrie::szm_is_enabled(char ch) const { return char_flags_[ch - 'A'] & kHalfIdSzmMask; } void SpellingTrie::szm_enable_shm(bool enable) { if (enable) { for (char ch = 'A'; ch <= 'Z'; ch++) { if (is_shengmu_char(ch)) char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask; } } else { for (char ch = 'A'; ch <= 'Z'; ch++) { if (is_shengmu_char(ch)) char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff); } } } void SpellingTrie::szm_enable_ym(bool enable) { if (enable) { for (char ch = 'A'; ch <= 'Z'; ch++) { if (is_yunmu_char(ch)) char_flags_[ch - 'A'] = char_flags_[ch - 'A'] | kHalfIdSzmMask; } } else { for (char ch = 'A'; ch <= 'Z'; ch++) { if (is_yunmu_char(ch)) char_flags_[ch - 'A'] = char_flags_[ch - 'A'] & (kHalfIdSzmMask ^ 0xff); } } } bool SpellingTrie::is_szm_enabled(char ch) const { return char_flags_[ch - 'A'] & kHalfIdSzmMask; } const SpellingTrie* SpellingTrie::get_cpinstance() { return &get_instance(); } SpellingTrie& SpellingTrie::get_instance() { if (NULL == instance_) instance_ = new SpellingTrie(); return *instance_; } uint16 SpellingTrie::half2full_num(uint16 half_id) const { if (NULL == root_ || half_id >= kFullSplIdStart) return 0; return h2f_num_[half_id]; } uint16 SpellingTrie::half_to_full(uint16 half_id, uint16 *spl_id_start) const { if (NULL == spl_id_start || NULL == root_ || half_id >= kFullSplIdStart) return 0; *spl_id_start = h2f_start_[half_id]; return h2f_num_[half_id]; } uint16 SpellingTrie::full_to_half(uint16 full_id) const { if (NULL == root_ || full_id < kFullSplIdStart || full_id > spelling_num_ + kFullSplIdStart) return 0; return f2h_[full_id - kFullSplIdStart]; } void SpellingTrie::free_son_trie(SpellingNode* node) { if (NULL == node) return; for (size_t pos = 0; pos < node->num_of_son; pos++) { free_son_trie(node->first_son + pos); } if (NULL != node->first_son) delete [] node->first_son; } bool SpellingTrie::construct(const char* spelling_arr, size_t item_size, size_t item_num, float score_amplifier, unsigned char average_score) { if (spelling_arr == NULL) return false; memset(h2f_start_, 0, sizeof(uint16) * kFullSplIdStart); memset(h2f_num_, 0, sizeof(uint16) * kFullSplIdStart); // If the arr is the same as the buf, means this function is called by // load_table(), the table data are ready; otherwise the array should be // saved. if (spelling_arr != spelling_buf_) { if (NULL != spelling_buf_) delete [] spelling_buf_; spelling_buf_ = new char[item_size * item_num]; if (NULL == spelling_buf_) return false; memcpy(spelling_buf_, spelling_arr, sizeof(char) * item_size * item_num); } spelling_size_ = item_size; spelling_num_ = item_num; score_amplifier_ = score_amplifier; average_score_ = average_score; if (NULL != splstr_queried_) delete [] splstr_queried_; splstr_queried_ = new char[spelling_size_]; if (NULL == splstr_queried_) return false; if (NULL != splstr16_queried_) delete [] splstr16_queried_; splstr16_queried_ = new char16[spelling_size_]; if (NULL == splstr16_queried_) return false; // First, sort the buf to ensure they are in ascendant order qsort(spelling_buf_, spelling_num_, spelling_size_, compare_spl); #ifdef ___BUILD_MODEL___ node_num_ = 1; #endif root_ = new SpellingNode(); memset(root_, 0, sizeof(SpellingNode)); dumb_node_ = new SpellingNode(); memset(dumb_node_, 0, sizeof(SpellingNode)); dumb_node_->score = average_score_; splitter_node_ = new SpellingNode(); memset(splitter_node_, 0, sizeof(SpellingNode)); splitter_node_->score = average_score_; memset(level1_sons_, 0, sizeof(SpellingNode*) * kValidSplCharNum); root_->first_son = construct_spellings_subset(0, spelling_num_, 0, root_); // Root's score should be cleared. root_->score = 0; if (NULL == root_->first_son) return false; h2f_start_[0] = h2f_num_[0] = 0; if (!build_f2h()) return false; #ifdef ___BUILD_MODEL___ if (kPrintDebug0) { printf("---SpellingTrie Nodes: %d\n", (int)node_num_); } return build_ym_info(); #else return true; #endif } #ifdef ___BUILD_MODEL___ const char* SpellingTrie::get_ym_str(const char *spl_str) { bool start_ZCS = false; if (is_shengmu_char(*spl_str)) { if ('Z' == *spl_str || 'C' == *spl_str || 'S' == *spl_str) start_ZCS = true; spl_str += 1; if (start_ZCS && 'h' == *spl_str) spl_str += 1; } return spl_str; } bool SpellingTrie::build_ym_info() { bool sucess; SpellingTable *spl_table = new SpellingTable(); sucess = spl_table->init_table(kMaxPinyinSize - 1, 2 * kMaxYmNum, false); assert(sucess); for (uint16 pos = 0; pos < spelling_num_; pos++) { const char *spl_str = spelling_buf_ + spelling_size_ * pos; spl_str = get_ym_str(spl_str); if ('\0' != spl_str[0]) { sucess = spl_table->put_spelling(spl_str, 0); assert(sucess); } } size_t ym_item_size; // '\0' is included size_t ym_num; const char* ym_buf; ym_buf = spl_table->arrange(&ym_item_size, &ym_num); if (NULL != ym_buf_) delete [] ym_buf_; ym_buf_ = new char[ym_item_size * ym_num]; if (NULL == ym_buf_) { delete spl_table; return false; } memcpy(ym_buf_, ym_buf, sizeof(char) * ym_item_size * ym_num); ym_size_ = ym_item_size; ym_num_ = ym_num; delete spl_table; // Generate the maping from the spelling ids to the Yunmu ids. if (spl_ym_ids_) delete spl_ym_ids_; spl_ym_ids_ = new uint8[spelling_num_ + kFullSplIdStart]; if (NULL == spl_ym_ids_) return false; memset(spl_ym_ids_, 0, sizeof(uint8) * (spelling_num_ + kFullSplIdStart)); for (uint16 id = 1; id < spelling_num_ + kFullSplIdStart; id++) { const char *str = get_spelling_str(id); str = get_ym_str(str); if ('\0' != str[0]) { uint8 ym_id = get_ym_id(str); spl_ym_ids_[id] = ym_id; assert(ym_id > 0); } else { spl_ym_ids_[id] = 0; } } return true; } #endif SpellingNode* SpellingTrie::construct_spellings_subset( size_t item_start, size_t item_end, size_t level, SpellingNode* parent) { if (level >= spelling_size_ || item_end <= item_start || NULL == parent) return NULL; SpellingNode *first_son = NULL; uint16 num_of_son = 0; unsigned char min_son_score = 255; const char *spelling_last_start = spelling_buf_ + spelling_size_ * item_start; char char_for_node = spelling_last_start[level]; assert((char_for_node >= 'A' && char_for_node <= 'Z') || 'h' == char_for_node); // Scan the array to find how many sons for (size_t i = item_start + 1; i < item_end; i++) { const char *spelling_current = spelling_buf_ + spelling_size_ * i; char char_current = spelling_current[level]; if (char_current != char_for_node) { num_of_son++; char_for_node = char_current; } } num_of_son++; // Allocate memory #ifdef ___BUILD_MODEL___ node_num_ += num_of_son; #endif first_son = new SpellingNode[num_of_son]; memset(first_son, 0, sizeof(SpellingNode)*num_of_son); // Now begin construct tree size_t son_pos = 0; spelling_last_start = spelling_buf_ + spelling_size_ * item_start; char_for_node = spelling_last_start[level]; bool spelling_endable = true; if (spelling_last_start[level + 1] != '\0') spelling_endable = false; size_t item_start_next = item_start; for (size_t i = item_start + 1; i < item_end; i++) { const char *spelling_current = spelling_buf_ + spelling_size_ * i; char char_current = spelling_current[level]; assert(is_valid_spl_char(char_current)); if (char_current != char_for_node) { // Construct a node SpellingNode *node_current = first_son + son_pos; node_current->char_this_node = char_for_node; // For quick search in the first level if (0 == level) level1_sons_[char_for_node - 'A'] = node_current; if (spelling_endable) { node_current->spelling_idx = kFullSplIdStart + item_start_next; } if (spelling_last_start[level + 1] != '\0' || i - item_start_next > 1) { size_t real_start = item_start_next; if (spelling_last_start[level + 1] == '\0') real_start++; node_current->first_son = construct_spellings_subset(real_start, i, level + 1, node_current); if (real_start == item_start_next + 1) { uint16 score_this = static_cast( spelling_last_start[spelling_size_ - 1]); if (score_this < node_current->score) node_current->score = score_this; } } else { node_current->first_son = NULL; node_current->score = static_cast( spelling_last_start[spelling_size_ - 1]); } if (node_current->score < min_son_score) min_son_score = node_current->score; bool is_half = false; if (level == 0 && is_szm_char(char_for_node)) { node_current->spelling_idx = static_cast(char_for_node - 'A' + 1); if (char_for_node > 'C') node_current->spelling_idx++; if (char_for_node > 'S') node_current->spelling_idx++; h2f_num_[node_current->spelling_idx] = i - item_start_next; is_half = true; } else if (level == 1 && char_for_node == 'h') { char ch_level0 = spelling_last_start[0]; uint16 part_id = 0; if (ch_level0 == 'C') part_id = 'C' - 'A' + 1 + 1; else if (ch_level0 == 'S') part_id = 'S' - 'A' + 1 + 2; else if (ch_level0 == 'Z') part_id = 'Z' - 'A' + 1 + 3; if (0 != part_id) { node_current->spelling_idx = part_id; h2f_num_[node_current->spelling_idx] = i - item_start_next; is_half = true; } } if (is_half) { if (h2f_num_[node_current->spelling_idx] > 0) h2f_start_[node_current->spelling_idx] = item_start_next + kFullSplIdStart; else h2f_start_[node_current->spelling_idx] = 0; } // for next sibling spelling_last_start = spelling_current; char_for_node = char_current; item_start_next = i; spelling_endable = true; if (spelling_current[level + 1] != '\0') spelling_endable = false; son_pos++; } } // the last one SpellingNode *node_current = first_son + son_pos; node_current->char_this_node = char_for_node; // For quick search in the first level if (0 == level) level1_sons_[char_for_node - 'A'] = node_current; if (spelling_endable) { node_current->spelling_idx = kFullSplIdStart + item_start_next; } if (spelling_last_start[level + 1] != '\0' || item_end - item_start_next > 1) { size_t real_start = item_start_next; if (spelling_last_start[level + 1] == '\0') real_start++; node_current->first_son = construct_spellings_subset(real_start, item_end, level + 1, node_current); if (real_start == item_start_next + 1) { uint16 score_this = static_cast( spelling_last_start[spelling_size_ - 1]); if (score_this < node_current->score) node_current->score = score_this; } } else { node_current->first_son = NULL; node_current->score = static_cast( spelling_last_start[spelling_size_ - 1]); } if (node_current->score < min_son_score) min_son_score = node_current->score; assert(son_pos + 1 == num_of_son); bool is_half = false; if (level == 0 && szm_is_enabled(char_for_node)) { node_current->spelling_idx = static_cast(char_for_node - 'A' + 1); if (char_for_node > 'C') node_current->spelling_idx++; if (char_for_node > 'S') node_current->spelling_idx++; h2f_num_[node_current->spelling_idx] = item_end - item_start_next; is_half = true; } else if (level == 1 && char_for_node == 'h') { char ch_level0 = spelling_last_start[0]; uint16 part_id = 0; if (ch_level0 == 'C') part_id = 'C' - 'A' + 1 + 1; else if (ch_level0 == 'S') part_id = 'S' - 'A' + 1 + 2; else if (ch_level0 == 'Z') part_id = 'Z' - 'A' + 1 + 3; if (0 != part_id) { node_current->spelling_idx = part_id; h2f_num_[node_current->spelling_idx] = item_end - item_start_next; is_half = true; } } if (is_half) { if (h2f_num_[node_current->spelling_idx] > 0) h2f_start_[node_current->spelling_idx] = item_start_next + kFullSplIdStart; else h2f_start_[node_current->spelling_idx] = 0; } parent->num_of_son = num_of_son; parent->score = min_son_score; return first_son; } bool SpellingTrie::save_spl_trie(FILE *fp) { if (NULL == fp || NULL == spelling_buf_) return false; if (fwrite(&spelling_size_, sizeof(uint32), 1, fp) != 1) return false; if (fwrite(&spelling_num_, sizeof(uint32), 1, fp) != 1) return false; if (fwrite(&score_amplifier_, sizeof(float), 1, fp) != 1) return false; if (fwrite(&average_score_, sizeof(unsigned char), 1, fp) != 1) return false; if (fwrite(spelling_buf_, sizeof(char) * spelling_size_, spelling_num_, fp) != spelling_num_) return false; return true; } bool SpellingTrie::load_spl_trie(FILE *fp) { if (NULL == fp) return false; if (fread(&spelling_size_, sizeof(uint32), 1, fp) != 1) return false; if (fread(&spelling_num_, sizeof(uint32), 1, fp) != 1) return false; if (fread(&score_amplifier_, sizeof(float), 1, fp) != 1) return false; if (fread(&average_score_, sizeof(unsigned char), 1, fp) != 1) return false; if (NULL != spelling_buf_) delete [] spelling_buf_; spelling_buf_ = new char[spelling_size_ * spelling_num_]; if (NULL == spelling_buf_) return false; if (fread(spelling_buf_, sizeof(char) * spelling_size_, spelling_num_, fp) != spelling_num_) return false; return construct(spelling_buf_, spelling_size_, spelling_num_, score_amplifier_, average_score_); } bool SpellingTrie::build_f2h() { if (NULL != f2h_) delete [] f2h_; f2h_ = new uint16[spelling_num_]; if (NULL == f2h_) return false; for (uint16 hid = 0; hid < kFullSplIdStart; hid++) { for (uint16 fid = h2f_start_[hid]; fid < h2f_start_[hid] + h2f_num_[hid]; fid++) f2h_[fid - kFullSplIdStart] = hid; } return true; } size_t SpellingTrie::get_spelling_num() { return spelling_num_; } uint8 SpellingTrie::get_ym_id(const char *ym_str) { if (NULL == ym_str || NULL == ym_buf_) return 0; for (uint8 pos = 0; pos < ym_num_; pos++) if (strcmp(ym_buf_ + ym_size_ * pos, ym_str) == 0) return pos + 1; return 0; } const char* SpellingTrie::get_spelling_str(uint16 splid) { splstr_queried_[0] = '\0'; if (splid >= kFullSplIdStart) { splid -= kFullSplIdStart; snprintf(splstr_queried_, spelling_size_, "%s", spelling_buf_ + splid * spelling_size_); } else { if (splid == 'C' - 'A' + 1 + 1) { snprintf(splstr_queried_, spelling_size_, "%s", "Ch"); } else if (splid == 'S' - 'A' + 1 + 2) { snprintf(splstr_queried_, spelling_size_, "%s", "Sh"); } else if (splid == 'Z' - 'A' + 1 + 3) { snprintf(splstr_queried_, spelling_size_, "%s", "Zh"); } else { if (splid > 'C' - 'A' + 1) splid--; if (splid > 'S' - 'A' + 1) splid--; splstr_queried_[0] = 'A' + splid - 1; splstr_queried_[1] = '\0'; } } return splstr_queried_; } const char16* SpellingTrie::get_spelling_str16(uint16 splid) { splstr16_queried_[0] = '\0'; if (splid >= kFullSplIdStart) { splid -= kFullSplIdStart; for (size_t pos = 0; pos < spelling_size_; pos++) { splstr16_queried_[pos] = static_cast (spelling_buf_[splid * spelling_size_ + pos]); } } else { if (splid == 'C' - 'A' + 1 + 1) { splstr16_queried_[0] = static_cast('C'); splstr16_queried_[1] = static_cast('h'); splstr16_queried_[2] = static_cast('\0'); } else if (splid == 'S' - 'A' + 1 + 2) { splstr16_queried_[0] = static_cast('S'); splstr16_queried_[1] = static_cast('h'); splstr16_queried_[2] = static_cast('\0'); } else if (splid == 'Z' - 'A' + 1 + 3) { splstr16_queried_[0] = static_cast('Z'); splstr16_queried_[1] = static_cast('h'); splstr16_queried_[2] = static_cast('\0'); } else { if (splid > 'C' - 'A' + 1) splid--; if (splid > 'S' - 'A' + 1) splid--; splstr16_queried_[0] = 'A' + splid - 1; splstr16_queried_[1] = '\0'; } } return splstr16_queried_; } size_t SpellingTrie::get_spelling_str16(uint16 splid, char16 *splstr16, size_t splstr16_len) { if (NULL == splstr16 || splstr16_len < kMaxPinyinSize + 1) return 0; if (splid >= kFullSplIdStart) { splid -= kFullSplIdStart; for (size_t pos = 0; pos <= kMaxPinyinSize; pos++) { splstr16[pos] = static_cast (spelling_buf_[splid * spelling_size_ + pos]); if (static_cast('\0') == splstr16[pos]) { return pos; } } } else { if (splid == 'C' - 'A' + 1 + 1) { splstr16[0] = static_cast('C'); splstr16[1] = static_cast('h'); splstr16[2] = static_cast('\0'); return 2; } else if (splid == 'S' - 'A' + 1 + 2) { splstr16[0] = static_cast('S'); splstr16[1] = static_cast('h'); splstr16[2] = static_cast('\0'); return 2; } else if (splid == 'Z' - 'A' + 1 + 3) { splstr16[0] = static_cast('Z'); splstr16[1] = static_cast('h'); splstr16[2] = static_cast('\0'); return 2; } else { if (splid > 'C' - 'A' + 1) splid--; if (splid > 'S' - 'A' + 1) splid--; splstr16[0] = 'A' + splid - 1; splstr16[1] = '\0'; return 1; } } // Not reachable. return 0; } } // namespace ime_pinyin