342 lines
9.4 KiB
C++
342 lines
9.4 KiB
C++
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include <assert.h>
|
|
#include "splparser.h"
|
|
|
|
namespace ime_pinyin {
|
|
|
|
// Binds the parser to the spelling trie returned by
// SpellingTrie::get_cpinstance(); every parsing routine walks that trie.
SpellingParser::SpellingParser()
    : spl_trie_(SpellingTrie::get_cpinstance()) {
}
|
|
|
|
bool SpellingParser::is_valid_to_parse(char ch) {
|
|
return SpellingTrie::is_valid_spl_char(ch);
|
|
}
|
|
|
|
// Splits a spelling string into a sequence of spelling ids by walking the
// spelling trie.
//
// splstr      Input characters; str_len characters are examined (the string
//             does not have to be NUL-terminated).
// spl_idx     Receives the spelling ids; at most max_size entries are written.
// start_pos   Optional (may be NULL). start_pos[i] receives the offset in
//             splstr where the i-th spelling starts, and the entry following
//             the last spelling receives the offset just past it, so the
//             buffer needs room for max_size + 1 entries.
// max_size    Capacity of spl_idx.
// last_is_pre Set to true only when the whole input was consumed and the last
//             character seen was not a splitter; false on every other path
//             (including invalid arguments and early termination).
//             NOTE(review): presumably signals that the final spelling could
//             still be extended by more input -- confirm with callers.
//
// Returns the number of spelling ids stored in spl_idx. Parsing stops early
// (returning what was found so far) when spl_idx is full or when the input
// can not be continued as a valid spelling.
uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
                                      uint16 spl_idx[], uint16 start_pos[],
                                      uint16 max_size, bool &last_is_pre) {
  // Give the out-parameter a defined value before any early return so that
  // callers never read an indeterminate bool.
  last_is_pre = false;

  if (NULL == splstr || NULL == spl_idx || 0 == max_size || 0 == str_len)
    return 0;

  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
    return 0;

  const SpellingNode *node_this = spl_trie_->root_;

  uint16 str_pos = 0;
  uint16 idx_num = 0;
  if (NULL != start_pos)
    start_pos[0] = 0;
  bool last_is_splitter = false;

  while (str_pos < str_len) {
    char char_this = splstr[str_pos];
    // Every character rejected by is_valid_spl_char() acts as a splitter.
    if (!SpellingTrie::is_valid_spl_char(char_this)) {
      // Test whether a spelling can end at the current node.
      uint16 id_this = node_this->spelling_idx;
      if (spl_trie_->if_valid_id_update(&id_this)) {
        spl_idx[idx_num] = id_this;

        idx_num++;
        // The splitter itself is consumed; the next spelling starts after it.
        str_pos++;
        if (NULL != start_pos)
          start_pos[idx_num] = str_pos;
        if (idx_num >= max_size)
          return idx_num;

        node_this = spl_trie_->root_;
        last_is_splitter = true;
        continue;
      } else {
        if (last_is_splitter) {
          // Consecutive splitters: skip this one and move the start of the
          // pending spelling forward.
          str_pos++;
          if (NULL != start_pos)
            start_pos[idx_num] = str_pos;
          continue;
        } else {
          // A splitter in the middle of an incomplete spelling: give up.
          return idx_num;
        }
      }
    }

    last_is_splitter = false;

    SpellingNode *found_son = NULL;

    if (0 == str_pos) {
      // The very first character is looked up directly in the first-level
      // table, indexed by letter regardless of case.
      if (char_this >= 'a')
        found_son = spl_trie_->level1_sons_[char_this - 'a'];
      else
        found_son = spl_trie_->level1_sons_[char_this - 'A'];
    } else {
      SpellingNode *first_son = node_this->first_son;
      // Linear scan over the sons of the current node, front to back.
      for (int i = 0; i < node_this->num_of_son; i++) {
        SpellingNode *this_son = first_son + i;
        if (SpellingTrie::is_same_spl_char(
            this_son->char_this_node, char_this)) {
          found_son = this_son;
          break;
        }
      }
    }

    if (NULL != found_son) {
      // Found: descend to the matching son.
      node_this = found_son;
    } else {
      // Not found: the current spelling must end here, otherwise the input
      // is not parsable any further.
      uint16 id_this = node_this->spelling_idx;
      if (spl_trie_->if_valid_id_update(&id_this)) {
        spl_idx[idx_num] = id_this;

        idx_num++;
        if (NULL != start_pos)
          start_pos[idx_num] = str_pos;
        if (idx_num >= max_size)
          return idx_num;
        // Restart from the root and re-examine the same character as the
        // first letter of the next spelling (str_pos is not advanced).
        node_this = spl_trie_->root_;
        continue;
      } else {
        return idx_num;
      }
    }

    str_pos++;
  }

  // End of input: emit the spelling the walk stopped on, if it is valid.
  uint16 id_this = node_this->spelling_idx;
  if (spl_trie_->if_valid_id_update(&id_this)) {
    spl_idx[idx_num] = id_this;

    idx_num++;
    if (NULL != start_pos)
      start_pos[idx_num] = str_pos;
  }

  last_is_pre = !last_is_splitter;

  return idx_num;
}
|
|
|
|
// Same as splstr_to_idxs(), followed by a pass that rewrites every id for
// which is_half_id_yunmu() holds through half_to_full(). When the last id is
// rewritten this way, last_is_pre is forced to false.
//
// Returns the number of ids stored in spl_idx.
uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
                                        uint16 spl_idx[], uint16 start_pos[],
                                        uint16 max_size, bool &last_is_pre) {
  const uint16 count = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
                                      max_size, last_is_pre);

  for (uint16 i = 0; i < count; i++) {
    uint16 *slot = spl_idx + i;
    if (!spl_trie_->is_half_id_yunmu(*slot))
      continue;

    // Replace the half id in place with its full counterpart.
    spl_trie_->half_to_full(*slot, slot);
    if (i + 1 == count)
      last_is_pre = false;
  }

  return count;
}
|
|
|
|
// 16-bit character variant of splstr_to_idxs(): splits a spelling string into
// a sequence of spelling ids by walking the spelling trie.
//
// splstr      Input characters; str_len characters are examined (the string
//             does not have to be NUL-terminated).
// spl_idx     Receives the spelling ids; at most max_size entries are written.
// start_pos   Optional (may be NULL). start_pos[i] receives the offset in
//             splstr where the i-th spelling starts, and the entry following
//             the last spelling receives the offset just past it, so the
//             buffer needs room for max_size + 1 entries.
// max_size    Capacity of spl_idx.
// last_is_pre Set to true only when the whole input was consumed and the last
//             character seen was not a splitter; false on every other path
//             (including invalid arguments and early termination).
//             NOTE(review): presumably signals that the final spelling could
//             still be extended by more input -- confirm with callers.
//
// Returns the number of spelling ids stored in spl_idx. Parsing stops early
// (returning what was found so far) when spl_idx is full or when the input
// can not be continued as a valid spelling.
uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
                                        uint16 spl_idx[], uint16 start_pos[],
                                        uint16 max_size, bool &last_is_pre) {
  // Give the out-parameter a defined value before any early return so that
  // callers never read an indeterminate bool.
  last_is_pre = false;

  if (NULL == splstr || NULL == spl_idx || 0 == max_size || 0 == str_len)
    return 0;

  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
    return 0;

  const SpellingNode *node_this = spl_trie_->root_;

  uint16 str_pos = 0;
  uint16 idx_num = 0;
  if (NULL != start_pos)
    start_pos[0] = 0;
  bool last_is_splitter = false;

  while (str_pos < str_len) {
    char16 char_this = splstr[str_pos];
    // Every character rejected by is_valid_spl_char() acts as a splitter.
    if (!SpellingTrie::is_valid_spl_char(char_this)) {
      // Test whether a spelling can end at the current node.
      uint16 id_this = node_this->spelling_idx;
      if (spl_trie_->if_valid_id_update(&id_this)) {
        spl_idx[idx_num] = id_this;

        idx_num++;
        // The splitter itself is consumed; the next spelling starts after it.
        str_pos++;
        if (NULL != start_pos)
          start_pos[idx_num] = str_pos;
        if (idx_num >= max_size)
          return idx_num;

        node_this = spl_trie_->root_;
        last_is_splitter = true;
        continue;
      } else {
        if (last_is_splitter) {
          // Consecutive splitters: skip this one and move the start of the
          // pending spelling forward.
          str_pos++;
          if (NULL != start_pos)
            start_pos[idx_num] = str_pos;
          continue;
        } else {
          // A splitter in the middle of an incomplete spelling: give up.
          return idx_num;
        }
      }
    }

    last_is_splitter = false;

    SpellingNode *found_son = NULL;

    if (0 == str_pos) {
      // The very first character is looked up directly in the first-level
      // table, indexed by letter regardless of case.
      if (char_this >= 'a')
        found_son = spl_trie_->level1_sons_[char_this - 'a'];
      else
        found_son = spl_trie_->level1_sons_[char_this - 'A'];
    } else {
      SpellingNode *first_son = node_this->first_son;
      // Linear scan over the sons of the current node, front to back.
      for (int i = 0; i < node_this->num_of_son; i++) {
        SpellingNode *this_son = first_son + i;
        if (SpellingTrie::is_same_spl_char(
            this_son->char_this_node, char_this)) {
          found_son = this_son;
          break;
        }
      }
    }

    if (NULL != found_son) {
      // Found: descend to the matching son.
      node_this = found_son;
    } else {
      // Not found: the current spelling must end here, otherwise the input
      // is not parsable any further.
      uint16 id_this = node_this->spelling_idx;
      if (spl_trie_->if_valid_id_update(&id_this)) {
        spl_idx[idx_num] = id_this;

        idx_num++;
        if (NULL != start_pos)
          start_pos[idx_num] = str_pos;
        if (idx_num >= max_size)
          return idx_num;
        // Restart from the root and re-examine the same character as the
        // first letter of the next spelling (str_pos is not advanced).
        node_this = spl_trie_->root_;
        continue;
      } else {
        return idx_num;
      }
    }

    str_pos++;
  }

  // End of input: emit the spelling the walk stopped on, if it is valid.
  uint16 id_this = node_this->spelling_idx;
  if (spl_trie_->if_valid_id_update(&id_this)) {
    spl_idx[idx_num] = id_this;

    idx_num++;
    if (NULL != start_pos)
      start_pos[idx_num] = str_pos;
  }

  last_is_pre = !last_is_splitter;

  return idx_num;
}
|
|
|
|
// Same as splstr16_to_idxs(), followed by a pass that rewrites every id for
// which is_half_id_yunmu() holds through half_to_full(). When the last id is
// rewritten this way, last_is_pre is forced to false.
//
// Returns the number of ids stored in spl_idx.
uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
                                          uint16 spl_idx[], uint16 start_pos[],
                                          uint16 max_size, bool &last_is_pre) {
  const uint16 count = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
                                        max_size, last_is_pre);

  for (uint16 i = 0; i < count; i++) {
    uint16 *slot = spl_idx + i;
    if (!spl_trie_->is_half_id_yunmu(*slot))
      continue;

    // Replace the half id in place with its full counterpart.
    spl_trie_->half_to_full(*slot, slot);
    if (i + 1 == count)
      last_is_pre = false;
  }

  return count;
}
|
|
|
|
// Parses splstr as exactly one spelling and returns its id.
//
// Returns 0 when is_pre is NULL, when the parser does not produce exactly one
// id, or when that single spelling does not cover all str_len characters.
// *is_pre is whatever splstr_to_idxs() reports through its last_is_pre
// argument.
uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
                                        bool *is_pre) {
  if (NULL == is_pre)
    return 0;

  // Ask for up to two ids so that an input holding more than one spelling
  // can be told apart from a single one.
  uint16 ids[2];
  uint16 bounds[3];

  const uint16 parsed = splstr_to_idxs(splstr, str_len, ids, bounds, 2,
                                       *is_pre);
  // bounds[1] is only inspected once exactly one spelling has been found.
  if (1 != parsed || bounds[1] != str_len)
    return 0;

  return ids[0];
}
|
|
|
|
// Like get_splid_by_str(), but an id for which is_half_id_yunmu() holds is
// converted through half_to_full() and, in that case, *is_pre is cleared.
//
// Returns 0 when is_pre is NULL, when the parser does not produce exactly one
// id, or when that single spelling does not cover all str_len characters.
uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
                                          bool *is_pre) {
  if (NULL == is_pre)
    return 0;

  // Ask for up to two ids so that an input holding more than one spelling
  // can be told apart from a single one.
  uint16 ids[2];
  uint16 bounds[3];

  const uint16 parsed = splstr_to_idxs(splstr, str_len, ids, bounds, 2,
                                       *is_pre);
  // bounds[1] is only inspected once exactly one spelling has been found.
  if (1 != parsed || bounds[1] != str_len)
    return 0;

  if (spl_trie_->is_half_id_yunmu(ids[0])) {
    // Rewrite the half id in place with its full counterpart.
    spl_trie_->half_to_full(ids[0], ids);
    *is_pre = false;
  }

  return ids[0];
}
|
|
|
|
// Looks up the spelling id for splstr and stores it in splidx[0].
//
// splstr      Input characters; str_len characters are examined.
// splidx      Output buffer with capacity max_size; only splidx[0] is written.
// full_id_num Set to 1 when the resulting id is >= kFullSplIdStart, else 0.
// is_pre      Receives the flag reported by get_splid_by_str(); false when
//             the arguments are rejected up front.
//
// Returns 1 when an id was found, 0 otherwise. Both out-parameters have a
// defined value on every return path.
uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
                                           uint16 splidx[], uint16 max_size,
                                           uint16 &full_id_num, bool &is_pre) {
  full_id_num = 0;
  is_pre = false;

  // Reject unusable arguments before touching splstr[0] or splidx[0].
  if (NULL == splstr || NULL == splidx || 0 == str_len || 0 == max_size)
    return 0;

  if (!is_valid_to_parse(splstr[0]))
    return 0;

  splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
  if (0 == splidx[0])
    return 0;

  if (splidx[0] >= kFullSplIdStart)
    full_id_num = 1;
  return 1;
}
|
|
|
|
} // namespace ime_pinyin
|