nim_duilib/base/util/string_util.cpp
jiajia_deng 4933d1f2bc Remove dependency on shared
Signed-off-by: jiajia_deng <2894220@gmail.com>
2019-09-20 16:27:58 +08:00

984 lines
22 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
*
* Author wrt(guangguang)
* Date 2011-06-08
* Copyright Hangzhou, Netease Inc.
* Brief Utilities for string operation
*
*/
#include "string_util.h"
#include <stdio.h> /* for vsnprintf */
#include <assert.h> /* for assert */
#include <memory.h> /* for mem*** */
#include <stdarg.h> /* va_list, va_start, va_end */
#include <ctype.h>
#include "base/macros.h"
#include "base/third_party/convert_utf/ConvertUTF.h"
namespace nbase
{
namespace
{
template<typename CharType>
int StringTokenizeT(const std::basic_string<CharType> &input,
const std::basic_string<CharType> &delimitor,
std::list<std::basic_string<CharType> > &output)
{
size_t token_begin;
size_t token_end;
output.clear();
for (token_begin = token_end = 0; token_end != std::basic_string<CharType>::npos;)
{
token_begin = input.find_first_not_of(delimitor, token_begin);
if (token_begin == std::basic_string<CharType>::npos)
break;
token_end = input.find_first_of(delimitor, token_begin + 1);
output.push_back(input.substr(token_begin, token_end - token_begin));
token_begin = token_end + 1;
}
return static_cast<int>(output.size());
}
template<typename CharType>
size_t StringReplaceAllT(const std::basic_string<CharType> &find,
const std::basic_string<CharType> &replace,
std::basic_string<CharType> &output)
{
size_t find_length = find.size();
size_t replace_length = replace.size();
size_t offset = 0, endpos;
size_t target = 0, found_pos;
size_t replaced = 0;
CharType *data_ptr;
if (find.empty() || output.empty())
return 0;
/*
* to avoid extra memory reallocating,
* we use two passes to finish the task in the case that replace.size() is greater find.size()
*/
if (find_length < replace_length)
{
/* the first pass, count all available 'find' to be replaced */
for (;;)
{
offset = output.find(find, offset);
if (offset == std::basic_string<CharType>::npos)
break;
replaced++;
offset += find_length;
}
if (replaced == 0)
return 0;
size_t newsize = output.size() + replaced * (replace_length - find_length);
/* we apply for more memory to hold the content to be replaced */
endpos = newsize;
offset = newsize - output.size();
output.resize(newsize);
data_ptr = &output[0];
memmove((void*)(data_ptr + offset),
(void*)data_ptr,
(output.size() - offset) * sizeof(CharType));
}
else
{
endpos = output.size();
offset = 0;
data_ptr = const_cast<CharType *>(&output[0]);
}
/* the second pass, the replacement */
while (offset < endpos)
{
found_pos = output.find(find, offset);
if (found_pos != std::basic_string<CharType>::npos)
{
/* move the content between two targets */
if (target != found_pos)
memmove((void*)(data_ptr + target),
(void*)(data_ptr + offset),
(found_pos - offset) * sizeof(CharType));
target += found_pos - offset;
/* replace */
memcpy(data_ptr + target,
replace.data(),
replace_length * sizeof(CharType));
target += replace_length;
offset = find_length + found_pos;
replaced++;
}
else
{
/* ending work */
if (target != offset)
memcpy((void*)(data_ptr + target),
(void*)(data_ptr + offset),
(endpos - offset) * sizeof(CharType));
break;
}
}
if (replace_length < find_length)
output.resize(output.size() - replaced * (find_length - replace_length));
return replaced;
}
inline int vsnprintfT(char *dst, size_t count, const char *format, va_list ap)
{
return vsnprintf(dst, count, format, ap);
}
inline int vsnprintfT(wchar_t *dst, size_t count, const wchar_t *format, va_list ap)
{
#if defined(OS_WIN)
return _vsnwprintf(dst, count, format, ap);
#else
return vswprintf(dst, count, format, ap);
#endif
}
template<typename CharType>
void StringAppendVT(const CharType *format, va_list ap, std::basic_string<CharType> &output)
{
CharType stack_buffer[1024];
/* first, we try to finish the task using a fixed-size buffer in the stack */
va_list ap_copy;
GG_VA_COPY(ap_copy, ap);
int result = vsnprintfT(stack_buffer, COUNT_OF(stack_buffer), format, ap_copy);
va_end(ap_copy);
if (result >= 0 && result < static_cast<int>(COUNT_OF(stack_buffer)))
{
/* It fits */
output.append(stack_buffer, result);
return;
}
/* then, we have to repeatedly increase buffer size until it fits. */
int buffer_size = COUNT_OF(stack_buffer);
std::basic_string<CharType> heap_buffer;
for (;;)
{
if (result != -1)
{
assert(0);
return; /* not expected, result should be -1 here */
}
buffer_size <<= 1; /* try doubling the buffer size */
if (buffer_size > 32 * 1024 * 1024)
{
assert(0);
return; /* too long */
}
/* resize */
heap_buffer.resize(buffer_size);
/*
* NOTE: You can only use a va_list once. Since we're in a while loop, we
* need to make a new copy each time so we don't use up the original.
*/
GG_VA_COPY(ap_copy, ap);
result = vsnprintfT(&heap_buffer[0], buffer_size, format, ap_copy);
va_end(ap_copy);
if ((result >= 0) && (result < buffer_size)) {
/* It fits */
output.append(&heap_buffer[0], result);
return;
}
}
}
#define NOT_SPACE(x) ((x) != 0x20 && ((x) < 0 || 0x1d < (x)))
template<typename CharType>
void StringTrimT(std::basic_string<CharType> &output)
{
if (output.empty())
return;
size_t bound1 = 0;
size_t bound2 = output.length();
const CharType *src = output.data();
for (; bound2 > 0; bound2--)
if (NOT_SPACE(src[bound2-1]))
break;
for (; bound1 < bound2; bound1++)
if (NOT_SPACE(src[bound1]))
break;
if (bound1 < bound2) {
memmove((void *)src,
src + bound1,
sizeof(CharType) * (bound2 - bound1));
}
output.resize(bound2 - bound1);
}
template<typename CharType>
void StringTrimLeftT(std::basic_string<CharType> &output)
{
size_t check = 0;
size_t length = output.length();
const CharType *src = output.data();
for (; check < length; check++)
if (NOT_SPACE(src[check]))
break;
output.erase(0, check);
}
template<typename CharType>
void StringTrimRightT(std::basic_string<CharType> &output)
{
size_t length = output.length();
const CharType *src = output.data();
for (; length > 0; length--)
if (NOT_SPACE(src[length-1]))
break;
output.resize(length);
}
} // anonymous namespace
std::string StringPrintf(const char *format, ...)
{
va_list args;
va_start(args, format);
std::string output;
StringAppendV(output, format, args);
va_end(args);
return output;
}
std::wstring StringPrintf(const wchar_t *format, ...)
{
va_list args;
va_start(args, format);
std::wstring output;
StringAppendV(output, format, args);
va_end(args);
return output;
}
const std::string& StringPrintf(std::string &output, const char *format, ...)
{
va_list args;
va_start(args, format);
output.clear();
StringAppendV(output, format, args);
va_end(args);
return output;
}
const std::wstring& StringPrintf(std::wstring &output, const wchar_t *format, ...)
{
va_list args;
va_start(args, format);
output.clear();
StringAppendV(output, format, args);
va_end(args);
return output;
}
void StringPrintfV(std::string &output, const char *format, va_list ap)
{
output.clear();
StringAppendV(output, format, ap);
}
void StringPrintfV(std::wstring &output, const wchar_t *format, va_list ap)
{
output.clear();
StringAppendV(output, format, ap);
}
void StringAppendF(std::string &output, const char *format, ...)
{
va_list args;
va_start(args, format);
StringAppendV(output, format, args);
va_end(args);
}
void StringAppendF(std::wstring &output, const wchar_t *format, ...)
{
va_list args;
va_start(args, format);
StringAppendV(output, format, args);
va_end(args);
}
void StringAppendV(std::string &output, const char *format, va_list ap)
{
StringAppendVT<char>(format, ap, output);
}
void StringAppendV(std::wstring &output, const wchar_t *format, va_list ap)
{
StringAppendVT<wchar_t>(format, ap, output);
}
std::list<std::string> StringTokenize(const char *input, const char *delimitor)
{
std::list<std::string> output;
std::string input2(input);
if (input2.empty())
return output;
char *token = strtok(&input2[0], delimitor);
while (token != NULL)
{
output.push_back(token);
token = strtok(NULL, delimitor);
}
return output;
}
std::list<std::wstring> StringTokenize(const wchar_t *input, const wchar_t *delimitor)
{
std::list<std::wstring> output;
std::wstring input2(input);
if (input2.empty())
return output;
#if defined(OS_WIN)
wchar_t *token = wcstok(&input2[0], delimitor);
while (token != NULL)
{
output.push_back(token);
token = wcstok(NULL, delimitor);
}
#else
wchar_t *ptr;
wchar_t *token = wcstok(const_cast<wchar_t *>(&input2[0]), delimitor, &ptr);
while (token != NULL)
{
output.push_back(token);
token = wcstok(NULL, delimitor, &ptr);
}
#endif
return output;
}
int StringTokenize(const std::string& input, const std::string& delimitor, std::list<std::string>& output)
{
return StringTokenizeT<char>(input, delimitor, output);
}
int StringTokenize(const std::wstring& input, const std::wstring& delimitor, std::list<std::wstring>& output)
{
return StringTokenizeT<wchar_t>(input, delimitor, output);
}
size_t StringReplaceAll(const std::string& find, const std::string& replace, std::string& output)
{
return StringReplaceAllT<char>(find, replace, output);
}
size_t StringReplaceAll(const std::wstring& find, const std::wstring& replace, std::wstring& output)
{
return StringReplaceAllT<wchar_t>(find, replace, output);
}
void LowerString(std::string &str)
{
if (str.empty())
return;
char *start = &str[0];
char *end = start + str.length();
for (; start < end; start++)
{
if (*start >= 'A' && *start <= 'Z')
*start += 'a' - 'A';
}
}
void LowerString(std::wstring &str)
{
if (str.empty())
return;
wchar_t *start = &str[0];
wchar_t *end = start + str.length();
for (; start < end; start++)
{
if (*start >= L'A' && *start <= L'Z')
*start += L'a' - L'A';
}
}
void UpperString(std::string &str)
{
if (str.empty())
return;
char *start = &str[0];
char *end = start + str.length();
for (; start < end; start++)
{
if (*start >= 'a' && *start <= 'z')
*start -= 'a' - 'A';
}
}
void UpperString(std::wstring &str)
{
if (str.empty())
return;
wchar_t *start = &str[0];
wchar_t *end = start + str.length();
for (; start < end; start++)
{
if (*start >= L'a' && *start <= L'z')
*start -= L'a' - L'A';
}
}
std::string MakeLowerString(const std::string &src)
{
std::string dest(src);
LowerString(dest);
return dest;
}
std::wstring MakeLowerString(const std::wstring &src)
{
std::wstring dest(src);
LowerString(dest);
return dest;
}
std::string MakeUpperString(const std::string &src)
{
std::string dest(src);
UpperString(dest);
return dest;
}
std::wstring MakeUpperString(const std::wstring &src)
{
std::wstring dest(src);
UpperString(dest);
return dest;
}
void BinaryToHexString(const void *binary, size_t length, std::string &output)
{
static const unsigned char kHexChars[] = "0123456789abcdef";
output.resize(length << 1);
if (length == 0)
return;
unsigned char *dst = reinterpret_cast<unsigned char *>(&output[0]);
const unsigned char *src = reinterpret_cast<const unsigned char *>(binary);
for (size_t i = 0; i < length; i++)
{
dst[i<<1] = kHexChars[src[i]>>4];
dst[(i<<1)+1] = kHexChars[src[i]&0xF];
}
}
std::string BinaryToHexString(const void *binary, size_t length)
{
std::string output;
BinaryToHexString(binary, length, output);
return output;
}
std::string BinaryToHexString(const std::string &binary)
{
std::string output;
BinaryToHexString(binary.c_str(), binary.size(), output);
return output;
}
int HexStringToBinary(const char *input, size_t length, std::string &output)
{
output.resize(length >> 1);
if (length == 0)
return 0;
const char *src = input;
char *dst = &output[0];
size_t i, size = output.size();
for (i = 0; i < size; i++)
{
char h = HexCharToInt8(src[i<<1]);
char l = HexCharToInt8(src[(i<<1)+1]);
if (l == -1 || h == -1)
break;
dst[i] = (h<<4)|l;
}
// i may be shorter than size when an error occurs
output.resize(i);
return (int)i;
}
std::string HexStringToBinary(const std::string &input)
{
std::string output;
HexStringToBinary(input.c_str(), input.size(), output);
return output;
}
char HexCharToInt8(char c)
{
if (c >= '0' && c <= '9')
return c - '0';
else if (c >= 'a' && c <= 'f')
return c - 'a' + 10;
else if (c >= 'A' && c <= 'F')
return c - 'A' + 10;
assert(0); // not a hexadecimal char
return -1;
}
std::wstring UTF8ToUTF16(const UTF8Char *utf8, size_t length)
{
UTF16Char output[4096];
const UTF8 *src_begin = reinterpret_cast<const UTF8 *>(utf8);
const UTF8 *src_end = src_begin + length;
UTF16 *dst_begin = reinterpret_cast<UTF16 *>(output);
std::wstring utf16;
while (src_begin < src_end)
{
ConversionResult result = ConvertUTF8toUTF16(&src_begin,
src_end,
&dst_begin,
dst_begin + COUNT_OF(output),
lenientConversion);
utf16.append(output, dst_begin - reinterpret_cast<UTF16 *>(output));
dst_begin = reinterpret_cast<UTF16 *>(output);
if (result == sourceIllegal || result == sourceExhausted)
{
utf16.clear();
break;
}
}
return utf16;
}
std::string UTF16ToUTF8(const UTF16Char *utf16, size_t length)
{
UTF8Char output[8192];
const UTF16 *src_begin = reinterpret_cast<const UTF16 *>(utf16);
const UTF16 *src_end = src_begin + length;
UTF8 *dst_begin = reinterpret_cast<UTF8 *>(output);
std::string utf8;
while (src_begin < src_end)
{
ConversionResult result = ConvertUTF16toUTF8(&src_begin,
src_end,
&dst_begin,
dst_begin + COUNT_OF(output),
lenientConversion);
utf8.append(output, dst_begin - reinterpret_cast<UTF8 *>(output));
dst_begin = reinterpret_cast<UTF8 *>(output);
if (result == sourceIllegal || result == sourceExhausted)
{
utf8.clear();
break;
}
}
return utf8;
}
std::basic_string<UTF32Char> UTF8ToUTF32(const UTF8Char *utf8, size_t length)
{
UTF32Char output[4096];
const UTF8 *src_begin = reinterpret_cast<const UTF8 *>(utf8);
const UTF8 *src_end = src_begin + length;
UTF32 *dst_begin = reinterpret_cast<UTF32 *>(output);
std::basic_string<UTF32Char> utf32;
while (src_begin < src_end)
{
ConversionResult result = ConvertUTF8toUTF32(&src_begin,
src_end,
&dst_begin,
dst_begin + COUNT_OF(output),
lenientConversion);
utf32.append(output, dst_begin - reinterpret_cast<UTF32 *>(output));
dst_begin = reinterpret_cast<UTF32 *>(output);
if (result == sourceIllegal || result == sourceExhausted)
{
utf32.clear();
break;
}
}
return utf32;
}
std::string UTF32ToUTF8(const UTF32Char *utf32, size_t length)
{
UTF8Char output[8192];
const UTF32 *src_begin = reinterpret_cast<const UTF32 *>(utf32);
const UTF32 *src_end = src_begin + length;
UTF8 *dst_begin = reinterpret_cast<UTF8 *>(output);
std::string utf8;
while (src_begin < src_end)
{
ConversionResult result = ConvertUTF32toUTF8(&src_begin,
src_end,
&dst_begin,
dst_begin + COUNT_OF(output),
lenientConversion);
utf8.append(output, dst_begin - reinterpret_cast<UTF8 *>(output));
dst_begin = reinterpret_cast<UTF8 *>(output);
if (result == sourceIllegal || result == sourceExhausted)
{
utf8.clear();
break;
}
}
return utf8;
}
std::basic_string<UTF32Char> UTF16ToUTF32(const UTF16Char *utf16, size_t length)
{
UTF32Char output[4096];
const UTF16 *src_begin = reinterpret_cast<const UTF16 *>(utf16);
const UTF16 *src_end = src_begin + length;
UTF32 *dst_begin = reinterpret_cast<UTF32 *>(output);
std::basic_string<UTF32Char> utf32;
while (src_begin < src_end)
{
ConversionResult result = ConvertUTF16toUTF32(&src_begin,
src_end,
&dst_begin,
dst_begin + COUNT_OF(output),
lenientConversion);
utf32.append(output, dst_begin - reinterpret_cast<UTF32 *>(output));
dst_begin = reinterpret_cast<UTF32 *>(output);
if (result == sourceIllegal || result == sourceExhausted)
{
utf32.clear();
break;
}
}
return utf32;
}
std::wstring UTF32ToUTF16(const UTF32Char *utf32, size_t length)
{
UTF16Char output[8192];
const UTF32 *src_begin = reinterpret_cast<const UTF32 *>(utf32);
const UTF32 *src_end = src_begin + length;
UTF16 *dst_begin = reinterpret_cast<UTF16 *>(output);
std::wstring utf16;
while (src_begin < src_end)
{
ConversionResult result = ConvertUTF32toUTF16(&src_begin,
src_end,
&dst_begin,
dst_begin + COUNT_OF(output),
lenientConversion);
utf16.append(output, dst_begin - reinterpret_cast<UTF16 *>(output));
dst_begin = reinterpret_cast<UTF16 *>(output);
if (result == sourceIllegal || result == sourceExhausted)
{
utf16.clear();
break;
}
}
return utf16;
}
std::wstring UTF8ToUTF16(const std::string &utf8)
{
return UTF8ToUTF16(utf8.c_str(), utf8.length());
}
std::string UTF16ToUTF8(const std::wstring &utf16)
{
return UTF16ToUTF8(utf16.c_str(), utf16.length());
}
std::basic_string<UTF32Char> UTF8ToUTF32(const std::string &utf8)
{
return UTF8ToUTF32(utf8.c_str(), utf8.length());
}
std::string UTF32ToUTF8(const std::basic_string<UTF32Char> &utf32)
{
return UTF32ToUTF8(utf32.c_str(), utf32.length());
}
std::basic_string<UTF32Char> UTF16ToUTF32(const std::wstring &utf16)
{
return UTF16ToUTF32(utf16.c_str(), utf16.length());
}
std::wstring UTF32ToUTF16(const std::basic_string<UTF32Char> &utf32)
{
return UTF32ToUTF16(utf32.c_str(), utf32.length());
}
void UTF8CreateLengthTable(unsigned table[256])
{
int c;
for (c = 0; c < 256; c++)
{
if (c < 0x80)
table[c] = 1;
else if (0xBF < c && c < 0xE0)
table[c] = 2;
else if (0xDF < c && c < 0xF0)
table[c] = 3;
else if (0xEF < c && c < 0xF8)
table[c] = 4;
else
table[c] = 0;
}
}
bool ValidateUTF8Stream(const void* stream, unsigned length)
{
/*
* 根据RFC3629http://tools.ietf.org/html/rfc3629
* UTF-8流一个字符的第一个字节由0-4个数值为1的二进制位
* 这些位之后的第一位必须是0当这些位的个数为0的时候表
* 示这个字符是ASCII占用一个字节除此之外表示这个字符
* 所占用的字节数第二个字节开始每个字节必须使用10的
* 二进制位开头,这样利于快速定位一个字符的起始字节,例如:
*
* 0XXXXXXX
* 110XXXXX 10XXXXXX
* 1110XXXX 10XXXXXX 10XXXXXX
* 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX
*
* 另UTF-8理论上支持6字节长度但是标准将其限定为4字节
*/
unsigned i, j, k;
unsigned char* s = (unsigned char*)stream;
static unsigned int table_created = 0;
static unsigned int table[256];
/* 保证多线程安全 */
if (!table_created)
{
/* 利用lock-free的思想创建一模一样的表 */
UTF8CreateLengthTable(table);
/* 标记,之后的线程将不会重复创建该表 */
table_created = 1;
}
/* 这里使用查表法是因为考虑到这个表会被放入CPU的Cache */
for (i = 0; i < length;)
{
k = table[s[i]];
if (k == 0)
break;
for (j = 1; j < k; j++)
if (table[s[i + j]])
return false;
i += j;
}
return i == length;
}
bool ValidateGB2312Stream(const void* stream, unsigned length)
{
/*
* 根据http://zh.wikipedia.org/zh-cn/Gb2312
* 01-09区为特殊符号。
* 16-55区为一级汉字按拼音排序。
* 56-87区为二级汉字按部首笔画排序。
* 10-15区及88-94区则未有编码。
*
* 每个汉字及符号以两个字节来表示。第一个字节称为“高位字节”,第二个字节称为“低位字节”。
* “高位字节”使用了0xA1-0xF7把01-87区的区号加上0xA0“低位字节”使用了0xA1-0xFE把01-94加上 0xA0
* 由于一级汉字从16区起始汉字区的“高位字节”的范围是0xB0-0xF7“低位字节”的范围是0xA1-0xFE
* 占用的码位是 72*94=6768。其中有5个空位是D7FA-D7FE。
*/
unsigned char* s = (unsigned char*)stream;
unsigned char* e = s + length;
for (; s < e; s++)
{
if (*s < 0x80)
continue;
if (*s < 0xA1 || 0xE7 < *s)
break;
if (++s == e)
return false;
if (*s < 0xA1 || 0xFE < *s)
break;
}
return s == e;
}
bool ValidateGBKStream(const void* stream, unsigned length)
{
unsigned char* s = (unsigned char*)stream;
unsigned char* e = s + length;
for (; s < e; s++)
{
if (*s < 0x80)
continue;
if (*s < 0x81 || 0xFE < *s)
break;
if (++s == e)
return false;
if (*s < 0x40 || 0xFE < *s)
break;
}
return s == e;
}
bool ValidateBIG5Stream(const void* stream, unsigned length)
{
unsigned char* s = (unsigned char*)stream;
unsigned char* e = s + length;
for (; s < e; s++)
{
if (*s < 0x80)
continue;
if (*s < 0xA1 || 0xF9 < *s)
break;
if (++s == e)
return false;
if (*s < 0x40 || 0xFE < *s || (0x7E < *s && *s < 0xA1))
break;
}
return s == e;
}
std::string StringTrimLeft(const char *input)
{
std::string output = input;
StringTrimLeft(output);
return output;
}
std::string StringTrimRight(const char *input)
{
std::string output = input;
StringTrimRight(output);
return output;
}
std::string StringTrim(const char *input) /* both left and right */
{
std::string output = input;
StringTrim(output);
return output;
}
std::string& StringTrimLeft(std::string &input)
{
StringTrimLeftT<char>(input);
return input;
}
std::string& StringTrimRight(std::string &input)
{
StringTrimRightT<char>(input);
return input;
}
std::string& StringTrim(std::string &input) /* both left and right */
{
StringTrimT<char>(input);
return input;
}
std::wstring StringTrimLeft(const wchar_t *input)
{
std::wstring output = input;
StringTrimLeft(output);
return output;
}
std::wstring StringTrimRight(const wchar_t *input)
{
std::wstring output = input;
StringTrimRight(output);
return output;
}
std::wstring StringTrim(const wchar_t *input) /* both left and right */
{
std::wstring output = input;
StringTrim(output);
return output;
}
std::wstring& StringTrimLeft(std::wstring &input)
{
StringTrimLeftT<wchar_t>(input);
return input;
}
std::wstring& StringTrimRight(std::wstring &input)
{
StringTrimRightT<wchar_t>(input);
return input;
}
std::wstring& StringTrim(std::wstring &input) /* both left and right */
{
StringTrimT<wchar_t>(input);
return input;
}
} // namespace nbase