From a0d01a57a5792268b7dd0c144dfda00e7e171d80 Mon Sep 17 00:00:00 2001 From: aozhiwei Date: Tue, 13 Oct 2020 17:16:06 +0800 Subject: [PATCH] 1 --- a8/strutils.cc | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ a8/strutils.h | 2 +- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/a8/strutils.cc b/a8/strutils.cc index 37361e8..06c90b6 100644 --- a/a8/strutils.cc +++ b/a8/strutils.cc @@ -398,4 +398,60 @@ namespace a8 return strcmp(s1 ? s1 : "", s2 ? s2 : ""); } + size_t GetUtf8Length(const char *str) + { + static unsigned char utf8_look_for_table[] = + { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 + }; + + #define UTFLEN(x) utf8_look_for_table[(x)] + + auto utf8_char_len = + [] (char firstByte) -> int + { + const unsigned char kFirstBitMask = 128; // 1000000 + const unsigned char kSecondBitMask = 64; // 0100000 + const unsigned char kThirdBitMask = 32; // 0010000 + const unsigned char kFourthBitMask = 16; // 0001000 + const unsigned char kFifthBitMask = 8; // 0000100 + + std::string::difference_type offset = 1; + + if (firstByte & kFirstBitMask) { // This means the first byte has a value greater than 127, and so is beyond the ASCII range. + if (firstByte & kThirdBitMask) { // This means that the first byte has a value greater than 224, and so it must be at least a three-octet code point. + if (firstByte & kFourthBitMask) { // This means that the first byte has a value greater than 240, and so it must be a four-octet code point. + offset = 4; + }else{ + offset = 3; + } + }else{ + offset = 2; + } + } + return offset; + }; + + int clen = strlen(str); + int len = 0; + for (const char *ptr = str; + *ptr!=0&&len