28#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
39#if !defined UTF_CPP_CPLUSPLUS
40 #define UTF_CPP_CPLUSPLUS __cplusplus
43#if UTF_CPP_CPLUSPLUS >= 201103L
44 #define UTF_CPP_OVERRIDE override
45 #define UTF_CPP_NOEXCEPT noexcept
46 #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
48 #define UTF_CPP_OVERRIDE
49 #define UTF_CPP_NOEXCEPT throw()
51 #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition);
58#if UTF_CPP_CPLUSPLUS >= 201103L
59 #if UTF_CPP_CPLUSPLUS >= 202002L
88 template<
typename octet_type>
94 template<
typename u16_type>
100 template<
typename octet_type>
131 template <
typename octet_iterator>
137 else if ((lead >> 5) == 0x6)
139 else if ((lead >> 4) == 0xe)
141 else if ((lead >> 3) == 0x1e)
153 else if (cp < 0x800) {
157 else if (cp < 0x10000) {
167 template <
typename octet_iterator>
179 #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
182 template <
typename octet_iterator>
193 template <
typename octet_iterator>
203 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
208 template <
typename octet_iterator>
222 code_point =
static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
227 template <
typename octet_iterator>
245 code_point =
static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
250 #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
252 template <
typename octet_iterator>
260 octet_iterator original_it = it;
306 template <
typename octet_iterator>
312 template <
typename word_iterator>
316 typedef typename std::iterator_traits<word_iterator>::value_type word_type;
323 word_iterator original_it = it;
329 code_point = first_word;
355 template <
typename octet_iterator,
typename octet_type>
358 *(result++) =
static_cast<octet_type
>(cp);
359 else if (cp < 0x800) {
360 *(result++) =
static_cast<octet_type
>((cp >> 6) | 0xc0);
361 *(result++) =
static_cast<octet_type
>((cp & 0x3f) | 0x80);
363 else if (cp < 0x10000) {
364 *(result++) =
static_cast<octet_type
>((cp >> 12) | 0xe0);
365 *(result++) =
static_cast<octet_type
>(((cp >> 6) & 0x3f) | 0x80);
366 *(result++) =
static_cast<octet_type
>((cp & 0x3f) | 0x80);
369 *(result++) =
static_cast<octet_type
>((cp >> 18) | 0xf0);
370 *(result++) =
static_cast<octet_type
>(((cp >> 12) & 0x3f)| 0x80);
371 *(result++) =
static_cast<octet_type
>(((cp >> 6) & 0x3f) | 0x80);
372 *(result++) =
static_cast<octet_type
>((cp & 0x3f) | 0x80);
386 template<
typename container_type>
387 std::back_insert_iterator<container_type>
append
388 (
utfchar32_t cp, std::back_insert_iterator<container_type> result) {
390 typename container_type::value_type>(cp, result);
396 template <
typename octet_iterator>
404 template <
typename word_iterator,
typename word_type>
408 *(result++) =
static_cast<word_type
>(cp);
411 *(result++) =
static_cast<word_type
>(
LEAD_OFFSET + (cp >> 10));
419 template<
typename container_type>
421 (
utfchar32_t cp, std::back_insert_iterator<container_type> result) {
423 typename container_type::value_type>(cp, result);
429 template <
typename word_iterator>
441 template <
typename octet_iterator>
444 octet_iterator result = start;
445 while (result != end) {
455 const char* end = str + std::strlen(str);
461 std::string::const_iterator invalid =
find_invalid(s.begin(), s.end());
462 return (invalid == s.end()) ? std::string::npos :
static_cast<std::size_t
>(invalid - s.begin());
465 template <
typename octet_iterator>
466 inline bool is_valid(octet_iterator start, octet_iterator end)
478 return is_valid(s.begin(), s.end());
483 template <
typename octet_iterator>
#define UTF_CPP_STATIC_ASSERT(condition)
Definition core.h:51
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END)
Definition core.h:179
bool is_lead_surrogate(utfchar32_t cp)
Definition core.h:106
utf_error get_sequence_2(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:194
octet_iterator append(utfchar32_t cp, octet_iterator result)
Definition core.h:356
const utfchar32_t SURROGATE_OFFSET
Definition core.h:83
const utfchar16_t LEAD_SURROGATE_MIN
Definition core.h:78
bool is_surrogate(utfchar32_t cp)
Definition core.h:116
bool is_in_bmp(utfchar32_t cp)
Definition core.h:126
bool is_trail_surrogate(utfchar32_t cp)
Definition core.h:111
utf_error get_sequence_1(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
get_sequence_x functions decode utf-8 sequences of the length x
Definition core.h:183
utfchar8_t mask8(octet_type oc)
Definition core.h:89
utf_error get_sequence_4(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:228
const utfchar32_t CODE_POINT_MAX
Definition core.h:86
utfchar16_t mask16(u16_type oc)
Definition core.h:95
word_iterator append16(utfchar32_t cp, word_iterator result)
Definition core.h:405
utf_error
Definition core.h:164
@ INCOMPLETE_SEQUENCE
Definition core.h:164
@ INVALID_LEAD
Definition core.h:164
@ OVERLONG_SEQUENCE
Definition core.h:164
@ INVALID_CODE_POINT
Definition core.h:164
@ NOT_ENOUGH_ROOM
Definition core.h:164
@ UTF8_OK
Definition core.h:164
utf_error get_sequence_3(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:209
utf_error validate_next16(word_iterator &it, word_iterator end, utfchar32_t &code_point)
Definition core.h:313
bool is_code_point_valid(utfchar32_t cp)
Definition core.h:121
bool is_trail(octet_type oc)
Definition core.h:101
const utfchar16_t TRAIL_SURROGATE_MAX
Definition core.h:81
const utfchar16_t LEAD_SURROGATE_MAX
Definition core.h:79
utf_error validate_next(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:253
bool is_overlong_sequence(utfchar32_t cp, int length)
Definition core.h:147
int sequence_length(octet_iterator lead_it)
Definition core.h:132
const utfchar16_t TRAIL_SURROGATE_MIN
Definition core.h:80
const utfchar16_t LEAD_OFFSET
Definition core.h:82
utf_error increase_safely(octet_iterator &it, const octet_iterator end)
Helper for get_sequence_x.
Definition core.h:168
unsigned int utfchar32_t
Definition core.h:69
unsigned short utfchar16_t
Definition core.h:68
bool starts_with_bom(octet_iterator it, octet_iterator end)
Definition core.h:484
unsigned char utfchar8_t
Definition core.h:67
const utfchar8_t bom[]
The library API - functions intended to be called by the users.
Definition core.h:439
bool is_valid(octet_iterator start, octet_iterator end)
Definition core.h:466
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
Definition core.h:442