LIEF: Library to Instrument Executable Formats Version 1.0.0
Loading...
Searching...
No Matches
core.h
Go to the documentation of this file.
1// Copyright 2006 Nemanja Trifunovic
2
3/*
4Permission is hereby granted, free of charge, to any person or organization
5obtaining a copy of the software and accompanying documentation covered by
6this license (the "Software") to use, reproduce, display, distribute,
7execute, and transmit the Software, and to prepare derivative works of the
8Software, and to permit third-parties to whom the Software is furnished to
9do so, all subject to the following:
10
11The copyright notices in the Software and this entire statement, including
12the above license grant, this restriction and the following disclaimer,
13must be included in all copies of the Software, in whole or in part, and
14all derivative works of the Software, unless such copies or derivative
15works are solely in the form of machine-executable object code generated by
16a source language processor.
17
18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24DEALINGS IN THE SOFTWARE.
25*/
26
27
28#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
30
31#include <iterator>
32#include <cstring>
33#include <string>
34
// Select the C++ standard version the library is built against.
// A user-provided UTF_CPP_CPLUSPLUS wins; otherwise fall back to the
// predefined __cplusplus macro (which is not reliable on every compiler).
#ifndef UTF_CPP_CPLUSPLUS
    #define UTF_CPP_CPLUSPLUS __cplusplus
#endif

// Portability macros: map to the real keywords on C++11 and later and to the
// closest C++98/03 equivalents otherwise.
#if UTF_CPP_CPLUSPLUS < 201103L // C++ 98/03
    #define UTF_CPP_OVERRIDE
    #define UTF_CPP_NOEXCEPT throw()
    // Not worth simulating static_assert:
    #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition);
#else // C++ 11 or later
    #define UTF_CPP_OVERRIDE override
    #define UTF_CPP_NOEXCEPT noexcept
    #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
#endif // C++ 98/03 vs. C++ 11 or later
53
54
55namespace utf8
56{
// Code unit types for 8-bit, 16-bit and 32-bit encodings.
// The 8-bit unit is char8_t only from C++20 on; every earlier standard uses
// unsigned char.
#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
    typedef char8_t         utfchar8_t;
#else // C++ 98/03/11/14/17
    typedef unsigned char   utfchar8_t;
#endif

// The 16-bit and 32-bit units use the dedicated character types once they
// exist (C++11), and plain unsigned integers before that.
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
    typedef char16_t        utfchar16_t;
    typedef char32_t        utfchar32_t;
#else // C++ 98/03
    typedef unsigned short  utfchar16_t;
    typedef unsigned int    utfchar32_t;
#endif
71
72// Helper code - not intended to be directly called by the library users. May be changed at any time
73namespace internal
74{
75 // Unicode constants
76 // Leading (high) surrogates: 0xd800 - 0xdbff
77 // Trailing (low) surrogates: 0xdc00 - 0xdfff
82 const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10)
83 const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
84
85 // Maximum valid value for a Unicode code point
86 const utfchar32_t CODE_POINT_MAX = 0x0010ffffu;
87
88 template<typename octet_type>
89 inline utfchar8_t mask8(octet_type oc)
90 {
91 return static_cast<utfchar8_t>(0xff & oc);
92 }
93
94 template<typename u16_type>
95 inline utfchar16_t mask16(u16_type oc)
96 {
97 return static_cast<utfchar16_t>(0xffff & oc);
98 }
99
100 template<typename octet_type>
101 inline bool is_trail(octet_type oc)
102 {
103 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
104 }
105
107 {
108 return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(LEAD_SURROGATE_MAX));
109 }
110
112 {
113 return (cp >= static_cast<utfchar32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
114 }
115
116 inline bool is_surrogate(utfchar32_t cp)
117 {
118 return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
119 }
120
122 {
123 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
124 }
125
126 inline bool is_in_bmp(utfchar32_t cp)
127 {
128 return cp < utfchar32_t(0x10000);
129 }
130
131 template <typename octet_iterator>
132 int sequence_length(octet_iterator lead_it)
133 {
134 const utfchar8_t lead = utf8::internal::mask8(*lead_it);
135 if (lead < 0x80)
136 return 1;
137 else if ((lead >> 5) == 0x6)
138 return 2;
139 else if ((lead >> 4) == 0xe)
140 return 3;
141 else if ((lead >> 3) == 0x1e)
142 return 4;
143 else
144 return 0;
145 }
146
147 inline bool is_overlong_sequence(utfchar32_t cp, int length)
148 {
149 if (cp < 0x80) {
150 if (length != 1)
151 return true;
152 }
153 else if (cp < 0x800) {
154 if (length != 2)
155 return true;
156 }
157 else if (cp < 0x10000) {
158 if (length != 3)
159 return true;
160 }
161 return false;
162 }
163
// Result codes returned by the decoding and validation helpers.
// NOTE(review): this definition was missing from the listing; the enumerator
// names are the ones used throughout this file, but their order is
// reconstructed - confirm against the upstream header.
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

167 template <typename octet_iterator>
168 utf_error increase_safely(octet_iterator& it, const octet_iterator end)
169 {
170 if (++it == end)
171 return NOT_ENOUGH_ROOM;
172
173 if (!utf8::internal::is_trail(*it))
174 return INCOMPLETE_SEQUENCE;
175
176 return UTF8_OK;
177 }
178
// Advances IT via increase_safely() and makes the enclosing function return
// the error code if the step did not yield UTF8_OK.
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) \
    {                                                   \
        utf_error ret = increase_safely(IT, END);       \
        if (ret != UTF8_OK)                             \
            return ret;                                 \
    }
180
182 template <typename octet_iterator>
183 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
184 {
185 if (it == end)
186 return NOT_ENOUGH_ROOM;
187
188 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
189
190 return UTF8_OK;
191 }
192
193 template <typename octet_iterator>
194 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
195 {
196 if (it == end)
197 return NOT_ENOUGH_ROOM;
198
199 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
200
202
203 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
204
205 return UTF8_OK;
206 }
207
208 template <typename octet_iterator>
209 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
210 {
211 if (it == end)
212 return NOT_ENOUGH_ROOM;
213
214 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
215
217
218 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
219
221
222 code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
223
224 return UTF8_OK;
225 }
226
227 template <typename octet_iterator>
228 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
229 {
230 if (it == end)
231 return NOT_ENOUGH_ROOM;
232
233 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
234
236
237 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
238
240
241 code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff));
242
244
245 code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
246
247 return UTF8_OK;
248 }
249
250 #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
251
252 template <typename octet_iterator>
253 utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
254 {
255 if (it == end)
256 return NOT_ENOUGH_ROOM;
257
258 // Save the original value of it so we can go back in case of failure
259 // Of course, it does not make much sense with i.e. stream iterators
260 octet_iterator original_it = it;
261
262 utfchar32_t cp = 0;
263 // Determine the sequence length based on the lead octet
264 const int length = utf8::internal::sequence_length(it);
265
266 // Get trail octets and calculate the code point
267 utf_error err = UTF8_OK;
268 switch (length) {
269 case 0:
270 return INVALID_LEAD;
271 case 1:
272 err = utf8::internal::get_sequence_1(it, end, cp);
273 break;
274 case 2:
275 err = utf8::internal::get_sequence_2(it, end, cp);
276 break;
277 case 3:
278 err = utf8::internal::get_sequence_3(it, end, cp);
279 break;
280 case 4:
281 err = utf8::internal::get_sequence_4(it, end, cp);
282 break;
283 }
284
285 if (err == UTF8_OK) {
286 // Decoding succeeded. Now, security checks...
288 if (!utf8::internal::is_overlong_sequence(cp, length)){
289 // Passed! Return here.
290 code_point = cp;
291 ++it;
292 return UTF8_OK;
293 }
294 else
295 err = OVERLONG_SEQUENCE;
296 }
297 else
298 err = INVALID_CODE_POINT;
299 }
300
301 // Failure branch - restore the original value of the iterator
302 it = original_it;
303 return err;
304 }
305
306 template <typename octet_iterator>
307 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
308 utfchar32_t ignored;
309 return utf8::internal::validate_next(it, end, ignored);
310 }
311
312 template <typename word_iterator>
313 utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point)
314 {
315 // Make sure the iterator dereferences a large enough type
316 typedef typename std::iterator_traits<word_iterator>::value_type word_type;
317 UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
318 // Check the edge case:
319 if (it == end)
320 return NOT_ENOUGH_ROOM;
321 // Save the original value of it so we can go back in case of failure
322 // Of course, it does not make much sense with i.e. stream iterators
323 word_iterator original_it = it;
324
325 utf_error err = UTF8_OK;
326
327 const utfchar16_t first_word = *it++;
328 if (!is_surrogate(first_word)) {
329 code_point = first_word;
330 return UTF8_OK;
331 }
332 else {
333 if (it == end)
334 err = NOT_ENOUGH_ROOM;
335 else if (is_lead_surrogate(first_word)) {
336 const utfchar16_t second_word = *it++;
337 if (is_trail_surrogate(static_cast<utfchar32_t>(second_word))) {
338 code_point = static_cast<utfchar32_t>(first_word << 10) + static_cast<utfchar32_t>(second_word) + SURROGATE_OFFSET;
339 return UTF8_OK;
340 } else
342
343 } else {
344 err = INVALID_LEAD;
345 }
346 }
347 // error branch
348 it = original_it;
349 return err;
350 }
351
352 // Internal implementation of both checked and unchecked append() function
353 // This function will be invoked by the overloads below, as they will know
354 // the octet_type.
355 template <typename octet_iterator, typename octet_type>
356 octet_iterator append(utfchar32_t cp, octet_iterator result) {
357 if (cp < 0x80) // one octet
358 *(result++) = static_cast<octet_type>(cp);
359 else if (cp < 0x800) { // two octets
360 *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
361 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
362 }
363 else if (cp < 0x10000) { // three octets
364 *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
365 *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
366 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
367 }
368 else { // four octets
369 *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
370 *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80);
371 *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
372 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
373 }
374 return result;
375 }
376
377 // One of the following overloads will be invoked from the API calls
378
379 // A simple (but dangerous) case: the caller appends byte(s) to a char array
380 inline char* append(utfchar32_t cp, char* result) {
381 return append<char*, char>(cp, result);
382 }
383
384 // Hopefully, most common case: the caller uses back_inserter
385 // i.e. append(cp, std::back_inserter(str));
386 template<typename container_type>
387 std::back_insert_iterator<container_type> append
388 (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
390 typename container_type::value_type>(cp, result);
391 }
392
393 // The caller uses some other kind of output operator - not covered above
394 // Note that in this case we are not able to determine octet_type
395 // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
396 template <typename octet_iterator>
397 octet_iterator append(utfchar32_t cp, octet_iterator result) {
398 return append<octet_iterator, utfchar8_t>(cp, result);
399 }
400
401 // Internal implementation of both checked and unchecked append16() function
402 // This function will be invoked by the overloads below, as they will know
403 // the word_type.
404 template <typename word_iterator, typename word_type>
405 word_iterator append16(utfchar32_t cp, word_iterator result) {
406 UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
407 if (is_in_bmp(cp))
408 *(result++) = static_cast<word_type>(cp);
409 else {
410 // Code points from the supplementary planes are encoded via surrogate pairs
411 *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
412 *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
413 }
414 return result;
415 }
416
417 // Hopefully, most common case: the caller uses back_inserter
418 // i.e. append16(cp, std::back_inserter(str));
419 template<typename container_type>
420 std::back_insert_iterator<container_type> append16
421 (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
423 typename container_type::value_type>(cp, result);
424 }
425
426 // The caller uses some other kind of output operator - not covered above
427 // Note that in this case we are not able to determine word_type
428 // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong.
429 template <typename word_iterator>
430 word_iterator append16(utfchar32_t cp, word_iterator result) {
431 return append16<word_iterator, utfchar16_t>(cp, result);
432 }
433
434} // namespace internal
435
437
438 // Byte order mark
439 const utfchar8_t bom[] = {0xef, 0xbb, 0xbf};
440
441 template <typename octet_iterator>
442 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
443 {
444 octet_iterator result = start;
445 while (result != end) {
447 if (err_code != internal::UTF8_OK)
448 return result;
449 }
450 return result;
451 }
452
453 inline const char* find_invalid(const char* str)
454 {
455 const char* end = str + std::strlen(str);
456 return find_invalid(str, end);
457 }
458
459 inline std::size_t find_invalid(const std::string& s)
460 {
461 std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
462 return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
463 }
464
465 template <typename octet_iterator>
466 inline bool is_valid(octet_iterator start, octet_iterator end)
467 {
468 return (utf8::find_invalid(start, end) == end);
469 }
470
471 inline bool is_valid(const char* str)
472 {
473 return (*(utf8::find_invalid(str)) == '\0');
474 }
475
476 inline bool is_valid(const std::string& s)
477 {
478 return is_valid(s.begin(), s.end());
479 }
480
481
482
483 template <typename octet_iterator>
484 inline bool starts_with_bom (octet_iterator it, octet_iterator end)
485 {
486 return (
487 ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
488 ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
489 ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
490 );
491 }
492
493 inline bool starts_with_bom(const std::string& s)
494 {
495 return starts_with_bom(s.begin(), s.end());
496 }
497} // namespace utf8
498
499#endif // header guard
500
#define UTF_CPP_STATIC_ASSERT(condition)
Definition core.h:51
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END)
Definition core.h:179
Definition core.h:74
bool is_lead_surrogate(utfchar32_t cp)
Definition core.h:106
utf_error get_sequence_2(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:194
octet_iterator append(utfchar32_t cp, octet_iterator result)
Definition core.h:356
const utfchar32_t SURROGATE_OFFSET
Definition core.h:83
const utfchar16_t LEAD_SURROGATE_MIN
Definition core.h:78
bool is_surrogate(utfchar32_t cp)
Definition core.h:116
bool is_in_bmp(utfchar32_t cp)
Definition core.h:126
bool is_trail_surrogate(utfchar32_t cp)
Definition core.h:111
utf_error get_sequence_1(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
get_sequence_x functions decode utf-8 sequences of the length x
Definition core.h:183
utfchar8_t mask8(octet_type oc)
Definition core.h:89
utf_error get_sequence_4(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:228
const utfchar32_t CODE_POINT_MAX
Definition core.h:86
utfchar16_t mask16(u16_type oc)
Definition core.h:95
word_iterator append16(utfchar32_t cp, word_iterator result)
Definition core.h:405
utf_error
Definition core.h:164
@ INCOMPLETE_SEQUENCE
Definition core.h:164
@ INVALID_LEAD
Definition core.h:164
@ OVERLONG_SEQUENCE
Definition core.h:164
@ INVALID_CODE_POINT
Definition core.h:164
@ NOT_ENOUGH_ROOM
Definition core.h:164
@ UTF8_OK
Definition core.h:164
utf_error get_sequence_3(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:209
utf_error validate_next16(word_iterator &it, word_iterator end, utfchar32_t &code_point)
Definition core.h:313
bool is_code_point_valid(utfchar32_t cp)
Definition core.h:121
bool is_trail(octet_type oc)
Definition core.h:101
const utfchar16_t TRAIL_SURROGATE_MAX
Definition core.h:81
const utfchar16_t LEAD_SURROGATE_MAX
Definition core.h:79
utf_error validate_next(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:253
bool is_overlong_sequence(utfchar32_t cp, int length)
Definition core.h:147
int sequence_length(octet_iterator lead_it)
Definition core.h:132
const utfchar16_t TRAIL_SURROGATE_MIN
Definition core.h:80
const utfchar16_t LEAD_OFFSET
Definition core.h:82
utf_error increase_safely(octet_iterator &it, const octet_iterator end)
Helper for get_sequence_x.
Definition core.h:168
Definition checked.h:35
unsigned int utfchar32_t
Definition core.h:69
unsigned short utfchar16_t
Definition core.h:68
bool starts_with_bom(octet_iterator it, octet_iterator end)
Definition core.h:484
unsigned char utfchar8_t
Definition core.h:67
const utfchar8_t bom[]
The library API - functions intended to be called by the users.
Definition core.h:439
bool is_valid(octet_iterator start, octet_iterator end)
Definition core.h:466
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
Definition core.h:442