8#include <botan/internal/charset.h>
10#include <botan/exceptn.h>
11#include <botan/internal/loadstor.h>
18void append_utf8_for(std::string& s, uint32_t c) {
19 if(c >= 0xD800 && c < 0xE000) {
24 const uint8_t b0 =
static_cast<uint8_t
>(c);
25 s.push_back(
static_cast<char>(b0));
26 }
else if(c <= 0x7FF) {
27 const uint8_t b0 = 0xC0 |
static_cast<uint8_t
>(c >> 6);
28 const uint8_t b1 = 0x80 |
static_cast<uint8_t
>(c & 0x3F);
29 s.push_back(
static_cast<char>(b0));
30 s.push_back(
static_cast<char>(b1));
31 }
else if(c <= 0xFFFF) {
32 const uint8_t b0 = 0xE0 |
static_cast<uint8_t
>(c >> 12);
33 const uint8_t b1 = 0x80 |
static_cast<uint8_t
>((c >> 6) & 0x3F);
34 const uint8_t b2 = 0x80 |
static_cast<uint8_t
>(c & 0x3F);
35 s.push_back(
static_cast<char>(b0));
36 s.push_back(
static_cast<char>(b1));
37 s.push_back(
static_cast<char>(b2));
38 }
else if(c <= 0x10FFFF) {
39 const uint8_t b0 = 0xF0 |
static_cast<uint8_t
>(c >> 18);
40 const uint8_t b1 = 0x80 |
static_cast<uint8_t
>((c >> 12) & 0x3F);
41 const uint8_t b2 = 0x80 |
static_cast<uint8_t
>((c >> 6) & 0x3F);
42 const uint8_t b3 = 0x80 |
static_cast<uint8_t
>(c & 0x3F);
43 s.push_back(
static_cast<char>(b0));
44 s.push_back(
static_cast<char>(b1));
45 s.push_back(
static_cast<char>(b2));
46 s.push_back(
static_cast<char>(b3));
// Decodes one code point from utf8, reading at pos and advancing pos past
// every byte consumed (each read below is utf8[pos++]).
// NOTE(review): this listing is discontinuous -- statements between several of
// the lines shown are not visible (error actions, the declaration of c, the
// single-byte branch, the return), so only the visible steps are described.
52uint32_t next_utf8_codepoint(
const std::string& utf8,
size_t& pos) {
// Helper that consumes the next input byte as a continuation byte; its result
// is OR-ed into c by the branches below.
53 auto read_continuation = [&]() -> uint32_t {
// Guard against reading past the end; the action taken is not visible here.
54 if(pos >= utf8.size()) {
57 const uint8_t b =
static_cast<uint8_t
>(utf8[pos++]);
// A continuation byte must match 10xxxxxx; the handling of a mismatch is not
// visible here.
58 if((b & 0xC0) != 0x80) {
// The lead byte selects the sequence length via its high bits.
64 const uint8_t lead =
static_cast<uint8_t
>(utf8[pos++]);
69 }
// 110xxxxx: five payload bits from the lead byte, one continuation byte.
else if((lead & 0xE0) == 0xC0) {
70 c = (lead & 0x1F) << 6;
71 c |= read_continuation();
75 }
// 1110xxxx: four payload bits from the lead byte, two continuation bytes.
else if((lead & 0xF0) == 0xE0) {
76 c = (lead & 0x0F) << 12;
77 c |= read_continuation() << 6;
78 c |= read_continuation();
82 }
// 11110xxx: three payload bits from the lead byte, three continuation bytes.
else if((lead & 0xF8) == 0xF0) {
83 c = (lead & 0x07) << 18;
84 c |= read_continuation() << 12;
85 c |= read_continuation() << 6;
86 c |= read_continuation();
// NOTE(review): the condition guarding this throw is not visible here.
95 throw Decoding_Error(
"UTF-8 sequence encodes value outside Unicode range");
// Surrogate code points (0xD800..0xDFFF) are rejected.
97 if(c >= 0xD800 && c < 0xE000) {
98 throw Decoding_Error(
"UTF-8 sequence encodes surrogate code point");
// NOTE(review): fragment -- the enclosing signature is not visible here;
// presumably the body of ucs2_to_utf8, confirm against the full file.
// Two input bytes make up one character.
111 const size_t chars = len / 2;
114 for(
size_t i = 0; i != chars; ++i) {
// Appends the UTF-8 form of c to s; how c is obtained is not visible here.
116 append_utf8_for(s, c);
// NOTE(review): fragment -- the enclosing signature is not visible here;
// presumably the body of utf8_to_ucs2, confirm against the full file.
123 std::vector<uint8_t> out;
// Reserve two output bytes per input byte up front.
124 out.reserve(utf8.size() * 2);
// Decode code points until the input is consumed; next_utf8_codepoint
// advances pos.
127 while(pos < utf8.size()) {
128 const uint32_t c = next_utf8_codepoint(utf8, pos);
// Narrow the code point to 16 bits.
// NOTE(review): any range check on c before this cast is not visible here.
132 const uint16_t val =
static_cast<uint16_t
>(c);
// NOTE(review): fragment -- the enclosing signature is not visible here;
// presumably the body of ucs4_to_utf8, confirm against the full file.
// Four input bytes make up one character.
145 const size_t chars = len / 4;
148 for(
size_t i = 0; i != chars; ++i) {
// Appends the UTF-8 form of c to s; how c is obtained is not visible here.
150 append_utf8_for(s, c);
// NOTE(review): fragment -- the enclosing signature is not visible here;
// presumably the body of utf8_to_ucs4, confirm against the full file.
157 std::vector<uint8_t> out;
// Reserve four output bytes per input byte up front.
158 out.reserve(utf8.size() * 4);
// Decode code points until the input is consumed; next_utf8_codepoint
// advances pos. How val is written to out is not visible here.
161 while(pos < utf8.size()) {
162 const uint32_t val = next_utf8_codepoint(utf8, pos);
// NOTE(review): fragment -- the enclosing signature is not visible here;
// presumably the body of latin1_to_utf8, confirm against the full file.
// Each input byte is widened to a code point of the same numeric value and
// appended to s in UTF-8 form.
177 for(
size_t i = 0; i != len; ++i) {
178 const uint32_t c =
static_cast<uint8_t
>(chars[i]);
179 append_utf8_for(s, c);
// NOTE(review): fragment -- the enclosing signature is not visible here;
// presumably the body of format_char_for_display, confirm against the full
// file. The first branch and the bodies of the newline and carriage-return
// branches are not visible in this listing.
185 std::ostringstream oss;
191 }
else if(c ==
'\n') {
193 }
else if(c ==
'\r') {
195 }
// Bytes with the high bit set are written as a backslash-x escape followed by
// the byte value in uppercase hexadecimal.
else if(
static_cast<unsigned char>(c) >= 128) {
// Go through unsigned char so the value streamed as int is not negative.
196 const unsigned char z =
static_cast<unsigned char>(c);
197 oss <<
"\\x" << std::hex << std::uppercase << static_cast<int>(z);
constexpr uint8_t get_byte(T input)
std::string format_char_for_display(char c)
std::vector< uint8_t > utf8_to_ucs4(const std::string &utf8)
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
std::string latin1_to_utf8(const uint8_t chars[], size_t len)
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
constexpr auto load_be(ParamTs &&... params)
std::vector< uint8_t > utf8_to_ucs2(const std::string &utf8)