Botan 3.11.1
Crypto and TLS for C&&
charset.cpp
Go to the documentation of this file.
1/*
2* Character Set Handling
3* (C) 1999-2007,2021 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/charset.h>
9
10#include <botan/exceptn.h>
11#include <botan/internal/loadstor.h>
12#include <sstream>
13
14namespace Botan {
15
16namespace {
17
18void append_utf8_for(std::string& s, uint32_t c) {
19 if(c >= 0xD800 && c < 0xE000) {
20 throw Decoding_Error("Invalid Unicode character");
21 }
22
23 if(c <= 0x7F) {
24 const uint8_t b0 = static_cast<uint8_t>(c);
25 s.push_back(static_cast<char>(b0));
26 } else if(c <= 0x7FF) {
27 const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
28 const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
29 s.push_back(static_cast<char>(b0));
30 s.push_back(static_cast<char>(b1));
31 } else if(c <= 0xFFFF) {
32 const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
33 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
34 const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
35 s.push_back(static_cast<char>(b0));
36 s.push_back(static_cast<char>(b1));
37 s.push_back(static_cast<char>(b2));
38 } else if(c <= 0x10FFFF) {
39 const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
40 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
41 const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
42 const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
43 s.push_back(static_cast<char>(b0));
44 s.push_back(static_cast<char>(b1));
45 s.push_back(static_cast<char>(b2));
46 s.push_back(static_cast<char>(b3));
47 } else {
48 throw Decoding_Error("Invalid Unicode character");
49 }
50}
51
52uint32_t next_utf8_codepoint(const std::string& utf8, size_t& pos) {
53 auto read_continuation = [&]() -> uint32_t {
54 if(pos >= utf8.size()) {
55 throw Decoding_Error("Invalid UTF-8 sequence");
56 }
57 const uint8_t b = static_cast<uint8_t>(utf8[pos++]);
58 if((b & 0xC0) != 0x80) {
59 throw Decoding_Error("Invalid UTF-8 sequence");
60 }
61 return b & 0x3F;
62 };
63
64 const uint8_t lead = static_cast<uint8_t>(utf8[pos++]);
65 uint32_t c = 0;
66
67 if(lead <= 0x7F) {
68 c = lead;
69 } else if((lead & 0xE0) == 0xC0) {
70 c = (lead & 0x1F) << 6;
71 c |= read_continuation();
72 if(c < 0x80) {
73 throw Decoding_Error("Overlong UTF-8 sequence");
74 }
75 } else if((lead & 0xF0) == 0xE0) {
76 c = (lead & 0x0F) << 12;
77 c |= read_continuation() << 6;
78 c |= read_continuation();
79 if(c < 0x800) {
80 throw Decoding_Error("Overlong UTF-8 sequence");
81 }
82 } else if((lead & 0xF8) == 0xF0) {
83 c = (lead & 0x07) << 18;
84 c |= read_continuation() << 12;
85 c |= read_continuation() << 6;
86 c |= read_continuation();
87 if(c < 0x10000) {
88 throw Decoding_Error("Overlong UTF-8 sequence");
89 }
90 } else {
91 throw Decoding_Error("Invalid UTF-8 sequence");
92 }
93
94 if(c > 0x10FFFF) {
95 throw Decoding_Error("UTF-8 sequence encodes value outside Unicode range");
96 }
97 if(c >= 0xD800 && c < 0xE000) {
98 throw Decoding_Error("UTF-8 sequence encodes surrogate code point");
99 }
100
101 return c;
102}
103
104} // namespace
105
106std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) {
107 if(len % 2 != 0) {
108 throw Decoding_Error("Invalid length for UCS-2 string");
109 }
110
111 const size_t chars = len / 2;
112
113 std::string s;
114 for(size_t i = 0; i != chars; ++i) {
115 const uint32_t c = load_be<uint16_t>(ucs2, i);
116 append_utf8_for(s, c);
117 }
118
119 return s;
120}
121
122std::vector<uint8_t> utf8_to_ucs2(const std::string& utf8) {
123 std::vector<uint8_t> out;
124 out.reserve(utf8.size() * 2);
125
126 size_t pos = 0;
127 while(pos < utf8.size()) {
128 const uint32_t c = next_utf8_codepoint(utf8, pos);
129 if(c > 0xFFFF) {
130 throw Decoding_Error("Cannot encode character in UCS-2");
131 }
132 const uint16_t val = static_cast<uint16_t>(c);
133 out.push_back(get_byte<0>(val));
134 out.push_back(get_byte<1>(val));
135 }
136
137 return out;
138}
139
140std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) {
141 if(len % 4 != 0) {
142 throw Decoding_Error("Invalid length for UCS-4 string");
143 }
144
145 const size_t chars = len / 4;
146
147 std::string s;
148 for(size_t i = 0; i != chars; ++i) {
149 const uint32_t c = load_be<uint32_t>(ucs4, i);
150 append_utf8_for(s, c);
151 }
152
153 return s;
154}
155
156std::vector<uint8_t> utf8_to_ucs4(const std::string& utf8) {
157 std::vector<uint8_t> out;
158 out.reserve(utf8.size() * 4);
159
160 size_t pos = 0;
161 while(pos < utf8.size()) {
162 const uint32_t val = next_utf8_codepoint(utf8, pos);
163 out.push_back(get_byte<0>(val));
164 out.push_back(get_byte<1>(val));
165 out.push_back(get_byte<2>(val));
166 out.push_back(get_byte<3>(val));
167 }
168
169 return out;
170}
171
172/*
173* Convert from ISO 8859-1 to UTF-8
174*/
175std::string latin1_to_utf8(const uint8_t chars[], size_t len) {
176 std::string s;
177 for(size_t i = 0; i != len; ++i) {
178 const uint32_t c = static_cast<uint8_t>(chars[i]);
179 append_utf8_for(s, c);
180 }
181 return s;
182}
183
/*
* Return a single-quoted, human readable rendering of c.
*
* Tab, newline and carriage return are shown as \t, \n and \r. Any other
* byte outside the printable ASCII range 0x20..0x7E (the remaining control
* characters, DEL, and all bytes >= 0x80) is shown as \xHH using two
* uppercase hex digits, so that raw control bytes never end up inside a
* message intended for display. Printable characters are copied as-is.
*/
std::string format_char_for_display(char c) {
   const unsigned char z = static_cast<unsigned char>(c);

   std::string out = "'";

   if(c == '\t') {
      out += "\\t";
   } else if(c == '\n') {
      out += "\\n";
   } else if(c == '\r') {
      out += "\\r";
   } else if(z < 0x20 || z >= 0x7F) {
      static const char hex_digits[] = "0123456789ABCDEF";
      out += "\\x";
      out.push_back(hex_digits[z >> 4]);
      out.push_back(hex_digits[z & 0x0F]);
   } else {
      out.push_back(c);
   }

   out.push_back('\'');

   return out;
}
206
207} // namespace Botan
constexpr uint8_t get_byte(T input)
Definition loadstor.h:79
std::string format_char_for_display(char c)
Definition charset.cpp:184
std::vector< uint8_t > utf8_to_ucs4(const std::string &utf8)
Definition charset.cpp:156
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
Definition charset.cpp:106
std::string latin1_to_utf8(const uint8_t chars[], size_t len)
Definition charset.cpp:175
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
Definition charset.cpp:140
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:504
std::vector< uint8_t > utf8_to_ucs2(const std::string &utf8)
Definition charset.cpp:122