Botan 3.12.0
Crypto and TLS for C++
charset.cpp
Go to the documentation of this file.
1/*
2* Character Set Handling
3* (C) 1999-2007,2021 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/charset.h>
9
10#include <botan/exceptn.h>
11#include <botan/internal/loadstor.h>
12#include <sstream>
13
14namespace Botan {
15
16namespace {
17
18void append_utf8_for(std::string& s, uint32_t c) {
19 if(c >= 0xD800 && c < 0xE000) {
20 throw Decoding_Error("Invalid Unicode character");
21 }
22
23 if(c <= 0x7F) {
24 const uint8_t b0 = static_cast<uint8_t>(c);
25 s.push_back(static_cast<char>(b0));
26 } else if(c <= 0x7FF) {
27 const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
28 const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
29 s.push_back(static_cast<char>(b0));
30 s.push_back(static_cast<char>(b1));
31 } else if(c <= 0xFFFF) {
32 const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
33 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
34 const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
35 s.push_back(static_cast<char>(b0));
36 s.push_back(static_cast<char>(b1));
37 s.push_back(static_cast<char>(b2));
38 } else if(c <= 0x10FFFF) {
39 const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
40 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
41 const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
42 const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
43 s.push_back(static_cast<char>(b0));
44 s.push_back(static_cast<char>(b1));
45 s.push_back(static_cast<char>(b2));
46 s.push_back(static_cast<char>(b3));
47 } else {
48 throw Decoding_Error("Invalid Unicode character");
49 }
50}
51
52uint32_t next_utf8_codepoint(const std::string& utf8, size_t& pos) {
53 auto read_continuation = [&]() -> uint32_t {
54 if(pos >= utf8.size()) {
55 throw Decoding_Error("Invalid UTF-8 sequence");
56 }
57 const uint8_t b = static_cast<uint8_t>(utf8[pos++]);
58 if((b & 0xC0) != 0x80) {
59 throw Decoding_Error("Invalid UTF-8 sequence");
60 }
61 return b & 0x3F;
62 };
63
64 const uint8_t lead = static_cast<uint8_t>(utf8[pos++]);
65 uint32_t c = 0;
66
67 if(lead <= 0x7F) {
68 c = lead;
69 } else if((lead & 0xE0) == 0xC0) {
70 c = (lead & 0x1F) << 6;
71 c |= read_continuation();
72 if(c < 0x80) {
73 throw Decoding_Error("Overlong UTF-8 sequence");
74 }
75 } else if((lead & 0xF0) == 0xE0) {
76 c = (lead & 0x0F) << 12;
77 c |= read_continuation() << 6;
78 c |= read_continuation();
79 if(c < 0x800) {
80 throw Decoding_Error("Overlong UTF-8 sequence");
81 }
82 } else if((lead & 0xF8) == 0xF0) {
83 c = (lead & 0x07) << 18;
84 c |= read_continuation() << 12;
85 c |= read_continuation() << 6;
86 c |= read_continuation();
87 if(c < 0x10000) {
88 throw Decoding_Error("Overlong UTF-8 sequence");
89 }
90 } else {
91 throw Decoding_Error("Invalid UTF-8 sequence");
92 }
93
94 if(c > 0x10FFFF) {
95 throw Decoding_Error("UTF-8 sequence encodes value outside Unicode range");
96 }
97 if(c >= 0xD800 && c < 0xE000) {
98 throw Decoding_Error("UTF-8 sequence encodes surrogate code point");
99 }
100
101 return c;
102}
103
104} // namespace
105
106bool is_valid_utf8(const std::string& utf8) {
107 try {
108 size_t pos = 0;
109 while(pos < utf8.size()) {
110 const uint32_t c = next_utf8_codepoint(utf8, pos);
111 BOTAN_UNUSED(c);
112 }
113 } catch(Decoding_Error&) {
114 return false;
115 }
116 return true;
117}
118
119std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) {
120 if(len % 2 != 0) {
121 throw Decoding_Error("Invalid length for UCS-2 string");
122 }
123
124 const size_t chars = len / 2;
125
126 std::string s;
127 for(size_t i = 0; i != chars; ++i) {
128 const uint32_t c = load_be<uint16_t>(ucs2, i);
129 append_utf8_for(s, c);
130 }
131
132 return s;
133}
134
135std::vector<uint8_t> utf8_to_ucs2(const std::string& utf8) {
136 std::vector<uint8_t> out;
137 out.reserve(utf8.size() * 2);
138
139 size_t pos = 0;
140 while(pos < utf8.size()) {
141 const uint32_t c = next_utf8_codepoint(utf8, pos);
142 if(c > 0xFFFF) {
143 throw Decoding_Error("Cannot encode character in UCS-2");
144 }
145 const uint16_t val = static_cast<uint16_t>(c);
146 out.push_back(get_byte<0>(val));
147 out.push_back(get_byte<1>(val));
148 }
149
150 return out;
151}
152
153std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) {
154 if(len % 4 != 0) {
155 throw Decoding_Error("Invalid length for UCS-4 string");
156 }
157
158 const size_t chars = len / 4;
159
160 std::string s;
161 for(size_t i = 0; i != chars; ++i) {
162 const uint32_t c = load_be<uint32_t>(ucs4, i);
163 append_utf8_for(s, c);
164 }
165
166 return s;
167}
168
169std::vector<uint8_t> utf8_to_ucs4(const std::string& utf8) {
170 std::vector<uint8_t> out;
171 out.reserve(utf8.size() * 4);
172
173 size_t pos = 0;
174 while(pos < utf8.size()) {
175 const uint32_t val = next_utf8_codepoint(utf8, pos);
176 out.push_back(get_byte<0>(val));
177 out.push_back(get_byte<1>(val));
178 out.push_back(get_byte<2>(val));
179 out.push_back(get_byte<3>(val));
180 }
181
182 return out;
183}
184
185/*
186* Convert from ISO 8859-1 to UTF-8
187*/
188std::string latin1_to_utf8(const uint8_t chars[], size_t len) {
189 std::string s;
190 for(size_t i = 0; i != len; ++i) {
191 const uint32_t c = static_cast<uint8_t>(chars[i]);
192 append_utf8_for(s, c);
193 }
194 return s;
195}
196
/*
* Return a single-quoted, printable rendering of c for use in messages
*
* Tab, newline and carriage return are shown as \t, \n and \r.
* Any other byte that is not printable ASCII (below 0x20, or 0x7F
* and above) is shown as \x followed by two uppercase hex digits,
* so that control characters such as NUL or ESC never reach the
* output raw. All remaining characters are emitted unchanged.
*/
std::string format_char_for_display(char c) {
   const unsigned char byte = static_cast<unsigned char>(c);

   std::string display = "'";

   if(c == '\t') {
      display += "\\t";
   } else if(c == '\n') {
      display += "\\n";
   } else if(c == '\r') {
      display += "\\r";
   } else if(byte < 0x20 || byte >= 0x7F) {
      // Two fixed hex digits; for bytes >= 0x80 this matches the
      // previous stream-based "\x" << hex << uppercase output exactly
      static const char hex_digits[] = "0123456789ABCDEF";
      display += "\\x";
      display.push_back(hex_digits[byte >> 4]);
      display.push_back(hex_digits[byte & 0x0F]);
   } else {
      display.push_back(c);
   }

   display += "'";

   return display;
}
219
220} // namespace Botan
#define BOTAN_UNUSED
Definition assert.h:144
constexpr uint8_t get_byte(T input)
Definition loadstor.h:79
std::string format_char_for_display(char c)
Definition charset.cpp:197
std::vector< uint8_t > utf8_to_ucs4(const std::string &utf8)
Definition charset.cpp:169
bool is_valid_utf8(const std::string &utf8)
Definition charset.cpp:106
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
Definition charset.cpp:119
std::string latin1_to_utf8(const uint8_t chars[], size_t len)
Definition charset.cpp:188
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
Definition charset.cpp:153
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:504
std::vector< uint8_t > utf8_to_ucs2(const std::string &utf8)
Definition charset.cpp:135