Botan 2.19.1
Crypto and TLS for C++11
charset.cpp
Go to the documentation of this file.
1/*
2* Character Set Handling
3* (C) 1999-2007 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/charset.h>
9#include <botan/exceptn.h>
10#include <botan/loadstor.h>
11#include <cctype>
12
13namespace Botan {
14
15namespace {
16
17void append_utf8_for(std::string& s, uint32_t c)
18 {
19 if(c >= 0xD800 && c < 0xE000)
20 throw Decoding_Error("Invalid Unicode character");
21
22 if(c <= 0x7F)
23 {
24 const uint8_t b0 = static_cast<uint8_t>(c);
25 s.push_back(static_cast<char>(b0));
26 }
27 else if(c <= 0x7FF)
28 {
29 const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
30 const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
31 s.push_back(static_cast<char>(b0));
32 s.push_back(static_cast<char>(b1));
33 }
34 else if(c <= 0xFFFF)
35 {
36 const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
37 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
38 const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
39 s.push_back(static_cast<char>(b0));
40 s.push_back(static_cast<char>(b1));
41 s.push_back(static_cast<char>(b2));
42 }
43 else if(c <= 0x10FFFF)
44 {
45 const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
46 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
47 const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
48 const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
49 s.push_back(static_cast<char>(b0));
50 s.push_back(static_cast<char>(b1));
51 s.push_back(static_cast<char>(b2));
52 s.push_back(static_cast<char>(b3));
53 }
54 else
55 throw Decoding_Error("Invalid Unicode character");
56
57 }
58
59}
60
61std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
62 {
63 if(len % 2 != 0)
64 throw Decoding_Error("Invalid length for UCS-2 string");
65
66 const size_t chars = len / 2;
67
68 std::string s;
69 for(size_t i = 0; i != chars; ++i)
70 {
71 const uint16_t c = load_be<uint16_t>(ucs2, i);
72 append_utf8_for(s, c);
73 }
74
75 return s;
76 }
77
78std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
79 {
80 if(len % 4 != 0)
81 throw Decoding_Error("Invalid length for UCS-4 string");
82
83 const size_t chars = len / 4;
84
85 std::string s;
86 for(size_t i = 0; i != chars; ++i)
87 {
88 const uint32_t c = load_be<uint32_t>(ucs4, i);
89 append_utf8_for(s, c);
90 }
91
92 return s;
93 }
94
95/*
96* Convert from UTF-8 to ISO 8859-1
97*/
98std::string utf8_to_latin1(const std::string& utf8)
99 {
100 std::string iso8859;
101
102 size_t position = 0;
103 while(position != utf8.size())
104 {
105 const uint8_t c1 = static_cast<uint8_t>(utf8[position++]);
106
107 if(c1 <= 0x7F)
108 {
109 iso8859 += static_cast<char>(c1);
110 }
111 else if(c1 >= 0xC0 && c1 <= 0xC7)
112 {
113 if(position == utf8.size())
114 throw Decoding_Error("UTF-8: sequence truncated");
115
116 const uint8_t c2 = static_cast<uint8_t>(utf8[position++]);
117 const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);
118
119 if(iso_char <= 0x7F)
120 throw Decoding_Error("UTF-8: sequence longer than needed");
121
122 iso8859 += static_cast<char>(iso_char);
123 }
124 else
125 throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");
126 }
127
128 return iso8859;
129 }
130
131namespace Charset {
132
133namespace {
134
135/*
136* Convert from UCS-2 to ISO 8859-1
137*/
138std::string ucs2_to_latin1(const std::string& ucs2)
139 {
140 if(ucs2.size() % 2 == 1)
141 throw Decoding_Error("UCS-2 string has an odd number of bytes");
142
143 std::string latin1;
144
145 for(size_t i = 0; i != ucs2.size(); i += 2)
146 {
147 const uint8_t c1 = ucs2[i];
148 const uint8_t c2 = ucs2[i+1];
149
150 if(c1 != 0)
151 throw Decoding_Error("UCS-2 has non-Latin1 characters");
152
153 latin1 += static_cast<char>(c2);
154 }
155
156 return latin1;
157 }
158
/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes up to 0x7F are copied unchanged. Every other byte becomes a
* two-byte sequence: a lead byte of 0xC0 combined with the top two
* bits, then 0x80 combined with the low six bits.
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;

   for(const char raw : iso8859)
      {
      const uint8_t byte = static_cast<uint8_t>(raw);

      if(byte > 0x7F)
         {
         utf8.push_back(static_cast<char>(0xC0 | (byte >> 6)));
         utf8.push_back(static_cast<char>(0x80 | (byte & 0x3F)));
         }
      else
         {
         utf8.push_back(static_cast<char>(byte));
         }
      }

   return utf8;
   }
179
180}
181
182/*
183* Perform character set transcoding
184*/
185std::string transcode(const std::string& str,
187 {
188 if(to == LOCAL_CHARSET)
189 to = LATIN1_CHARSET;
190 if(from == LOCAL_CHARSET)
191 from = LATIN1_CHARSET;
192
193 if(to == from)
194 return str;
195
196 if(from == LATIN1_CHARSET && to == UTF8_CHARSET)
197 return latin1_to_utf8(str);
198 if(from == UTF8_CHARSET && to == LATIN1_CHARSET)
199 return utf8_to_latin1(str);
200 if(from == UCS2_CHARSET && to == LATIN1_CHARSET)
201 return ucs2_to_latin1(str);
202
203 throw Invalid_Argument("Unknown transcoding operation from " +
204 std::to_string(from) + " to " + std::to_string(to));
205 }
206
/*
* Check if a character represents a digit
*
* Returns true only for the ten characters '0' through '9'.
*/
bool is_digit(char c)
   {
   // The digit characters are contiguous and ascending in every
   // C++ execution character set, so a range test is sufficient
   return (c >= '0' && c <= '9');
   }
217
/*
* Check if a character represents whitespace
*
* Returns true only for space, horizontal tab, line feed and
* carriage return.
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
227
228/*
229* Convert a character to a digit
230*/
231uint8_t char2digit(char c)
232 {
233 switch(c)
234 {
235 case '0': return 0;
236 case '1': return 1;
237 case '2': return 2;
238 case '3': return 3;
239 case '4': return 4;
240 case '5': return 5;
241 case '6': return 6;
242 case '7': return 7;
243 case '8': return 8;
244 case '9': return 9;
245 }
246
247 throw Invalid_Argument("char2digit: Input is not a digit character");
248 }
249
250/*
251* Convert a digit to a character
252*/
253char digit2char(uint8_t b)
254 {
255 switch(b)
256 {
257 case 0: return '0';
258 case 1: return '1';
259 case 2: return '2';
260 case 3: return '3';
261 case 4: return '4';
262 case 5: return '5';
263 case 6: return '6';
264 case 7: return '7';
265 case 8: return '8';
266 case 9: return '9';
267 }
268
269 throw Invalid_Argument("digit2char: Input is not a digit");
270 }
271
/*
* Case-insensitive character comparison
*
* Returns true when std::tolower gives the same result for both
* characters.
*/
bool caseless_cmp(char a, char b)
   {
   // std::tolower requires a value representable as unsigned char,
   // hence the casts before the call
   const int folded_a = std::tolower(static_cast<unsigned char>(a));
   const int folded_b = std::tolower(static_cast<unsigned char>(b));

   return folded_a == folded_b;
   }
280
281}
282
283}
std::string to_string(const BER_Object &obj)
Definition: asn1_obj.cpp:213
bool is_digit(char c)
Definition: charset.cpp:210
std::string transcode(const std::string &str, Character_Set to, Character_Set from)
Definition: charset.cpp:185
bool is_space(char c)
Definition: charset.cpp:221
bool caseless_cmp(char a, char b)
Definition: charset.cpp:275
uint8_t char2digit(char c)
Definition: charset.cpp:231
char digit2char(uint8_t b)
Definition: charset.cpp:253
Definition: alg_id.cpp:13
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
Definition: charset.cpp:78
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
Definition: charset.cpp:61
Character_Set
Definition: charset.h:43
@ UCS2_CHARSET
Definition: charset.h:45
@ LOCAL_CHARSET
Definition: charset.h:44
@ LATIN1_CHARSET
Definition: charset.h:47
@ UTF8_CHARSET
Definition: charset.h:46
std::string utf8_to_latin1(const std::string &utf8)
Definition: charset.cpp:98
uint32_t load_be< uint32_t >(const uint8_t in[], size_t off)
Definition: loadstor.h:179
uint16_t load_be< uint16_t >(const uint8_t in[], size_t off)
Definition: loadstor.h:139