Botan  2.7.0
Crypto and TLS for C++11
charset.cpp
Go to the documentation of this file.
1 /*
2 * Character Set Handling
3 * (C) 1999-2007 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/charset.h>
9 #include <botan/exceptn.h>
10 #include <botan/loadstor.h>
11 #include <cctype>
12 
13 namespace Botan {
14 
15 namespace {
16 
17 void append_utf8_for(std::string& s, uint32_t c)
18  {
19  if(c >= 0xD800 && c < 0xE000)
20  throw Decoding_Error("Invalid Unicode character");
21 
22  if(c <= 0x7F)
23  {
24  const uint8_t b0 = static_cast<uint8_t>(c);
25  s.push_back(static_cast<char>(b0));
26  }
27  else if(c <= 0x7FF)
28  {
29  const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
30  const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
31  s.push_back(static_cast<char>(b0));
32  s.push_back(static_cast<char>(b1));
33  }
34  else if(c <= 0xFFFF)
35  {
36  const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
37  const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
38  const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
39  s.push_back(static_cast<char>(b0));
40  s.push_back(static_cast<char>(b1));
41  s.push_back(static_cast<char>(b2));
42  }
43  else if(c <= 0x10FFFF)
44  {
45  const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
46  const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
47  const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
48  const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
49  s.push_back(static_cast<char>(b0));
50  s.push_back(static_cast<char>(b1));
51  s.push_back(static_cast<char>(b2));
52  s.push_back(static_cast<char>(b3));
53  }
54  else
55  throw Decoding_Error("Invalid Unicode character");
56 
57  }
58 
59 }
60 
61 std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
62  {
63  if(len % 2 != 0)
64  throw Decoding_Error("Invalid length for UCS-2 string");
65 
66  const size_t chars = len / 2;
67 
68  std::string s;
69  for(size_t i = 0; i != chars; ++i)
70  {
71  const uint16_t c = load_be<uint16_t>(ucs2, i);
72  append_utf8_for(s, c);
73  }
74 
75  return s;
76  }
77 
78 std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
79  {
80  if(len % 4 != 0)
81  throw Decoding_Error("Invalid length for UCS-4 string");
82 
83  const size_t chars = len / 4;
84 
85  std::string s;
86  for(size_t i = 0; i != chars; ++i)
87  {
88  const uint32_t c = load_be<uint32_t>(ucs4, i);
89  append_utf8_for(s, c);
90  }
91 
92  return s;
93  }
94 
95 /*
96 * Convert from UTF-8 to ISO 8859-1
97 */
98 std::string utf8_to_latin1(const std::string& utf8)
99  {
100  std::string iso8859;
101 
102  size_t position = 0;
103  while(position != utf8.size())
104  {
105  const uint8_t c1 = static_cast<uint8_t>(utf8[position++]);
106 
107  if(c1 <= 0x7F)
108  {
109  iso8859 += static_cast<char>(c1);
110  }
111  else if(c1 >= 0xC0 && c1 <= 0xC7)
112  {
113  if(position == utf8.size())
114  throw Decoding_Error("UTF-8: sequence truncated");
115 
116  const uint8_t c2 = static_cast<uint8_t>(utf8[position++]);
117  const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);
118 
119  if(iso_char <= 0x7F)
120  throw Decoding_Error("UTF-8: sequence longer than needed");
121 
122  iso8859 += static_cast<char>(iso_char);
123  }
124  else
125  throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");
126  }
127 
128  return iso8859;
129  }
130 
131 namespace Charset {
132 
133 namespace {
134 
135 /*
136 * Convert from UCS-2 to ISO 8859-1
137 */
138 std::string ucs2_to_latin1(const std::string& ucs2)
139  {
140  if(ucs2.size() % 2 == 1)
141  throw Decoding_Error("UCS-2 string has an odd number of bytes");
142 
143  std::string latin1;
144 
145  for(size_t i = 0; i != ucs2.size(); i += 2)
146  {
147  const uint8_t c1 = ucs2[i];
148  const uint8_t c2 = ucs2[i+1];
149 
150  if(c1 != 0)
151  throw Decoding_Error("UCS-2 has non-Latin1 characters");
152 
153  latin1 += static_cast<char>(c2);
154  }
155 
156  return latin1;
157  }
158 
/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes up to 0x7F are copied unchanged. Every byte above 0x7F is
* expanded into a two-byte sequence (110000xx 10xxxxxx).
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;

   for(const char ch : iso8859)
      {
      const uint8_t byte = static_cast<uint8_t>(ch);

      if(byte > 0x7F)
         {
         // Top two bits go into the lead byte, low six into the trailer
         utf8.push_back(static_cast<char>(0xC0 | (byte >> 6)));
         utf8.push_back(static_cast<char>(0x80 | (byte & 0x3F)));
         }
      else
         {
         utf8.push_back(static_cast<char>(byte));
         }
      }

   return utf8;
   }
179 
180 }
181 
182 /*
183 * Perform character set transcoding
184 */
185 std::string transcode(const std::string& str,
186  Character_Set to, Character_Set from)
187  {
188  if(to == LOCAL_CHARSET)
189  to = LATIN1_CHARSET;
190  if(from == LOCAL_CHARSET)
191  from = LATIN1_CHARSET;
192 
193  if(to == from)
194  return str;
195 
196  if(from == LATIN1_CHARSET && to == UTF8_CHARSET)
197  return latin1_to_utf8(str);
198  if(from == UTF8_CHARSET && to == LATIN1_CHARSET)
199  return utf8_to_latin1(str);
200  if(from == UCS2_CHARSET && to == LATIN1_CHARSET)
201  return ucs2_to_latin1(str);
202 
203  throw Invalid_Argument("Unknown transcoding operation from " +
204  std::to_string(from) + " to " + std::to_string(to));
205  }
206 
/*
* Check if a character represents a digit
*
* Returns true only for the ten characters '0' through '9'.
*/
bool is_digit(char c)
   {
   // The decimal digits are guaranteed to be contiguous and ascending
   // in the execution character set, so a range test is sufficient.
   return (c >= '0' && c <= '9');
   }
217 
/*
* Check if a character represents whitespace
*
* Returns true only for space, horizontal tab, line feed and
* carriage return.
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
227 
228 /*
229 * Convert a character to a digit
230 */
231 uint8_t char2digit(char c)
232  {
233  switch(c)
234  {
235  case '0': return 0;
236  case '1': return 1;
237  case '2': return 2;
238  case '3': return 3;
239  case '4': return 4;
240  case '5': return 5;
241  case '6': return 6;
242  case '7': return 7;
243  case '8': return 8;
244  case '9': return 9;
245  }
246 
247  throw Invalid_Argument("char2digit: Input is not a digit character");
248  }
249 
250 /*
251 * Convert a digit to a character
252 */
253 char digit2char(uint8_t b)
254  {
255  switch(b)
256  {
257  case 0: return '0';
258  case 1: return '1';
259  case 2: return '2';
260  case 3: return '3';
261  case 4: return '4';
262  case 5: return '5';
263  case 6: return '6';
264  case 7: return '7';
265  case 8: return '8';
266  case 9: return '9';
267  }
268 
269  throw Invalid_Argument("digit2char: Input is not a digit");
270  }
271 
/*
* Case-insensitive character comparison
*
* Returns true if both characters are equal after each has been
* passed through std::tolower.
*/
bool caseless_cmp(char a, char b)
   {
   // std::tolower requires a value representable as unsigned char
   const int lowered_a = std::tolower(static_cast<unsigned char>(a));
   const int lowered_b = std::tolower(static_cast<unsigned char>(b));

   return lowered_a == lowered_b;
   }
280 
281 }
282 
283 }
std::string transcode(const std::string &str, Character_Set to, Character_Set from)
Definition: charset.cpp:185
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
Definition: charset.cpp:78
uint16_t load_be< uint16_t >(const uint8_t in[], size_t off)
Definition: loadstor.h:137
uint32_t load_be< uint32_t >(const uint8_t in[], size_t off)
Definition: loadstor.h:177
uint8_t char2digit(char c)
Definition: charset.cpp:231
std::string utf8_to_latin1(const std::string &utf8)
Definition: charset.cpp:98
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
Definition: charset.cpp:61
std::string to_string(const BER_Object &obj)
Definition: asn1_obj.cpp:210
Character_Set
Definition: charset.h:41
bool caseless_cmp(char a, char b)
Definition: charset.cpp:275
bool is_space(char c)
Definition: charset.cpp:221
Definition: alg_id.cpp:13
char digit2char(uint8_t b)
Definition: charset.cpp:253
bool is_digit(char c)
Definition: charset.cpp:210