Botan 3.4.0
Crypto and TLS for C++
charset.cpp
Go to the documentation of this file.
1/*
2* Character Set Handling
3* (C) 1999-2007,2021 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/charset.h>
9
10#include <botan/exceptn.h>
11#include <botan/internal/loadstor.h>
12#include <sstream>
13
14namespace Botan {
15
16namespace {
17
18void append_utf8_for(std::string& s, uint32_t c) {
19 if(c >= 0xD800 && c < 0xE000) {
20 throw Decoding_Error("Invalid Unicode character");
21 }
22
23 if(c <= 0x7F) {
24 const uint8_t b0 = static_cast<uint8_t>(c);
25 s.push_back(static_cast<char>(b0));
26 } else if(c <= 0x7FF) {
27 const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
28 const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
29 s.push_back(static_cast<char>(b0));
30 s.push_back(static_cast<char>(b1));
31 } else if(c <= 0xFFFF) {
32 const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
33 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
34 const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
35 s.push_back(static_cast<char>(b0));
36 s.push_back(static_cast<char>(b1));
37 s.push_back(static_cast<char>(b2));
38 } else if(c <= 0x10FFFF) {
39 const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
40 const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
41 const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
42 const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
43 s.push_back(static_cast<char>(b0));
44 s.push_back(static_cast<char>(b1));
45 s.push_back(static_cast<char>(b2));
46 s.push_back(static_cast<char>(b3));
47 } else {
48 throw Decoding_Error("Invalid Unicode character");
49 }
50}
51
52} // namespace
53
54std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) {
55 if(len % 2 != 0) {
56 throw Decoding_Error("Invalid length for UCS-2 string");
57 }
58
59 const size_t chars = len / 2;
60
61 std::string s;
62 for(size_t i = 0; i != chars; ++i) {
63 const uint32_t c = load_be<uint16_t>(ucs2, i);
64 append_utf8_for(s, c);
65 }
66
67 return s;
68}
69
70std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) {
71 if(len % 4 != 0) {
72 throw Decoding_Error("Invalid length for UCS-4 string");
73 }
74
75 const size_t chars = len / 4;
76
77 std::string s;
78 for(size_t i = 0; i != chars; ++i) {
79 const uint32_t c = load_be<uint32_t>(ucs4, i);
80 append_utf8_for(s, c);
81 }
82
83 return s;
84}
85
86/*
87* Convert from ISO 8859-1 to UTF-8
88*/
89std::string latin1_to_utf8(const uint8_t chars[], size_t len) {
90 std::string s;
91 for(size_t i = 0; i != len; ++i) {
92 const uint32_t c = static_cast<uint8_t>(chars[i]);
93 append_utf8_for(s, c);
94 }
95 return s;
96}
97
/*
* Format a single character as a single-quoted token that is safe to
* embed in a diagnostic message.
*
* - tab, newline and carriage return become '\t', '\n' and '\r'
* - any other byte that is not printable ASCII (below 0x20, or 0x7F and
*   above) becomes '\xNN' with exactly two uppercase hex digits
* - everything else is copied through unchanged
*
* Escaping the remaining control characters (NUL, ESC, DEL, ...) keeps
* raw control bytes out of the returned string, so the result never
* contains an embedded NUL or a terminal escape sequence.
*/
std::string format_char_for_display(char c) {
   // Work on the unsigned value so the range checks do not depend on
   // whether plain char is signed on this platform
   const unsigned char byte = static_cast<unsigned char>(c);

   std::string out = "'";

   if(c == '\t') {
      out += "\\t";
   } else if(c == '\n') {
      out += "\\n";
   } else if(c == '\r') {
      out += "\\r";
   } else if(byte < 0x20 || byte >= 0x7F) {
      static const char hex_digits[] = "0123456789ABCDEF";
      out += "\\x";
      out += hex_digits[byte >> 4];
      out += hex_digits[byte & 0x0F];
   } else {
      out += c;
   }

   out += "'";

   return out;
}
120
121} // namespace Botan
std::string format_char_for_display(char c)
Definition charset.cpp:98
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
Definition charset.cpp:54
std::string latin1_to_utf8(const uint8_t chars[], size_t len)
Definition charset.cpp:89
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
Definition charset.cpp:70