31 #include <drizzled/utf8/core.h>
// Returns the fixed, human-readable description of this error.
// Declared virtual and non-throwing (throw()); the enclosing class is
// declared outside this excerpt.
47 virtual const char* what()
const throw() {
return "Invalid code point"; }
// Accessor: returns the value held in `cp`.
// NOTE(review): `cp` is not declared in this excerpt - presumably the
// data member storing the offending code point; confirm in the class body.
48 uint32_t code_point()
const {
return cp;}
// Returns the fixed, human-readable description of this error.
// Declared virtual and non-throwing (throw()); the enclosing class is
// declared outside this excerpt.
55 virtual const char* what()
const throw() {
return "Invalid UTF-8"; }
// Accessor: returns the value held in `u8`.
// NOTE(review): `u8` is not declared in this excerpt - presumably the
// data member storing the offending octet; confirm in the class body.
56 uint8_t utf8_octet()
const {
return u8;}
// Returns the fixed, human-readable description of this error.
// Declared virtual and non-throwing (throw()); the enclosing class is
// declared outside this excerpt.
63 virtual const char* what()
const throw() {
return "Invalid UTF-16"; }
// Accessor: returns the value held in `u16`.
// NOTE(review): `u16` is not declared in this excerpt - presumably the
// data member storing the offending 16-bit unit; confirm in the class body.
64 uint16_t utf16_word()
const {
return u16;}
// Returns the fixed, human-readable description of this error.
// Declared virtual and non-throwing (throw()); the enclosing class is
// declared outside this excerpt.
69 virtual const char* what()
const throw() {
return "Not enough space"; }
// Copies the octets in [start, end) to `out`, substituting `replacement`
// (encoded through append()) wherever internal::validate_next() reports a
// problem.
//
//   start, end   - input range of octets to scan.
//   out          - output iterator receiving the cleaned-up octets.
//   replacement  - code point written in place of each rejected sequence.
//
// Each pass remembers where the current sequence began (sequence_start) and
// lets validate_next() advance `start`; the reported error code then selects
// one of the cases below.
// NOTE(review): this listing omits several source lines (the switch header,
// the bodies of the UTF8_OK and NOT_ENOUGH_ROOM cases, the break statements
// and the final return); the comments describe only the lines shown.
74 template <
typename octet_iterator,
typename output_iterator>
75 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
77 while (start != end) {
78 octet_iterator sequence_start = start;
79 internal::utf_error err_code = internal::validate_next(start, end);
81 case internal::UTF8_OK :
// Walk the octets of the sequence that was just accepted.
82 for (octet_iterator it = sequence_start; it != start; ++it)
85 case internal::NOT_ENOUGH_ROOM:
87 case internal::INVALID_LEAD:
// append() receives the iterator by value and hands back an iterator, so
// its result has to be stored; otherwise `out` never moves past the
// replacement octets when it is a plain pointer or container iterator.
// This matches how utf16to8() and utf32to8() use append().
88 out = append (replacement, out);
91 case internal::INCOMPLETE_SEQUENCE:
92 case internal::OVERLONG_SEQUENCE:
93 case internal::INVALID_CODE_POINT:
94 out = append (replacement, out);
// Step over the remaining trail octets of the rejected sequence. The end
// check comes first so that `start` is never dereferenced at `end`.
97 while (start != end && internal::is_trail(*start))
// Convenience overload of replace_invalid() that supplies the replacement
// code point itself: internal::mask16(0xfffd), i.e. U+FFFD, the Unicode
// REPLACEMENT CHARACTER. Forwards to the four-argument overload and returns
// its result.
105 template <
typename octet_iterator,
typename output_iterator>
106 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
108 static const uint32_t replacement_marker = internal::mask16(0xfffd);
109 return replace_invalid(start, end, out, replacement_marker);
// Encodes the code point `cp` as UTF-8 octets written through `result`.
//
//   cp      - code point to encode.
//   result  - output iterator; taken by value and post-incremented per octet.
//
// Throws invalid_code_point(cp) when internal::is_code_point_valid(cp) is
// false.
// NOTE(review): the listing omits the first size test (source line 118), the
// final else (line 129) and the return statement, so the one-octet and
// four-octet conditions are inferred from the surrounding branches.
112 template <
typename octet_iterator>
113 octet_iterator append(uint32_t cp, octet_iterator result)
115 if (!internal::is_code_point_valid(cp))
116 throw invalid_code_point(cp);
// Single octet: the code point is stored unchanged.
119 *(result++) = static_cast<uint8_t>(cp);
// cp < 0x800: two octets - lead marked 0xc0, one trail marked 0x80.
120 else if (cp < 0x800) {
121 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
122 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
// cp < 0x10000: three octets - lead marked 0xe0, two trails of 6 bits each.
124 else if (cp < 0x10000) {
125 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
126 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
127 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
// Remaining case: four octets - lead marked 0xf0, three trails of 6 bits each.
130 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
131 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
132 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
133 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
// Decodes the sequence starting at `it` via internal::validate_next(), which
// receives `it` by reference together with the address of `cp`, and turns
// the reported error code into an exception:
//   NOT_ENOUGH_ROOM                                   -> not_enough_room
//   INVALID_LEAD / INCOMPLETE_SEQUENCE / OVERLONG_SEQUENCE
//                                                     -> invalid_utf8(*it)
//   INVALID_CODE_POINT                                -> invalid_code_point(cp)
// NOTE(review): the declaration of `cp`, the switch header, the UTF8_OK body
// and the return statement are on source lines missing from this listing;
// presumably the decoded code point is returned on success - confirm.
138 template <
typename octet_iterator>
139 uint32_t next(octet_iterator& it, octet_iterator end)
142 internal::utf_error err_code = internal::validate_next(it, end, &cp);
144 case internal::UTF8_OK :
146 case internal::NOT_ENOUGH_ROOM :
147 throw not_enough_room();
148 case internal::INVALID_LEAD :
149 case internal::INCOMPLETE_SEQUENCE :
150 case internal::OVERLONG_SEQUENCE :
151 throw invalid_utf8(*it);
152 case internal::INVALID_CODE_POINT :
153 throw invalid_code_point(cp);
// Returns what next(it, end) returns without moving the caller's iterator:
// `it` is received by value, so only the local copy is advanced. Any
// exception thrown by next() propagates unchanged.
158 template <
typename octet_iterator>
159 uint32_t peek_next(octet_iterator it, octet_iterator end)
161 return next(it, end);
// Moves `it` backwards to the start of the preceding sequence and returns
// the value next() decodes there.
//
// The original position is kept in `end`; `it` is then decremented for as
// long as the octet it lands on satisfies internal::is_trail(). Decoding is
// done on a copy (`temp`) bounded by `end`, so `it` is left at the start of
// the sequence.
// NOTE(review): the condition guarding the throw on source line 170 sits on
// a line missing from this listing (presumably a check against `start`);
// confirm before relying on the exact failure behaviour.
164 template <
typename octet_iterator>
165 uint32_t prior(octet_iterator& it, octet_iterator start)
167 octet_iterator end = it;
168 while (internal::is_trail(*(--it)))
170 throw invalid_utf8(*it);
171 octet_iterator temp = it;
172 return next(temp, end);
// Moves `it` backwards to the start of the preceding sequence and returns
// the value next() decodes there.
//
// The original position is kept in `end`; `it` is decremented while the
// octet it lands on satisfies internal::is_trail(). If `it` becomes equal to
// `pass_start` while still on a trail octet, invalid_utf8(*it) is thrown.
// Decoding is done on a copy (`temp`) bounded by `end`, so `it` is left at
// the start of the sequence.
176 template <
typename octet_iterator>
177 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
179 octet_iterator end = it;
180 while (internal::is_trail(*(--it)))
181 if (it == pass_start)
182 throw invalid_utf8(*it);
183 octet_iterator temp = it;
184 return next(temp, end);
// Runs its loop body `n` times, counting with a distance_type index.
// NOTE(review): the loop body (source line 191) is missing from this
// listing; presumably it calls next(it, end) once per iteration so that
// `it` moves forward by n code points - confirm.
187 template <
typename octet_iterator,
typename distance_type>
188 void advance (octet_iterator& it, distance_type n, octet_iterator end)
190 for (distance_type i = 0; i < n; ++i)
// Counts loop iterations between `first` and `last` in `dist`, whose type is
// the iterator's difference_type.
// The loop condition is `first < last`, so octet_iterator must provide
// operator<.
// NOTE(review): the loop body and the return statement are missing from
// this listing; presumably each iteration consumes one code point via
// next(first, last) and `dist` is returned - confirm.
194 template <
typename octet_iterator>
195 typename std::iterator_traits<octet_iterator>::difference_type
196 distance (octet_iterator first, octet_iterator last)
198 typename std::iterator_traits<octet_iterator>::difference_type dist;
199 for (dist = 0; first < last; ++dist)
// Converts the 16-bit units in [start, end) to UTF-8 octets written through
// `result`.
//
// Each unit is passed through internal::mask16(). When it is a lead
// surrogate, the following unit is read and, if it is a trail surrogate, the
// pair is combined as (lead << 10) + trail + internal::SURROGATE_OFFSET.
// invalid_utf16 is thrown for a unit following a lead surrogate that is not
// a trail surrogate, and for a trail surrogate that appears on its own.
// The resulting code point is encoded with append(), whose returned iterator
// is stored back into `result`.
// NOTE(review): the guard for the throw on source line 219 (presumably a
// lead surrogate at the very end of the input) and the return statement are
// on lines missing from this listing.
204 template <
typename u16bit_iterator,
typename octet_iterator>
205 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
207 while (start != end) {
208 uint32_t cp = internal::mask16(*start++);
210 if (internal::is_lead_surrogate(cp)) {
212 uint32_t trail_surrogate = internal::mask16(*start++);
213 if (internal::is_trail_surrogate(trail_surrogate))
214 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
216 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
219 throw invalid_utf16(static_cast<uint16_t>(cp));
223 else if (internal::is_trail_surrogate(cp))
224 throw invalid_utf16(static_cast<uint16_t>(cp));
226 result = append(cp, result);
// Converts the UTF-8 octets in [start, end) to 16-bit units written through
// `result`. Each code point is obtained with next(), so its exceptions
// propagate to the caller.
// NOTE(review): the test choosing between the two output forms (source line
// 236) and the return statement are missing from this listing; presumably
// code points above 0xffff take the two-unit form - confirm.
231 template <
typename u16bit_iterator,
typename octet_iterator>
232 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
234 while (start != end) {
235 uint32_t cp = next(start, end);
// Two-unit form: first unit from the bits above the low 10 plus LEAD_OFFSET,
// second unit from the low 10 bits plus TRAIL_SURROGATE_MIN.
237 *result++ =
static_cast<uint16_t
>((cp >> 10) + internal::LEAD_OFFSET);
238 *result++ =
static_cast<uint16_t
>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
// One-unit form: the code point narrowed to 16 bits.
241 *result++ =
static_cast<uint16_t
>(cp);
// Encodes 32-bit code points as UTF-8: the visible statement reads one value
// from `start` (post-incrementing it), passes it to append() and stores the
// returned iterator back into `result`. Exceptions from append() propagate.
// NOTE(review): the enclosing loop and the return statement are on source
// lines missing from this listing; presumably the statement runs for every
// element of [start, end) - confirm.
246 template <
typename octet_iterator,
typename u32bit_iterator>
247 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
250 result = append(*(start++), result);
// Decodes UTF-8 into 32-bit code points: the visible statement stores the
// value returned by next(start, end) through `result` and post-increments
// `result`. Exceptions from next() propagate.
// NOTE(review): the enclosing loop and the return statement are on source
// lines missing from this listing; presumably the statement runs until
// `start` reaches `end` - confirm.
255 template <
typename octet_iterator,
typename u32bit_iterator>
256 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
259 (*result++) = next(start, end);
// Bidirectional iterator adaptor over an octet_iterator whose value type is
// uint32_t (per the std::iterator base it derives from).
// range_start / range_end hold the bounds supplied to the constructor; they
// are used for position validation and for comparing iterators.
// NOTE(review): the declaration of the current-position member `it` is on a
// source line missing from this listing.
265 template <
typename octet_iterator>
266 class iterator :
public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
268 octet_iterator range_start;
269 octet_iterator range_end;
// Constructs an iterator at position `octet_it` within the range
// [range_start_in, range_end_in].
// Throws std::out_of_range when the position lies before range_start or
// after range_end; the check relies on operator< and operator> being
// available for octet_iterator.
272 explicit iterator (
const octet_iterator& octet_it,
273 const octet_iterator& range_start_in,
274 const octet_iterator& range_end_in) :
275 it(octet_it), range_start(range_start_in), range_end(range_end_in)
277 if (it < range_start || it > range_end)
278 throw std::out_of_range(
"Invalid utf-8 iterator position");
// Returns a copy of the underlying octet iterator at the current position.
281 octet_iterator base ()
const {
return it; }
// Returns the value next() decodes at the current position, bounded by
// range_end. Decoding runs on a copy (`temp`), so the iterator itself does
// not move. Exceptions from next() propagate.
282 uint32_t operator * ()
const
284 octet_iterator temp = it;
285 return next(temp, range_end);
// Equality: true when both iterators are at the same underlying position.
// Throws std::logic_error if the two iterators were built over different
// ranges (range_start or range_end differs), since such a comparison is
// treated as a programming error rather than as "not equal".
287 bool operator == (
const iterator& rhs)
const
289 if (range_start != rhs.range_start || range_end != rhs.range_end)
290 throw std::logic_error(
"Comparing utf-8 iterators defined with different ranges");
291 return (it == rhs.it);
// Inequality: the negation of operator==, so it throws std::logic_error
// under the same different-ranges condition.
293 bool operator != (
const iterator& rhs)
const
295 return !(operator == (rhs));
310 prior(it, range_start);
316 prior(it, range_start);