2 // Copyright (C) 2009 Ingo Ruhnke <grumbel@gmail.com>
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
17 #include "util/utf8_iterator.hpp"
21 #include "util/log.hpp"
// Forward declarations of the UTF-8 helpers defined below.
bool has_multibyte_mark(unsigned char c);
uint32_t decode_utf8(const std::string& text, size_t& p);

/**
 * Returns true if @a c matches the bit pattern 10xx.xxxx, i.e. it is a
 * continuation byte: the 2nd, 3rd or 4th byte of a multibyte UTF-8
 * sequence.
 */
bool has_multibyte_mark(unsigned char c)
{
  return (c & 0300) == 0200;
}

/**
 * Gets the unicode character at byte position @a p of the UTF-8 encoded
 * @a text, then advances @a p to the next character.
 *
 * Decoding at p == text.size() reads the string's terminating '\0',
 * returns 0 and leaves p at text.size() + 1. On failure @a p is left
 * unchanged.
 *
 * @param text UTF-8 encoded string to read from.
 * @param p    Byte offset of the sequence to decode; advanced past the
 *             decoded sequence on success.
 * @return The decoded code point.
 *
 * @throws std::runtime_error if decoding fails. Truncated sequences and
 *         positions beyond the end of @a text are reported as
 *         std::range_error, which derives from std::runtime_error.
 *
 * Overlong encodings, surrogates and values above U+10FFFF are not
 * rejected here.
 * See unicode standard section 3.10 table 3-5 and 3-6 for details.
 */
uint32_t decode_utf8(const std::string& text, size_t& p)
{
  // Indexing past text.size() is undefined behaviour, so refuse it up front.
  if (p > text.size()) throw std::range_error("Malformed utf-8 sequence");

  // For a const std::string, text[text.size()] is well defined and yields '\0'.
  const unsigned char lead = static_cast<unsigned char>(text[p]);

  // A sequence must never start with a continuation byte.
  if (has_multibyte_mark(lead)) throw std::runtime_error("Malformed utf-8 sequence");

  size_t length = 0;        // total number of bytes in the sequence
  uint32_t code_point = 0;  // payload bits collected so far

  if ((lead & 0200) == 0000) {
    // 0xxx.xxxx: 1 byte sequence
    length = 1;
    code_point = lead;
  }
  else if ((lead & 0340) == 0300) {
    // 110x.xxxx: 2 byte sequence
    length = 2;
    code_point = lead & 0037u;
  }
  else if ((lead & 0360) == 0340) {
    // 1110.xxxx: 3 byte sequence
    length = 3;
    code_point = lead & 0017u;
  }
  else if ((lead & 0370) == 0360) {
    // 1111.0xxx: 4 byte sequence
    length = 4;
    code_point = lead & 0007u;
  }
  else {
    throw std::runtime_error("Malformed utf-8 sequence");
  }

  // All continuation bytes must lie inside the string.
  if (length > 1 && p + (length - 1) >= text.size()) {
    throw std::range_error("Malformed utf-8 sequence");
  }

  // Each continuation byte contributes its low six bits. Walking the
  // offsets 1..length-1 in a loop guarantees every byte of the sequence
  // is read exactly once.
  for (size_t offset = 1; offset < length; ++offset) {
    const unsigned char next = static_cast<unsigned char>(text[p + offset]);
    if (!has_multibyte_mark(next)) throw std::runtime_error("Malformed utf-8 sequence");
    code_point = (code_point << 6) | (next & 0077u);
  }

  p += length;
  return code_point;
}
89 UTF8Iterator::UTF8Iterator(const std::string& text_) :
95 chr = decode_utf8(text, pos);
96 } catch (std::exception) {
97 log_debug << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
103 UTF8Iterator::done() const
105 return pos > text.size();
109 UTF8Iterator::operator++() {
111 chr = decode_utf8(text, pos);
112 } catch (std::exception) {
113 log_debug << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
122 UTF8Iterator::operator*() const {