1 |
786 |
skrzyp |
// This file is part of the uSTL library, an STL implementation.
|
2 |
|
|
//
|
3 |
|
|
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
|
4 |
|
|
// This file is free software, distributed under the MIT License.
|
5 |
|
|
//
|
6 |
|
|
// This file contains stream iterators that read and write UTF-8 encoded
|
7 |
|
|
// characters. The encoding is defined as follows:
|
8 |
|
|
//
|
9 |
|
|
// U-00000000 - U-0000007F: 0xxxxxxx
|
10 |
|
|
// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
|
11 |
|
|
// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
|
12 |
|
|
// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
13 |
|
|
// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
14 |
|
|
// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
15 |
|
|
// U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
16 |
|
|
|
17 |
|
|
#ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
|
18 |
|
|
#define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
|
19 |
|
|
|
20 |
|
|
#include "uiterator.h"
|
21 |
|
|
|
22 |
|
|
namespace ustl {
|
23 |
|
|
|
24 |
|
|
//----------------------------------------------------------------------
|
25 |
|
|
|
26 |
|
|
typedef uint8_t utf8subchar_t; ///< Type for the encoding subcharacters.
|
27 |
|
|
|
28 |
|
|
//----------------------------------------------------------------------
|
29 |
|
|
|
30 |
|
|
inline size_t Utf8Bytes (wchar_t v) __attribute__((const));
|
31 |
|
|
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last) __attribute__((pure));
|
32 |
|
|
inline size_t Utf8SequenceBytes (wchar_t c) __attribute__((const));
|
33 |
|
|
|
34 |
|
|
//----------------------------------------------------------------------
|
35 |
|
|
|
36 |
|
|
/// Returns the number of bytes required to UTF-8 encode \p v.
///
/// \p v is treated as an unsigned 32-bit value, so a negative wchar_t maps
/// to the 7-byte form described in the table at the top of this file.
/// The result is always in the range 1..7.
///
/// A single portable table lookup is used on every platform; it yields
/// exactly the same values as the (bsr(v) + 4) / 5 formula, but the compiler
/// can see through it and fold it for constant arguments.
inline size_t Utf8Bytes (wchar_t v)
{
    // c_Bounds[k] is the largest value that fits into a (k + 1)-byte sequence.
    // The final entry is UINT32_MAX, which guarantees the scan terminates.
    static const uint32_t c_Bounds[7] = {
        0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF,
        0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF
    };
    const uint32_t uv = static_cast<uint32_t>(v);
    size_t n = 0;
    while (c_Bounds[n] < uv)
        ++n;
    return (n + 1);
}
|
53 |
|
|
|
54 |
|
|
/// Measures the size of a wchar_t array in UTF-8 encoding.
|
55 |
|
|
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)
|
56 |
|
|
{
|
57 |
|
|
size_t bc = 0;
|
58 |
|
|
for (; first < last; ++first)
|
59 |
|
|
bc += Utf8Bytes(*first);
|
60 |
|
|
return (bc);
|
61 |
|
|
}
|
62 |
|
|
|
63 |
|
|
/// Returns the number of bytes in a UTF-8 sequence that starts with \p c.
///
/// The length is encoded in the lead byte as a run of 1 bits, starting at
/// bit 7 and terminated by a 0 bit:
///   no leading 1   - plain single byte character, length 1.
///   one leading 1  - a continuation byte, i.e. the reader is in the middle
///                    of a character. It is reported as length 1 so that a
///                    misaligned reader steps byte by byte until it reaches
///                    the next lead byte.
///   two or more    - lead byte of a multibyte character of that many bytes.
/// Malformed input is not reported; the caller could not correct it anyway.
/// \p c is a wchar_t only to keep the value in a full register.
inline size_t Utf8SequenceBytes (wchar_t c)
{
    size_t nLeadingOnes = 0;
    wchar_t probe = 0x80;
    while (c & probe) {
        probe >>= 1;
        ++nLeadingOnes;
    }
    // A sequence is never shorter than one byte.
    if (nLeadingOnes == 0)
        return (1);
    return (nLeadingOnes);
}
|
79 |
|
|
|
80 |
|
|
//----------------------------------------------------------------------
|
81 |
|
|
|
82 |
|
|
/// \class utf8in_iterator utf8.h ustl.h
|
83 |
|
|
/// \ingroup IteratorAdaptors
|
84 |
|
|
///
|
85 |
|
|
/// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
|
86 |
|
|
///
|
87 |
|
|
/// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
|
88 |
|
|
/// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
|
89 |
|
|
/// There is no error handling; if the reading frame slips you'll get extra
|
90 |
|
|
/// characters, one for every misaligned byte. Although it is possible to skip
|
91 |
|
|
/// to the start of the next character, that would result in omitting the
|
92 |
|
|
/// misformatted character and the one after it, making it very difficult to
|
93 |
|
|
/// detect by the user. It is better to write some strange characters and let
|
94 |
|
|
/// the user know his file is corrupted. Another problem is overflow on bad
|
95 |
|
|
/// encodings (like a 0xFF on the end of a string). This is checked through
|
96 |
|
|
/// the end-of-string nul character, which will always be there as long as
|
97 |
|
|
/// you are using the string class.
|
98 |
|
|
///
|
99 |
|
|
template <typename Iterator, typename WChar = wchar_t>
|
100 |
|
|
class utf8in_iterator {
|
101 |
|
|
public:
|
102 |
|
|
typedef typename iterator_traits<Iterator>::value_type value_type;
|
103 |
|
|
typedef typename iterator_traits<Iterator>::difference_type difference_type;
|
104 |
|
|
typedef typename iterator_traits<Iterator>::pointer pointer;
|
105 |
|
|
typedef typename iterator_traits<Iterator>::reference reference;
|
106 |
|
|
public:
|
107 |
|
|
explicit utf8in_iterator (const Iterator& is) : m_i (is), m_v (0) { Read(); }
|
108 |
|
|
utf8in_iterator (const utf8in_iterator& i) : m_i (i.m_i), m_v (i.m_v) {}
|
109 |
|
|
inline const utf8in_iterator& operator= (const utf8in_iterator& i) { m_i = i.m_i; m_v = i.m_v; return (*this); }
|
110 |
|
|
inline Iterator base (void) const { return (m_i - (Utf8Bytes(m_v) - 1)); }
|
111 |
|
|
/// Reads and returns the next value.
|
112 |
|
|
inline WChar operator* (void) const { return (m_v); }
|
113 |
|
|
inline utf8in_iterator& operator++ (void) { ++m_i; Read(); return (*this); }
|
114 |
|
|
inline utf8in_iterator operator++ (int) { utf8in_iterator old (*this); operator++(); return (old); }
|
115 |
|
|
inline utf8in_iterator& operator+= (uoff_t n) { while (n--) operator++(); return (*this); }
|
116 |
|
|
inline utf8in_iterator operator+ (uoff_t n) { utf8in_iterator v (*this); return (v += n); }
|
117 |
|
|
inline bool operator== (const utf8in_iterator& i) const { return (m_i == i.m_i); }
|
118 |
|
|
inline bool operator< (const utf8in_iterator& i) const { return (m_i < i.m_i); }
|
119 |
|
|
difference_type operator- (const utf8in_iterator& i) const;
|
120 |
|
|
private:
|
121 |
|
|
void Read (void);
|
122 |
|
|
private:
|
123 |
|
|
Iterator m_i;
|
124 |
|
|
WChar m_v;
|
125 |
|
|
};
|
126 |
|
|
|
127 |
|
|
/// Steps to the next character and updates current returnable value.
|
128 |
|
|
template <typename Iterator, typename WChar>
|
129 |
|
|
void utf8in_iterator<Iterator,WChar>::Read (void)
|
130 |
|
|
{
|
131 |
|
|
const utf8subchar_t c = *m_i;
|
132 |
|
|
size_t nBytes = Utf8SequenceBytes (c);
|
133 |
|
|
m_v = c & (0xFF >> nBytes); // First byte contains bits after the header.
|
134 |
|
|
while (--nBytes && *++m_i) // Each subsequent byte has 6 bits.
|
135 |
|
|
m_v = (m_v << 6) | (*m_i & 0x3F);
|
136 |
|
|
}
|
137 |
|
|
|
138 |
|
|
/// Returns the distance in characters (as opposed to the distance in bytes).
|
139 |
|
|
template <typename Iterator, typename WChar>
|
140 |
|
|
typename utf8in_iterator<Iterator,WChar>::difference_type
|
141 |
|
|
utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
|
142 |
|
|
{
|
143 |
|
|
difference_type dist = 0;
|
144 |
|
|
for (Iterator first (last.m_i); first < m_i; ++dist)
|
145 |
|
|
first = advance (first, Utf8SequenceBytes (*first));
|
146 |
|
|
return (dist);
|
147 |
|
|
}
|
148 |
|
|
|
149 |
|
|
//----------------------------------------------------------------------
|
150 |
|
|
|
151 |
|
|
/// \class utf8out_iterator utf8.h ustl.h
|
152 |
|
|
/// \ingroup IteratorAdaptors
|
153 |
|
|
///
|
154 |
|
|
/// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
|
155 |
|
|
///
|
156 |
|
|
template <typename Iterator, typename WChar = wchar_t>
|
157 |
|
|
class utf8out_iterator {
|
158 |
|
|
public:
|
159 |
|
|
typedef typename iterator_traits<Iterator>::value_type value_type;
|
160 |
|
|
typedef typename iterator_traits<Iterator>::difference_type difference_type;
|
161 |
|
|
typedef typename iterator_traits<Iterator>::pointer pointer;
|
162 |
|
|
typedef typename iterator_traits<Iterator>::reference reference;
|
163 |
|
|
public:
|
164 |
|
|
explicit utf8out_iterator (const Iterator& os) : m_i (os) {}
|
165 |
|
|
utf8out_iterator (const utf8out_iterator& i) : m_i (i.m_i) {}
|
166 |
|
|
inline const Iterator& base (void) const { return (m_i); }
|
167 |
|
|
/// Writes \p v into the stream.
|
168 |
|
|
utf8out_iterator& operator= (WChar v);
|
169 |
|
|
inline utf8out_iterator& operator* (void) { return (*this); }
|
170 |
|
|
inline utf8out_iterator& operator++ (void) { return (*this); }
|
171 |
|
|
inline utf8out_iterator operator++ (int) { return (*this); }
|
172 |
|
|
inline bool operator== (const utf8out_iterator& i) const { return (m_i == i.m_i); }
|
173 |
|
|
inline bool operator< (const utf8out_iterator& i) const { return (m_i < i.m_i); }
|
174 |
|
|
private:
|
175 |
|
|
Iterator m_i;
|
176 |
|
|
};
|
177 |
|
|
|
178 |
|
|
/// Writes \p v into the stream.
|
179 |
|
|
template <typename Iterator, typename WChar>
|
180 |
|
|
utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
|
181 |
|
|
{
|
182 |
|
|
const size_t nBytes = Utf8Bytes (v);
|
183 |
|
|
if (nBytes > 1) {
|
184 |
|
|
// Write the bits 6 bits at a time, except for the first one,
|
185 |
|
|
// which may be less than 6 bits.
|
186 |
|
|
register wchar_t shift = nBytes * 6;
|
187 |
|
|
*m_i++ = ((v >> (shift -= 6)) & 0x3F) | (0xFF << (8 - nBytes));
|
188 |
|
|
while (shift)
|
189 |
|
|
*m_i++ = ((v >> (shift -= 6)) & 0x3F) | 0x80;
|
190 |
|
|
} else // If only one byte, there is no header.
|
191 |
|
|
*m_i++ = v;
|
192 |
|
|
return (*this);
|
193 |
|
|
}
|
194 |
|
|
|
195 |
|
|
//----------------------------------------------------------------------
|
196 |
|
|
|
197 |
|
|
/// Returns a UTF-8 adaptor writing to \p i. Useful in conjuction with back_insert_iterator.
|
198 |
|
|
template <typename Iterator>
|
199 |
|
|
inline utf8out_iterator<Iterator> utf8out (Iterator i)
|
200 |
|
|
{
|
201 |
|
|
return (utf8out_iterator<Iterator> (i));
|
202 |
|
|
}
|
203 |
|
|
|
204 |
|
|
/// Returns a UTF-8 adaptor reading from \p i.
|
205 |
|
|
template <typename Iterator>
|
206 |
|
|
inline utf8in_iterator<Iterator> utf8in (Iterator i)
|
207 |
|
|
{
|
208 |
|
|
return (utf8in_iterator<Iterator> (i));
|
209 |
|
|
}
|
210 |
|
|
|
211 |
|
|
//----------------------------------------------------------------------
|
212 |
|
|
|
213 |
|
|
} // namespace ustl
|
214 |
|
|
|
215 |
|
|
#endif
|