URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [utf8.h] - Blame information for rev 786

Details | Compare with Previous | View Log


// This file is part of the uSTL library, an STL implementation.
//
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
// This file contains stream iterators that read and write UTF-8 encoded
// characters. The encoding is defined as follows:
//
// U-00000000 - U-0000007F: 0xxxxxxx
// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 
#ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
#define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
 
#include "uiterator.h"
 
namespace ustl {
 
//----------------------------------------------------------------------
 
typedef uint8_t utf8subchar_t;  ///< Type of a single byte of a UTF-8 encoded sequence.
 
//----------------------------------------------------------------------
 
// Forward declarations, present so the GCC function attributes can be
// attached. "const" tells the compiler the result depends only on the
// argument value; "pure" additionally permits reading memory (here, the
// wchar_t array between first and last). Neither function has side effects.
inline size_t Utf8Bytes (wchar_t v) __attribute__((const));
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last) __attribute__((pure));
inline size_t Utf8SequenceBytes (wchar_t c) __attribute__((const));
 
//----------------------------------------------------------------------
 
/// \brief Returns the number of bytes required to UTF-8 encode \p v.
///
/// \p v is interpreted as an unsigned 32-bit value, so the result is always
/// in the range 1..7, following the encoding table at the top of this file.
/// The lookup is plain portable C++: it does not depend on the width of
/// wchar_t or size_t and needs no target-specific assembly.
inline size_t Utf8Bytes (wchar_t v)
{
    // c_Bounds[k] is the largest value representable in a (k + 1)-byte sequence.
    static const uint32_t c_Bounds[7] = { 0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
    const uint32_t uv = uint32_t(v);
    size_t n = 0;
    // The last bound is UINT32_MAX, so the scan always stops inside the table.
    while (c_Bounds[n] < uv)
        ++n;
    return (n + 1);
}
 
/// Returns the total number of bytes needed to UTF-8 encode every
/// character in the half-open range [first, last).
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)
{
    size_t total = 0;
    while (first < last)
        total += Utf8Bytes (*first++);
    return (total);
}
 
/// Returns the number of bytes in a UTF-8 sequence that starts with \p c.
inline size_t Utf8SequenceBytes (wchar_t c)     // a wchar_t to keep c in a full register
{
    // The length is the count of consecutive 1 bits at the top of the low
    // byte of c (the header is nBytes ones followed by a zero):
    //  0 ones     - single byte character; reported as length 1.
    //  1 one      - a continuation byte, i.e. the middle of a character.
    //               Also reported as length 1, so a misaligned reader
    //               advances one byte at a time until the next header.
    //  2 or more  - header of a multibyte character of that many bytes.
    // Malformed headers are not rejected, since the user can not correct them.
    //
    size_t nLeadingOnes = 0;
    for (wchar_t bit = 0x80; c & bit; bit >>= 1)
        ++nLeadingOnes;
    if (nLeadingOnes == 0)
        return (1);     // A sequence is always at least 1 byte.
    return (nLeadingOnes);
}
 
//----------------------------------------------------------------------
 
/// \class utf8in_iterator utf8.h ustl.h
/// \ingroup IteratorAdaptors
///
/// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
///
/// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
/// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
/// There is no error handling; if the reading frame slips you'll get extra
/// characters, one for every misaligned byte. Although it is possible to skip
/// to the start of the next character, that would result in omitting the
/// malformed character and the one after it, making the damage very difficult
/// for the user to detect. It is better to write some strange characters and
/// let the user know the file is corrupted. Another problem is overflow on bad
/// encodings (like a 0xFF at the end of a string). This is checked through
/// the end-of-string nul character, which will always be there as long as
/// you are using the string class.
///
/// Note that construction decodes a character immediately, so even an
/// adaptor built from an end position dereferences that position.
///
template <typename Iterator, typename WChar = wchar_t>
class utf8in_iterator {
public:
    typedef typename iterator_traits<Iterator>::value_type      value_type;
    typedef typename iterator_traits<Iterator>::difference_type difference_type;
    typedef typename iterator_traits<Iterator>::pointer         pointer;
    typedef typename iterator_traits<Iterator>::reference       reference;
public:
    /// Wraps \p is and decodes the character found there.
    explicit                    utf8in_iterator (const Iterator& is)            : m_i (is), m_v (0) { Read(); }
    /// Copies both the position and the already decoded character; nothing is re-read.
                                utf8in_iterator (const utf8in_iterator& i)      : m_i (i.m_i), m_v (i.m_v) {}
    inline const utf8in_iterator& operator= (const utf8in_iterator& i)          { m_i = i.m_i; m_v = i.m_v; return (*this); }
    /// Returns the underlying position stepped back by the encoded length of
    /// the current character minus one. Presumably this is the first byte of
    /// the current character when the input was minimally encoded.
    inline Iterator             base (void) const       { return (m_i - (Utf8Bytes(m_v) - 1)); }
    /// Returns the character decoded by the most recent read; does not touch the input.
    inline WChar                operator* (void) const  { return (m_v); }
    /// Moves to the next byte and decodes the character starting there.
    inline utf8in_iterator&     operator++ (void)       { ++m_i; Read(); return (*this); }
    inline utf8in_iterator      operator++ (int)        { utf8in_iterator old (*this); operator++(); return (old); }
    /// Advances by \p n characters (not bytes), decoding each one on the way.
    inline utf8in_iterator&     operator+= (uoff_t n)   { while (n--) operator++(); return (*this); }
    /// Returns a copy advanced by \p n characters. Const because *this is not
    /// modified, which lets it be called on const iterators.
    inline utf8in_iterator      operator+ (uoff_t n) const { utf8in_iterator v (*this); return (v += n); }
    /// Comparisons look only at the underlying position, not at the decoded value.
    inline bool                 operator== (const utf8in_iterator& i) const     { return (m_i == i.m_i); }
    inline bool                 operator< (const utf8in_iterator& i) const      { return (m_i < i.m_i); }
    difference_type             operator- (const utf8in_iterator& i) const;
private:
    void                        Read (void);
private:
    Iterator                    m_i;    ///< Underlying position in the byte sequence.
    WChar                       m_v;    ///< The most recently decoded character.
};
 
/// Decodes the character that starts at the current position into m_v,
/// leaving m_i on the last byte consumed.
template <typename Iterator, typename WChar>
void utf8in_iterator<Iterator,WChar>::Read (void)
{
    const utf8subchar_t lead = *m_i;
    size_t nRemaining = Utf8SequenceBytes (lead);
    // The lead byte contributes whatever bits follow its length header.
    m_v = lead & (0xFF >> nRemaining);
    // Every following byte contributes its low 6 bits.
    for (;;) {
        if (!--nRemaining)
            break;      // The whole sequence has been consumed.
        ++m_i;
        if (!*m_i)
            break;      // A nul byte ends decoding early (truncated input).
        m_v = (m_v << 6) | (*m_i & 0x3F);
    }
}
 
/// Returns the distance in characters (as opposed to the distance in bytes)
/// from the position held by \p last up to the position held by this iterator.
template <typename Iterator, typename WChar>
typename utf8in_iterator<Iterator,WChar>::difference_type
utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
{
    difference_type nChars = 0;
    Iterator pos (last.m_i);
    while (pos < m_i) {
        // Hop over one whole sequence, using the length implied by its first byte.
        pos = advance (pos, Utf8SequenceBytes (*pos));
        ++nChars;
    }
    return (nChars);
}
 
//----------------------------------------------------------------------
 
/// \class utf8out_iterator utf8.h ustl.h
/// \ingroup IteratorAdaptors
///
/// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
///
/// Assigning a \p WChar to the adaptor writes its encoded bytes through the
/// wrapped output position and advances that position. Dereference and both
/// increment operators do nothing except return the adaptor itself, so the
/// usual "*out++ = value" and "*out = value; ++out" idioms both work.
///
template <typename Iterator, typename WChar = wchar_t>
class utf8out_iterator {
public:
    typedef typename iterator_traits<Iterator>::value_type      value_type;
    typedef typename iterator_traits<Iterator>::difference_type difference_type;
    typedef typename iterator_traits<Iterator>::pointer         pointer;
    typedef typename iterator_traits<Iterator>::reference       reference;
public:
    /// Wraps the output position \p os.
    explicit                    utf8out_iterator (const Iterator& os) : m_i (os) {}
                                utf8out_iterator (const utf8out_iterator& i) : m_i (i.m_i) {}
    /// Returns the current underlying output position.
    inline const Iterator&      base (void) const { return (m_i); }
    /// Writes \p v into the stream.
    utf8out_iterator&           operator= (WChar v);
    inline utf8out_iterator&    operator* (void) { return (*this); }
    inline utf8out_iterator&    operator++ (void) { return (*this); }
    /// Returns a reference rather than a copy: with a copy, "*out++ = v"
    /// would write through, and advance, only the temporary, leaving this
    /// adaptor's position unchanged.
    inline utf8out_iterator&    operator++ (int) { return (*this); }
    /// Comparisons look only at the underlying output position.
    inline bool                 operator== (const utf8out_iterator& i) const { return (m_i == i.m_i); }
    inline bool                 operator< (const utf8out_iterator& i) const { return (m_i < i.m_i); }
private:
    Iterator                    m_i;    ///< Underlying output position.
};
 
/// \brief Writes \p v into the stream as a UTF-8 sequence.
///
/// Emits Utf8Bytes(v) bytes through the underlying iterator, advancing it
/// once per byte, and returns this adaptor. Values below 128 are written
/// unchanged as a single byte.
template <typename Iterator, typename WChar>
utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
{
    const size_t nBytes = Utf8Bytes (v);
    if (nBytes > 1) {
        // Work on an unsigned copy so right shifts never smear a sign bit
        // into the payload when WChar is a signed type.
        const uint32_t uv = static_cast<uint32_t>(v);
        // Bit offset of the payload carried by the lead byte.
        size_t shift = (nBytes - 1) * 6;
        // Header: nBytes one-bits followed by a zero, kept to 8 bits so no
        // stray high bits reach destinations wider than a char.
        const uint32_t header = (0xFFu << (8 - nBytes)) & 0xFFu;
        // A 7-byte sequence has no payload in its lead byte; its offset (36)
        // is beyond the 32-bit value, and shifting that far is undefined.
        const uint32_t leadBits = (shift < 32) ? ((uv >> shift) & 0x3F) : 0;
        *m_i++ = static_cast<uint8_t>(header | leadBits);
        // Each continuation byte carries the next 6 bits, most significant first.
        while (shift) {
            shift -= 6;
            *m_i++ = static_cast<uint8_t>(((uv >> shift) & 0x3F) | 0x80);
        }
    } else      // If only one byte, there is no header.
        *m_i++ = v;
    return (*this);
}
 
//----------------------------------------------------------------------
 
/// Returns a UTF-8 adaptor writing to \p i. Useful in conjunction with back_insert_iterator.
template <typename Iterator>
inline utf8out_iterator<Iterator> utf8out (Iterator i)
{
    typedef utf8out_iterator<Iterator> adaptor_t;
    return (adaptor_t (i));
}
 
/// Returns a UTF-8 adaptor reading from \p i, with the default wchar_t
/// as the decoded character type.
template <typename Iterator>
inline utf8in_iterator<Iterator> utf8in (Iterator i)
{
    typedef utf8in_iterator<Iterator> adaptor_t;
    return (adaptor_t (i));
}
 
//----------------------------------------------------------------------
 
} // namespace ustl
 
#endif

Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [utf8.h] - Blame information for rev 786

Line No.	Rev	Author	Line
1	786	skrzyp	`// This file is part of the uSTL library, an STL implementation.`
2			`//`
3			`// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>`
4			`// This file is free software, distributed under the MIT License.`
5			`//`
6			`// This file contains stream iterators that read and write UTF-8 encoded`
7			`// characters. The encoding is defined as follows:`
8			`//`
9			`// U-00000000 - U-0000007F: 0xxxxxxx`
10			`// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx`
11			`// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx`
12			`// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
13			`// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
14			`// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
15			`// U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
16
17			`#ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4`
18			`#define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4`
19
20			`#include "uiterator.h"`
21
22			`namespace ustl {`
23
24			`//----------------------------------------------------------------------`
25
26			`typedef uint8_t utf8subchar_t; ///< Type for the encoding subcharacters.`
27
28			`//----------------------------------------------------------------------`
29
30			`inline size_t Utf8Bytes (wchar_t v) __attribute__((const));`
31			`inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last) __attribute__((pure));`
32			`inline size_t Utf8SequenceBytes (wchar_t c) __attribute__((const));`
33
34			`//----------------------------------------------------------------------`
35
36			`/// Returns the number of bytes required to UTF-8 encode \p v.`
37			`inline size_t Utf8Bytes (wchar_t v)`
38			`{`
39			`if ((uint32_t) v < 128)`
40			`return (1);`
41			`size_t n;`
42			`#if __i386__ \|\| __x86_64__`
43			`uint32_t r = 0;`
44			`asm ("bsr\t%2, %%eax\n\t"`
45			`"add\t$4, %0\n\t"`
46			`"div\t%3":"=a"(n),"+d"(r):"r"(v),"c"(5));`
47			`#else`
48			`static const uint32_t c_Bounds[7] = { 0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };`
49			`for (n = 0; c_Bounds[n++] < uint32_t(v););`
50			`#endif`
51			`return (n);`
52			`}`
53
54			`/// Measures the size of a wchar_t array in UTF-8 encoding.`
55			`inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)`
56			`{`
57			`size_t bc = 0;`
58			`for (; first < last; ++first)`
59			`bc += Utf8Bytes(*first);`
60			`return (bc);`
61			`}`
62
63			`/// Returns the number of bytes in a UTF-8 sequence that starts with \p c.`
64			`inline size_t Utf8SequenceBytes (wchar_t c) // a wchar_t to keep c in a full register`
65			`{`
66			`// Count the leading bits. Header bits are 1 * nBytes followed by a 0.`
67			`// 0 - single byte character. Take 7 bits (0xFF >> 1)`
68			`// 1 - error, in the middle of the character. Take 6 bits (0xFF >> 2)`
69			`// so you will keep reading invalid entries until you hit the next character.`
70			`// >2 - multibyte character. Take remaining bits, and get the next bytes.`
71			`// All errors are ignored, since the user can not correct them.`
72			`//`
73			`wchar_t mask = 0x80;`
74			`size_t nBytes = 0;`
75			`for (; c & mask; ++nBytes)`
76			`mask >>= 1;`
77			`return (nBytes ? nBytes : 1); // A sequence is always at least 1 byte.`
78			`}`
79
80			`//----------------------------------------------------------------------`
81
82			`/// \class utf8in_iterator utf8.h ustl.h`
83			`/// \ingroup IteratorAdaptors`
84			`///`
85			`/// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.`
86			`///`
87			`/// For example, you can copy from ustl::string to ustl::vector<wchar_t> with`
88			`/// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));`
89			`/// There is no error handling; if the reading frame slips you'll get extra`
90			`/// characters, one for every misaligned byte. Although it is possible to skip`
91			`/// to the start of the next character, that would result in omitting the`
92			`/// misformatted character and the one after it, making it very difficult to`
93			`/// detect by the user. It is better to write some strange characters and let`
94			`/// the user know his file is corrupted. Another problem is overflow on bad`
95			`/// encodings (like a 0xFF on the end of a string). This is checked through`
96			`/// the end-of-string nul character, which will always be there as long as`
97			`/// you are using the string class.`
98			`///`
99			`template <typename Iterator, typename WChar = wchar_t>`
100			`class utf8in_iterator {`
101			`public:`
102			`typedef typename iterator_traits<Iterator>::value_type value_type;`
103			`typedef typename iterator_traits<Iterator>::difference_type difference_type;`
104			`typedef typename iterator_traits<Iterator>::pointer pointer;`
105			`typedef typename iterator_traits<Iterator>::reference reference;`
106			`public:`
107			`explicit utf8in_iterator (const Iterator& is) : m_i (is), m_v (0) { Read(); }`
108			`utf8in_iterator (const utf8in_iterator& i) : m_i (i.m_i), m_v (i.m_v) {}`
109			`inline const utf8in_iterator& operator= (const utf8in_iterator& i) { m_i = i.m_i; m_v = i.m_v; return (*this); }`
110			`inline Iterator base (void) const { return (m_i - (Utf8Bytes(m_v) - 1)); }`
111			`/// Reads and returns the next value.`
112			`inline WChar operator* (void) const { return (m_v); }`
113			`inline utf8in_iterator& operator++ (void) { ++m_i; Read(); return (*this); }`
114			`inline utf8in_iterator operator++ (int) { utf8in_iterator old (*this); operator++(); return (old); }`
115			`inline utf8in_iterator& operator+= (uoff_t n) { while (n--) operator++(); return (*this); }`
116			`inline utf8in_iterator operator+ (uoff_t n) { utf8in_iterator v (*this); return (v += n); }`
117			`inline bool operator== (const utf8in_iterator& i) const { return (m_i == i.m_i); }`
118			`inline bool operator< (const utf8in_iterator& i) const { return (m_i < i.m_i); }`
119			`difference_type operator- (const utf8in_iterator& i) const;`
120			`private:`
121			`void Read (void);`
122			`private:`
123			`Iterator m_i;`
124			`WChar m_v;`
125			`};`
126
127			`/// Steps to the next character and updates current returnable value.`
128			`template <typename Iterator, typename WChar>`
129			`void utf8in_iterator<Iterator,WChar>::Read (void)`
130			`{`
131			`const utf8subchar_t c = *m_i;`
132			`size_t nBytes = Utf8SequenceBytes (c);`
133			`m_v = c & (0xFF >> nBytes); // First byte contains bits after the header.`
134			`while (--nBytes && *++m_i) // Each subsequent byte has 6 bits.`
135			`m_v = (m_v << 6) \| (*m_i & 0x3F);`
136			`}`
137
138			`/// Returns the distance in characters (as opposed to the distance in bytes).`
139			`template <typename Iterator, typename WChar>`
140			`typename utf8in_iterator<Iterator,WChar>::difference_type`
141			`utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const`
142			`{`
143			`difference_type dist = 0;`
144			`for (Iterator first (last.m_i); first < m_i; ++dist)`
145			`first = advance (first, Utf8SequenceBytes (*first));`
146			`return (dist);`
147			`}`
148
149			`//----------------------------------------------------------------------`
150
151			`/// \class utf8out_iterator utf8.h ustl.h`
152			`/// \ingroup IteratorAdaptors`
153			`///`
154			`/// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.`
155			`///`
156			`template <typename Iterator, typename WChar = wchar_t>`
157			`class utf8out_iterator {`
158			`public:`
159			`typedef typename iterator_traits<Iterator>::value_type value_type;`
160			`typedef typename iterator_traits<Iterator>::difference_type difference_type;`
161			`typedef typename iterator_traits<Iterator>::pointer pointer;`
162			`typedef typename iterator_traits<Iterator>::reference reference;`
163			`public:`
164			`explicit utf8out_iterator (const Iterator& os) : m_i (os) {}`
165			`utf8out_iterator (const utf8out_iterator& i) : m_i (i.m_i) {}`
166			`inline const Iterator& base (void) const { return (m_i); }`
167			`/// Writes \p v into the stream.`
168			`utf8out_iterator& operator= (WChar v);`
169			`inline utf8out_iterator& operator* (void) { return (*this); }`
170			`inline utf8out_iterator& operator++ (void) { return (*this); }`
171			`inline utf8out_iterator operator++ (int) { return (*this); }`
172			`inline bool operator== (const utf8out_iterator& i) const { return (m_i == i.m_i); }`
173			`inline bool operator< (const utf8out_iterator& i) const { return (m_i < i.m_i); }`
174			`private:`
175			`Iterator m_i;`
176			`};`
177
178			`/// Writes \p v into the stream.`
179			`template <typename Iterator, typename WChar>`
180			`utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)`
181			`{`
182			`const size_t nBytes = Utf8Bytes (v);`
183			`if (nBytes > 1) {`
184			`// Write the bits 6 bits at a time, except for the first one,`
185			`// which may be less than 6 bits.`
186			`register wchar_t shift = nBytes * 6;`
187			`*m_i++ = ((v >> (shift -= 6)) & 0x3F) \| (0xFF << (8 - nBytes));`
188			`while (shift)`
189			`*m_i++ = ((v >> (shift -= 6)) & 0x3F) \| 0x80;`
190			`} else // If only one byte, there is no header.`
191			`*m_i++ = v;`
192			`return (*this);`
193			`}`
194
195			`//----------------------------------------------------------------------`
196
197			`/// Returns a UTF-8 adaptor writing to \p i. Useful in conjuction with back_insert_iterator.`
198			`template <typename Iterator>`
199			`inline utf8out_iterator<Iterator> utf8out (Iterator i)`
200			`{`
201			`return (utf8out_iterator<Iterator> (i));`
202			`}`
203
204			`/// Returns a UTF-8 adaptor reading from \p i.`
205			`template <typename Iterator>`
206			`inline utf8in_iterator<Iterator> utf8in (Iterator i)`
207			`{`
208			`return (utf8in_iterator<Iterator> (i));`
209			`}`
210
211			`//----------------------------------------------------------------------`
212
213			`} // namespace ustl`
214
215			`#endif`