OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [utf8.h] - Blame information for rev 786

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 786 skrzyp
// This file is part of the uSTL library, an STL implementation.
2
//
3
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
4
// This file is free software, distributed under the MIT License.
5
//
6
// This file contains stream iterators that read and write UTF-8 encoded
7
// characters. The encoding is defined as follows:
8
//
9
// U-00000000 - U-0000007F: 0xxxxxxx
10
// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
11
// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
12
// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
13
// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
14
// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
15
// U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
16
 
17
#ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
18
#define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
19
 
20
#include "uiterator.h"
21
 
22
namespace ustl {
23
 
24
//----------------------------------------------------------------------
25
 
26
typedef uint8_t utf8subchar_t;  ///< Type for the encoding subcharacters.
27
 
28
//----------------------------------------------------------------------
29
 
30
inline size_t Utf8Bytes (wchar_t v) __attribute__((const));
31
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last) __attribute__((pure));
32
inline size_t Utf8SequenceBytes (wchar_t c) __attribute__((const));
33
 
34
//----------------------------------------------------------------------
35
 
36
/// Returns the number of bytes required to UTF-8 encode \p v.
///
/// \p v is interpreted as an unsigned 32-bit value and matched against the
/// ranges listed at the top of this file, so the result is between 1 and 7.
/// A single portable table lookup is used on every architecture.
inline size_t Utf8Bytes (wchar_t v)
{
    // c_Bounds[k] is the largest value that fits in a (k + 1)-byte sequence.
    static const uint32_t c_Bounds[7] = { 0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
    const uint32_t code = static_cast<uint32_t>(v);
    size_t n = 0;
    // The last bound is 0xFFFFFFFF, which is never less than code, so the
    // index cannot run past the end of the table.
    while (c_Bounds[n] < code)
        ++n;
    return (n + 1);
}
53
 
54
/// Measures the size of a wchar_t array in UTF-8 encoding.
55
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)
56
{
57
    size_t bc = 0;
58
    for (; first < last; ++first)
59
        bc += Utf8Bytes(*first);
60
    return (bc);
61
}
62
 
63
/// Returns the number of bytes in a UTF-8 sequence that starts with \p c.
///
/// The length is the count of consecutive 1 bits starting at bit 7 of \p c:
///   0         - single byte character.
///   1         - a continuation byte, i.e. the middle of a character; it is
///               reported as a one byte sequence so that a reader keeps
///               stepping until it reaches the next character start.
///   2 or more - multibyte character of that many bytes.
/// No errors are reported. \p c is a wchar_t only to keep it in a full
/// register; bits above bit 7 are never examined.
inline size_t Utf8SequenceBytes (wchar_t c)
{
    size_t nLeading = 0;
    wchar_t probe = 0x80;
    while (c & probe) {
        probe >>= 1;
        ++nLeading;
    }
    // A sequence is always at least 1 byte long.
    if (!nLeading)
        return (1);
    return (nLeading);
}
79
 
80
//----------------------------------------------------------------------
81
 
82
/// \class utf8in_iterator utf8.h ustl.h
83
/// \ingroup IteratorAdaptors
84
///
85
/// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
86
///
87
/// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
88
/// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
89
/// There is no error handling; if the reading frame slips you'll get extra
90
/// characters, one for every misaligned byte. Although it is possible to skip
91
/// to the start of the next character, that would result in omitting the
92
/// misformatted character and the one after it, making it very difficult to
93
/// detect by the user. It is better to write some strange characters and let
94
/// the user know his file is corrupted. Another problem is overflow on bad
95
/// encodings (like a 0xFF on the end of a string). This is checked through
96
/// the end-of-string nul character, which will always be there as long as
97
/// you are using the string class.
98
///
99
template <typename Iterator, typename WChar = wchar_t>
100
class utf8in_iterator {
101
public:
102
    typedef typename iterator_traits<Iterator>::value_type      value_type;
103
    typedef typename iterator_traits<Iterator>::difference_type difference_type;
104
    typedef typename iterator_traits<Iterator>::pointer         pointer;
105
    typedef typename iterator_traits<Iterator>::reference       reference;
106
public:
107
    explicit                    utf8in_iterator (const Iterator& is)            : m_i (is), m_v (0) { Read(); }
108
                                utf8in_iterator (const utf8in_iterator& i)      : m_i (i.m_i), m_v (i.m_v) {}
109
    inline const utf8in_iterator& operator= (const utf8in_iterator& i)          { m_i = i.m_i; m_v = i.m_v; return (*this); }
110
    inline Iterator             base (void) const       { return (m_i - (Utf8Bytes(m_v) - 1)); }
111
    /// Reads and returns the next value.
112
    inline WChar                operator* (void) const  { return (m_v); }
113
    inline utf8in_iterator&     operator++ (void)       { ++m_i; Read(); return (*this); }
114
    inline utf8in_iterator      operator++ (int)        { utf8in_iterator old (*this); operator++(); return (old); }
115
    inline utf8in_iterator&     operator+= (uoff_t n)   { while (n--) operator++(); return (*this); }
116
    inline utf8in_iterator      operator+ (uoff_t n)    { utf8in_iterator v (*this); return (v += n); }
117
    inline bool                 operator== (const utf8in_iterator& i) const     { return (m_i == i.m_i); }
118
    inline bool                 operator< (const utf8in_iterator& i) const      { return (m_i < i.m_i); }
119
    difference_type             operator- (const utf8in_iterator& i) const;
120
private:
121
    void                        Read (void);
122
private:
123
    Iterator                    m_i;
124
    WChar                       m_v;
125
};
126
 
127
/// Steps to the next character and updates current returnable value.
128
template <typename Iterator, typename WChar>
129
void utf8in_iterator<Iterator,WChar>::Read (void)
130
{
131
    const utf8subchar_t c = *m_i;
132
    size_t nBytes = Utf8SequenceBytes (c);
133
    m_v = c & (0xFF >> nBytes); // First byte contains bits after the header.
134
    while (--nBytes && *++m_i)  // Each subsequent byte has 6 bits.
135
        m_v = (m_v << 6) | (*m_i & 0x3F);
136
}
137
 
138
/// Returns the distance in characters (as opposed to the distance in bytes).
139
template <typename Iterator, typename WChar>
140
typename utf8in_iterator<Iterator,WChar>::difference_type
141
utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
142
{
143
    difference_type dist = 0;
144
    for (Iterator first (last.m_i); first < m_i; ++dist)
145
        first = advance (first, Utf8SequenceBytes (*first));
146
    return (dist);
147
}
148
 
149
//----------------------------------------------------------------------
150
 
151
/// \class utf8out_iterator utf8.h ustl.h
152
/// \ingroup IteratorAdaptors
153
///
154
/// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
155
///
156
template <typename Iterator, typename WChar = wchar_t>
157
class utf8out_iterator {
158
public:
159
    typedef typename iterator_traits<Iterator>::value_type      value_type;
160
    typedef typename iterator_traits<Iterator>::difference_type difference_type;
161
    typedef typename iterator_traits<Iterator>::pointer         pointer;
162
    typedef typename iterator_traits<Iterator>::reference       reference;
163
public:
164
    explicit                    utf8out_iterator (const Iterator& os) : m_i (os) {}
165
                                utf8out_iterator (const utf8out_iterator& i) : m_i (i.m_i) {}
166
    inline const Iterator&      base (void) const { return (m_i); }
167
    /// Writes \p v into the stream.
168
    utf8out_iterator&           operator= (WChar v);
169
    inline utf8out_iterator&    operator* (void) { return (*this); }
170
    inline utf8out_iterator&    operator++ (void) { return (*this); }
171
    inline utf8out_iterator     operator++ (int) { return (*this); }
172
    inline bool                 operator== (const utf8out_iterator& i) const { return (m_i == i.m_i); }
173
    inline bool                 operator< (const utf8out_iterator& i) const { return (m_i < i.m_i); }
174
private:
175
    Iterator                    m_i;
176
};
177
 
178
/// Writes \p v into the stream.
179
template <typename Iterator, typename WChar>
180
utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
181
{
182
    const size_t nBytes = Utf8Bytes (v);
183
    if (nBytes > 1) {
184
        // Write the bits 6 bits at a time, except for the first one,
185
        // which may be less than 6 bits.
186
        register wchar_t shift = nBytes * 6;
187
        *m_i++ = ((v >> (shift -= 6)) & 0x3F) | (0xFF << (8 - nBytes));
188
        while (shift)
189
            *m_i++ = ((v >> (shift -= 6)) & 0x3F) | 0x80;
190
    } else      // If only one byte, there is no header.
191
        *m_i++ = v;
192
    return (*this);
193
}
194
 
195
//----------------------------------------------------------------------
196
 
197
/// Returns a UTF-8 adaptor writing to \p i. Useful in conjuction with back_insert_iterator.
198
template <typename Iterator>
199
inline utf8out_iterator<Iterator> utf8out (Iterator i)
200
{
201
    return (utf8out_iterator<Iterator> (i));
202
}
203
 
204
/// Returns a UTF-8 adaptor reading from \p i.
205
template <typename Iterator>
206
inline utf8in_iterator<Iterator> utf8in (Iterator i)
207
{
208
    return (utf8in_iterator<Iterator> (i));
209
}
210
 
211
//----------------------------------------------------------------------
212
 
213
} // namespace ustl
214
 
215
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.