OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [unicode/] [utf8/] [utf8.go] - Blame information for rev 761

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2009 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
// Package utf8 implements functions and constants to support text encoded in
6
// UTF-8. This package calls a Unicode character a rune for brevity.
7
package utf8
8
 
9
import "unicode" // only needed for a couple of constants
10
 
11
// Numbers fundamental to the encoding.
const (
	// RuneError is the "error" rune, also called the "replacement character".
	RuneError = unicode.ReplacementChar
	// RuneSelf is the first value that is not encoded as itself: every
	// character below RuneSelf is represented by a single identical byte.
	RuneSelf = 0x80
	// UTFMax is the maximum number of bytes of a UTF-8 encoded Unicode character.
	UTFMax = 4
)
17
 
18
const (
	// Tag bits: the fixed high-order bit patterns that identify each kind of byte.
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000 (continuation byte)
	t2 = 0xC0 // 1100 0000 (lead byte of a 2-byte sequence)
	t3 = 0xE0 // 1110 0000 (lead byte of a 3-byte sequence)
	t4 = 0xF0 // 1111 0000 (lead byte of a 4-byte sequence)
	t5 = 0xF8 // 1111 1000 (first lead-byte value that is never valid)

	// Payload masks: the bits of each kind of byte that carry rune data.
	maskx = 0x3F // 0011 1111 (continuation byte)
	mask2 = 0x1F // 0001 1111 (2-byte lead)
	mask3 = 0x0F // 0000 1111 (3-byte lead)
	mask4 = 0x07 // 0000 0111 (4-byte lead)

	// Largest rune value representable in 1, 2, 3 and 4 bytes.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1
	rune4Max = 1<<21 - 1
)
36
 
37
func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
38
        n := len(p)
39
        if n < 1 {
40
                return RuneError, 0, true
41
        }
42
        c0 := p[0]
43
 
44
        // 1-byte, 7-bit sequence?
45
        if c0 < tx {
46
                return rune(c0), 1, false
47
        }
48
 
49
        // unexpected continuation byte?
50
        if c0 < t2 {
51
                return RuneError, 1, false
52
        }
53
 
54
        // need first continuation byte
55
        if n < 2 {
56
                return RuneError, 1, true
57
        }
58
        c1 := p[1]
59
        if c1 < tx || t2 <= c1 {
60
                return RuneError, 1, false
61
        }
62
 
63
        // 2-byte, 11-bit sequence?
64
        if c0 < t3 {
65
                r = rune(c0&mask2)<<6 | rune(c1&maskx)
66
                if r <= rune1Max {
67
                        return RuneError, 1, false
68
                }
69
                return r, 2, false
70
        }
71
 
72
        // need second continuation byte
73
        if n < 3 {
74
                return RuneError, 1, true
75
        }
76
        c2 := p[2]
77
        if c2 < tx || t2 <= c2 {
78
                return RuneError, 1, false
79
        }
80
 
81
        // 3-byte, 16-bit sequence?
82
        if c0 < t4 {
83
                r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
84
                if r <= rune2Max {
85
                        return RuneError, 1, false
86
                }
87
                return r, 3, false
88
        }
89
 
90
        // need third continuation byte
91
        if n < 4 {
92
                return RuneError, 1, true
93
        }
94
        c3 := p[3]
95
        if c3 < tx || t2 <= c3 {
96
                return RuneError, 1, false
97
        }
98
 
99
        // 4-byte, 21-bit sequence?
100
        if c0 < t5 {
101
                r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
102
                if r <= rune3Max {
103
                        return RuneError, 1, false
104
                }
105
                return r, 4, false
106
        }
107
 
108
        // error
109
        return RuneError, 1, false
110
}
111
 
112
func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
113
        n := len(s)
114
        if n < 1 {
115
                return RuneError, 0, true
116
        }
117
        c0 := s[0]
118
 
119
        // 1-byte, 7-bit sequence?
120
        if c0 < tx {
121
                return rune(c0), 1, false
122
        }
123
 
124
        // unexpected continuation byte?
125
        if c0 < t2 {
126
                return RuneError, 1, false
127
        }
128
 
129
        // need first continuation byte
130
        if n < 2 {
131
                return RuneError, 1, true
132
        }
133
        c1 := s[1]
134
        if c1 < tx || t2 <= c1 {
135
                return RuneError, 1, false
136
        }
137
 
138
        // 2-byte, 11-bit sequence?
139
        if c0 < t3 {
140
                r = rune(c0&mask2)<<6 | rune(c1&maskx)
141
                if r <= rune1Max {
142
                        return RuneError, 1, false
143
                }
144
                return r, 2, false
145
        }
146
 
147
        // need second continuation byte
148
        if n < 3 {
149
                return RuneError, 1, true
150
        }
151
        c2 := s[2]
152
        if c2 < tx || t2 <= c2 {
153
                return RuneError, 1, false
154
        }
155
 
156
        // 3-byte, 16-bit sequence?
157
        if c0 < t4 {
158
                r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
159
                if r <= rune2Max {
160
                        return RuneError, 1, false
161
                }
162
                return r, 3, false
163
        }
164
 
165
        // need third continuation byte
166
        if n < 4 {
167
                return RuneError, 1, true
168
        }
169
        c3 := s[3]
170
        if c3 < tx || t2 <= c3 {
171
                return RuneError, 1, false
172
        }
173
 
174
        // 4-byte, 21-bit sequence?
175
        if c0 < t5 {
176
                r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
177
                if r <= rune3Max {
178
                        return RuneError, 1, false
179
                }
180
                return r, 4, false
181
        }
182
 
183
        // error
184
        return RuneError, 1, false
185
}
186
 
187
// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
188
// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
189
func FullRune(p []byte) bool {
190
        _, _, short := decodeRuneInternal(p)
191
        return !short
192
}
193
 
194
// FullRuneInString is like FullRune but its input is a string.
195
func FullRuneInString(s string) bool {
196
        _, _, short := decodeRuneInStringInternal(s)
197
        return !short
198
}
199
 
200
// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
201
func DecodeRune(p []byte) (r rune, size int) {
202
        r, size, _ = decodeRuneInternal(p)
203
        return
204
}
205
 
206
// DecodeRuneInString is like DecodeRune but its input is a string.
207
func DecodeRuneInString(s string) (r rune, size int) {
208
        r, size, _ = decodeRuneInStringInternal(s)
209
        return
210
}
211
 
212
// DecodeLastRune unpacks the last UTF-8 encoding in p
213
// and returns the rune and its width in bytes.
214
func DecodeLastRune(p []byte) (r rune, size int) {
215
        end := len(p)
216
        if end == 0 {
217
                return RuneError, 0
218
        }
219
        start := end - 1
220
        r = rune(p[start])
221
        if r < RuneSelf {
222
                return r, 1
223
        }
224
        // guard against O(n^2) behavior when traversing
225
        // backwards through strings with long sequences of
226
        // invalid UTF-8.
227
        lim := end - UTFMax
228
        if lim < 0 {
229
                lim = 0
230
        }
231
        for start--; start >= lim; start-- {
232
                if RuneStart(p[start]) {
233
                        break
234
                }
235
        }
236
        if start < 0 {
237
                start = 0
238
        }
239
        r, size = DecodeRune(p[start:end])
240
        if start+size != end {
241
                return RuneError, 1
242
        }
243
        return r, size
244
}
245
 
246
// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
247
func DecodeLastRuneInString(s string) (r rune, size int) {
248
        end := len(s)
249
        if end == 0 {
250
                return RuneError, 0
251
        }
252
        start := end - 1
253
        r = rune(s[start])
254
        if r < RuneSelf {
255
                return r, 1
256
        }
257
        // guard against O(n^2) behavior when traversing
258
        // backwards through strings with long sequences of
259
        // invalid UTF-8.
260
        lim := end - UTFMax
261
        if lim < 0 {
262
                lim = 0
263
        }
264
        for start--; start >= lim; start-- {
265
                if RuneStart(s[start]) {
266
                        break
267
                }
268
        }
269
        if start < 0 {
270
                start = 0
271
        }
272
        r, size = DecodeRuneInString(s[start:end])
273
        if start+size != end {
274
                return RuneError, 1
275
        }
276
        return r, size
277
}
278
 
279
// RuneLen returns the number of bytes required to encode the rune.
280
func RuneLen(r rune) int {
281
        switch {
282
        case r <= rune1Max:
283
                return 1
284
        case r <= rune2Max:
285
                return 2
286
        case r <= rune3Max:
287
                return 3
288
        case r <= rune4Max:
289
                return 4
290
        }
291
        return -1
292
}
293
 
294
// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
295
// It returns the number of bytes written.
296
func EncodeRune(p []byte, r rune) int {
297
        // Negative values are erroneous.  Making it unsigned addresses the problem.
298
        if uint32(r) <= rune1Max {
299
                p[0] = byte(r)
300
                return 1
301
        }
302
 
303
        if uint32(r) <= rune2Max {
304
                p[0] = t2 | byte(r>>6)
305
                p[1] = tx | byte(r)&maskx
306
                return 2
307
        }
308
 
309
        if uint32(r) > unicode.MaxRune {
310
                r = RuneError
311
        }
312
 
313
        if uint32(r) <= rune3Max {
314
                p[0] = t3 | byte(r>>12)
315
                p[1] = tx | byte(r>>6)&maskx
316
                p[2] = tx | byte(r)&maskx
317
                return 3
318
        }
319
 
320
        p[0] = t4 | byte(r>>18)
321
        p[1] = tx | byte(r>>12)&maskx
322
        p[2] = tx | byte(r>>6)&maskx
323
        p[3] = tx | byte(r)&maskx
324
        return 4
325
}
326
 
327
// RuneCount returns the number of runes in p.  Erroneous and short
328
// encodings are treated as single runes of width 1 byte.
329
func RuneCount(p []byte) int {
330
        i := 0
331
        var n int
332
        for n = 0; i < len(p); n++ {
333
                if p[i] < RuneSelf {
334
                        i++
335
                } else {
336
                        _, size := DecodeRune(p[i:])
337
                        i += size
338
                }
339
        }
340
        return n
341
}
342
 
343
// RuneCountInString is the string counterpart of RuneCount: it returns the
// number of runes in s, as produced by ranging over the string.
func RuneCountInString(s string) (n int) {
	total := 0
	// The explicit blank identifier keeps this loop acceptable to compilers
	// that predate the bare "for range" form.
	for _ = range s {
		total++
	}
	n = total
	return n
}
350
 
351
// RuneStart reports whether the byte could be the first byte of
352
// an encoded rune.  Second and subsequent bytes always have the top
353
// two bits set to 10.
354
func RuneStart(b byte) bool {
	// A continuation byte has the bit pattern 10xxxxxx; every other byte
	// could begin an encoded rune.
	const (
		topTwoBits      = 0xC0
		continuationTag = 0x80
	)
	return b&topTwoBits != continuationTag
}
355
 
356
// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
357
func Valid(p []byte) bool {
358
        i := 0
359
        for i < len(p) {
360
                if p[i] < RuneSelf {
361
                        i++
362
                } else {
363
                        _, size := DecodeRune(p[i:])
364
                        if size == 1 {
365
                                // All valid runes of size of 1 (those
366
                                // below RuneSelf) were handled above.
367
                                // This must be a RuneError.
368
                                return false
369
                        }
370
                        i += size
371
                }
372
        }
373
        return true
374
}
375
 
376
// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
377
func ValidString(s string) bool {
378
        for i, r := range s {
379
                if r == RuneError {
380
                        // The RuneError value can be an error
381
                        // sentinel value (if it's size 1) or the same
382
                        // value encoded properly. Decode it to see if
383
                        // it's the 1 byte sentinel value.
384
                        _, size := DecodeRuneInString(s[i:])
385
                        if size == 1 {
386
                                return false
387
                        }
388
                }
389
        }
390
        return true
391
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.