OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [norm/] [normalize.go] - Blame information for rev 791

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2011 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
// Package norm contains types and functions for normalizing Unicode strings.
6
package norm
7
 
8
import "unicode/utf8"
9
 
10
// A Form denotes a canonical representation of Unicode code points.
11
// The Unicode-defined normalization and equivalence forms are:
12
//
13
//   NFC   Unicode Normalization Form C
14
//   NFD   Unicode Normalization Form D
15
//   NFKC  Unicode Normalization Form KC
16
//   NFKD  Unicode Normalization Form KD
17
//
18
// For a Form f, this documentation uses the notation f(x) to mean
19
// the bytes or string x converted to the given form.
20
// A position n in x is called a boundary if conversion to the form can
21
// proceed independently on both sides:
22
//   f(x) == append(f(x[0:n]), f(x[n:])...)
23
//
24
// References: http://unicode.org/reports/tr15/ and
25
// http://unicode.org/notes/tn5/.
26
type Form int
27
 
28
const (
29
        NFC Form = iota
30
        NFD
31
        NFKC
32
        NFKD
33
)
34
 
35
// Bytes returns f(b). May return b if f(b) = b.
36
func (f Form) Bytes(b []byte) []byte {
37
        rb := reorderBuffer{}
38
        rb.init(f, b)
39
        n := quickSpan(&rb, 0)
40
        if n == len(b) {
41
                return b
42
        }
43
        out := make([]byte, n, len(b))
44
        copy(out, b[0:n])
45
        return doAppend(&rb, out, n)
46
}
47
 
48
// String returns f(s).
49
func (f Form) String(s string) string {
50
        rb := reorderBuffer{}
51
        rb.initString(f, s)
52
        n := quickSpan(&rb, 0)
53
        if n == len(s) {
54
                return s
55
        }
56
        out := make([]byte, n, len(s))
57
        copy(out, s[0:n])
58
        return string(doAppend(&rb, out, n))
59
}
60
 
61
// IsNormal returns true if b == f(b).
62
func (f Form) IsNormal(b []byte) bool {
63
        rb := reorderBuffer{}
64
        rb.init(f, b)
65
        bp := quickSpan(&rb, 0)
66
        if bp == len(b) {
67
                return true
68
        }
69
        for bp < len(b) {
70
                decomposeSegment(&rb, bp)
71
                if rb.f.composing {
72
                        rb.compose()
73
                }
74
                for i := 0; i < rb.nrune; i++ {
75
                        info := rb.rune[i]
76
                        if bp+int(info.size) > len(b) {
77
                                return false
78
                        }
79
                        p := info.pos
80
                        pe := p + info.size
81
                        for ; p < pe; p++ {
82
                                if b[bp] != rb.byte[p] {
83
                                        return false
84
                                }
85
                                bp++
86
                        }
87
                }
88
                rb.reset()
89
                bp = quickSpan(&rb, bp)
90
        }
91
        return true
92
}
93
 
94
// IsNormalString returns true if s == f(s).
95
func (f Form) IsNormalString(s string) bool {
96
        rb := reorderBuffer{}
97
        rb.initString(f, s)
98
        bp := quickSpan(&rb, 0)
99
        if bp == len(s) {
100
                return true
101
        }
102
        for bp < len(s) {
103
                decomposeSegment(&rb, bp)
104
                if rb.f.composing {
105
                        rb.compose()
106
                }
107
                for i := 0; i < rb.nrune; i++ {
108
                        info := rb.rune[i]
109
                        if bp+int(info.size) > len(s) {
110
                                return false
111
                        }
112
                        p := info.pos
113
                        pe := p + info.size
114
                        for ; p < pe; p++ {
115
                                if s[bp] != rb.byte[p] {
116
                                        return false
117
                                }
118
                                bp++
119
                        }
120
                }
121
                rb.reset()
122
                bp = quickSpan(&rb, bp)
123
        }
124
        return true
125
}
126
 
127
// patchTail fixes a case where a rune may be incorrectly normalized
128
// if it is followed by illegal continuation bytes. It returns the
129
// patched buffer and whether there were trailing continuation bytes.
130
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {
131
        info, p := lastRuneStart(&rb.f, buf)
132
        if p == -1 || info.size == 0 {
133
                return buf, false
134
        }
135
        end := p + int(info.size)
136
        extra := len(buf) - end
137
        if extra > 0 {
138
                // Potentially allocating memory. However, this only
139
                // happens with ill-formed UTF-8.
140
                x := make([]byte, 0)
141
                x = append(x, buf[len(buf)-extra:]...)
142
                buf = decomposeToLastBoundary(rb, buf[:end])
143
                if rb.f.composing {
144
                        rb.compose()
145
                }
146
                buf = rb.flush(buf)
147
                return append(buf, x...), true
148
        }
149
        return buf, false
150
}
151
 
152
func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
153
        if rb.nsrc == i {
154
                return dst, i
155
        }
156
        end := quickSpan(rb, i)
157
        return rb.src.appendSlice(dst, i, end), end
158
}
159
 
160
// Append returns f(append(out, b...)).
161
// The buffer out must be nil, empty, or equal to f(out).
162
func (f Form) Append(out []byte, src ...byte) []byte {
163
        if len(src) == 0 {
164
                return out
165
        }
166
        rb := reorderBuffer{}
167
        rb.init(f, src)
168
        return doAppend(&rb, out, 0)
169
}
170
 
171
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
172
        src, n := rb.src, rb.nsrc
173
        doMerge := len(out) > 0
174
        if q := src.skipNonStarter(p); q > p {
175
                // Move leading non-starters to destination.
176
                out = src.appendSlice(out, p, q)
177
                buf, endsInError := patchTail(rb, out)
178
                if endsInError {
179
                        out = buf
180
                        doMerge = false // no need to merge, ends with illegal UTF-8
181
                } else {
182
                        out = decomposeToLastBoundary(rb, buf) // force decomposition
183
                }
184
                p = q
185
        }
186
        fd := &rb.f
187
        if doMerge {
188
                var info runeInfo
189
                if p < n {
190
                        info = fd.info(src, p)
191
                        if p == 0 && !info.boundaryBefore() {
192
                                out = decomposeToLastBoundary(rb, out)
193
                        }
194
                }
195
                if info.size == 0 || info.boundaryBefore() {
196
                        if fd.composing {
197
                                rb.compose()
198
                        }
199
                        out = rb.flush(out)
200
                        if info.size == 0 {
201
                                // Append incomplete UTF-8 encoding.
202
                                return src.appendSlice(out, p, n)
203
                        }
204
                }
205
        }
206
        if rb.nrune == 0 {
207
                out, p = appendQuick(rb, out, p)
208
        }
209
        for p < n {
210
                p = decomposeSegment(rb, p)
211
                if fd.composing {
212
                        rb.compose()
213
                }
214
                out = rb.flush(out)
215
                out, p = appendQuick(rb, out, p)
216
        }
217
        return out
218
}
219
 
220
// AppendString returns f(append(out, []byte(s))).
221
// The buffer out must be nil, empty, or equal to f(out).
222
func (f Form) AppendString(out []byte, src string) []byte {
223
        if len(src) == 0 {
224
                return out
225
        }
226
        rb := reorderBuffer{}
227
        rb.initString(f, src)
228
        return doAppend(&rb, out, 0)
229
}
230
 
231
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
232
// It is not guaranteed to return the largest such n.
233
func (f Form) QuickSpan(b []byte) int {
234
        rb := reorderBuffer{}
235
        rb.init(f, b)
236
        n := quickSpan(&rb, 0)
237
        return n
238
}
239
 
240
func quickSpan(rb *reorderBuffer, i int) int {
241
        var lastCC uint8
242
        var nc int
243
        lastSegStart := i
244
        src, n := rb.src, rb.nsrc
245
        for i < n {
246
                if j := src.skipASCII(i); i != j {
247
                        i = j
248
                        lastSegStart = i - 1
249
                        lastCC = 0
250
                        nc = 0
251
                        continue
252
                }
253
                info := rb.f.info(src, i)
254
                if info.size == 0 {
255
                        // include incomplete runes
256
                        return n
257
                }
258
                cc := info.ccc
259
                if rb.f.composing {
260
                        if !info.isYesC() {
261
                                break
262
                        }
263
                } else {
264
                        if !info.isYesD() {
265
                                break
266
                        }
267
                }
268
                if cc == 0 {
269
                        lastSegStart = i
270
                        nc = 0
271
                } else {
272
                        if nc >= maxCombiningChars {
273
                                lastSegStart = i
274
                                lastCC = cc
275
                                nc = 1
276
                        } else {
277
                                if lastCC > cc {
278
                                        return lastSegStart
279
                                }
280
                                nc++
281
                        }
282
                }
283
                lastCC = cc
284
                i += int(info.size)
285
        }
286
        if i == n {
287
                return n
288
        }
289
        if rb.f.composing {
290
                return lastSegStart
291
        }
292
        return i
293
}
294
 
295
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
296
// It is not guaranteed to return the largest such n.
297
func (f Form) QuickSpanString(s string) int {
298
        rb := reorderBuffer{}
299
        rb.initString(f, s)
300
        return quickSpan(&rb, 0)
301
}
302
 
303
// FirstBoundary returns the position i of the first boundary in b
304
// or -1 if b contains no boundary.
305
func (f Form) FirstBoundary(b []byte) int {
306
        rb := reorderBuffer{}
307
        rb.init(f, b)
308
        return firstBoundary(&rb)
309
}
310
 
311
func firstBoundary(rb *reorderBuffer) int {
312
        src, nsrc := rb.src, rb.nsrc
313
        i := src.skipNonStarter(0)
314
        if i >= nsrc {
315
                return -1
316
        }
317
        fd := &rb.f
318
        info := fd.info(src, i)
319
        for n := 0; info.size != 0 && !info.boundaryBefore(); {
320
                i += int(info.size)
321
                if n++; n >= maxCombiningChars {
322
                        return i
323
                }
324
                if i >= nsrc {
325
                        if !info.boundaryAfter() {
326
                                return -1
327
                        }
328
                        return nsrc
329
                }
330
                info = fd.info(src, i)
331
        }
332
        if info.size == 0 {
333
                return -1
334
        }
335
        return i
336
}
337
 
338
// FirstBoundaryInString returns the position i of the first boundary in s
339
// or -1 if s contains no boundary.
340
func (f Form) FirstBoundaryInString(s string) int {
341
        rb := reorderBuffer{}
342
        rb.initString(f, s)
343
        return firstBoundary(&rb)
344
}
345
 
346
// LastBoundary returns the position i of the last boundary in b
347
// or -1 if b contains no boundary.
348
func (f Form) LastBoundary(b []byte) int {
349
        return lastBoundary(formTable[f], b)
350
}
351
 
352
func lastBoundary(fd *formInfo, b []byte) int {
353
        i := len(b)
354
        info, p := lastRuneStart(fd, b)
355
        if p == -1 {
356
                return -1
357
        }
358
        if info.size == 0 { // ends with incomplete rune
359
                if p == 0 { // starts wtih incomplete rune
360
                        return -1
361
                }
362
                i = p
363
                info, p = lastRuneStart(fd, b[:i])
364
                if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
365
                        return i
366
                }
367
        }
368
        if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
369
                return i
370
        }
371
        if info.boundaryAfter() {
372
                return i
373
        }
374
        i = p
375
        for n := 0; i >= 0 && !info.boundaryBefore(); {
376
                info, p = lastRuneStart(fd, b[:i])
377
                if n++; n >= maxCombiningChars {
378
                        return len(b)
379
                }
380
                if p+int(info.size) != i {
381
                        if p == -1 { // no boundary found
382
                                return -1
383
                        }
384
                        return i // boundary after an illegal UTF-8 encoding
385
                }
386
                i = p
387
        }
388
        return i
389
}
390
 
391
// decomposeSegment scans the first segment in src into rb.
392
// It returns the number of bytes consumed from src.
393
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
394
// when we detect a sequence of 30+ non-starter chars.
395
func decomposeSegment(rb *reorderBuffer, sp int) int {
396
        // Force one character to be consumed.
397
        info := rb.f.info(rb.src, sp)
398
        if info.size == 0 {
399
                return 0
400
        }
401
        for rb.insert(rb.src, sp, info) {
402
                sp += int(info.size)
403
                if sp >= rb.nsrc {
404
                        break
405
                }
406
                info = rb.f.info(rb.src, sp)
407
                bound := info.boundaryBefore()
408
                if bound || info.size == 0 {
409
                        break
410
                }
411
        }
412
        return sp
413
}
414
 
415
// lastRuneStart returns the runeInfo and position of the last
416
// rune in buf or the zero runeInfo and -1 if no rune was found.
417
func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
418
        p := len(buf) - 1
419
        for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
420
        }
421
        if p < 0 {
422
                return runeInfo{}, -1
423
        }
424
        return fd.info(inputBytes(buf), p), p
425
}
426
 
427
// decomposeToLastBoundary finds an open segment at the end of the buffer
428
// and scans it into rb. Returns the buffer minus the last segment.
429
func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
430
        fd := &rb.f
431
        info, i := lastRuneStart(fd, buf)
432
        if int(info.size) != len(buf)-i {
433
                // illegal trailing continuation bytes
434
                return buf
435
        }
436
        if info.boundaryAfter() {
437
                return buf
438
        }
439
        var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
440
        add[0] = info
441
        padd := 1
442
        n := 1
443
        p := len(buf) - int(info.size)
444
        for ; p >= 0 && !info.boundaryBefore(); p -= int(info.size) {
445
                info, i = lastRuneStart(fd, buf[:p])
446
                if int(info.size) != p-i {
447
                        break
448
                }
449
                // Check that decomposition doesn't result in overflow.
450
                if info.hasDecomposition() {
451
                        dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
452
                        for i := 0; i < len(dcomp); {
453
                                inf := rb.f.info(inputBytes(dcomp), i)
454
                                i += int(inf.size)
455
                                n++
456
                        }
457
                } else {
458
                        n++
459
                }
460
                if n > maxBackRunes {
461
                        break
462
                }
463
                add[padd] = info
464
                padd++
465
        }
466
        pp := p
467
        for padd--; padd >= 0; padd-- {
468
                info = add[padd]
469
                rb.insert(inputBytes(buf), pp, info)
470
                pp += int(info.size)
471
        }
472
        return buf[:p]
473
}

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.