OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [norm/] [composition.go] - Blame information for rev 747

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2011 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
package norm
6
 
7
import "unicode/utf8"
8
 
9
const (
10
        maxCombiningChars = 30
11
        maxBufferSize     = maxCombiningChars + 2 // +1 to hold starter +1 to hold CGJ
12
        maxBackRunes      = maxCombiningChars - 1
13
        maxNFCExpansion   = 3  // NFC(0x1D160)
14
        maxNFKCExpansion  = 18 // NFKC(0xFDFA)
15
 
16
        maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
17
)
18
 
19
// reorderBuffer is used to normalize a single segment.  Characters inserted with
20
// insert are decomposed and reordered based on CCC. The compose method can
21
// be used to recombine characters.  Note that the byte buffer does not hold
22
// the UTF-8 characters in order.  Only the rune array is maintained in sorted
23
// order. flush writes the resulting segment to a byte array.
24
type reorderBuffer struct {
25
        rune  [maxBufferSize]runeInfo // Per character info.
26
        byte  [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
27
        nrune int                     // Number of runeInfos.
28
        nbyte uint8                   // Number or bytes.
29
        f     formInfo
30
 
31
        src       input
32
        nsrc      int
33
        srcBytes  inputBytes
34
        srcString inputString
35
        tmpBytes  inputBytes
36
}
37
 
38
func (rb *reorderBuffer) init(f Form, src []byte) {
39
        rb.f = *formTable[f]
40
        rb.srcBytes = inputBytes(src)
41
        rb.src = &rb.srcBytes
42
        rb.nsrc = len(src)
43
}
44
 
45
func (rb *reorderBuffer) initString(f Form, src string) {
46
        rb.f = *formTable[f]
47
        rb.srcString = inputString(src)
48
        rb.src = &rb.srcString
49
        rb.nsrc = len(src)
50
}
51
 
52
// reset discards all characters from the buffer.
53
func (rb *reorderBuffer) reset() {
54
        rb.nrune = 0
55
        rb.nbyte = 0
56
}
57
 
58
// flush appends the normalized segment to out and resets rb.
59
func (rb *reorderBuffer) flush(out []byte) []byte {
60
        for i := 0; i < rb.nrune; i++ {
61
                start := rb.rune[i].pos
62
                end := start + rb.rune[i].size
63
                out = append(out, rb.byte[start:end]...)
64
        }
65
        rb.reset()
66
        return out
67
}
68
 
69
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
70
// It returns false if the buffer is not large enough to hold the rune.
71
// It is used internally by insert and insertString only.
72
func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
73
        n := rb.nrune
74
        if n >= maxCombiningChars+1 {
75
                return false
76
        }
77
        b := rb.rune[:]
78
        cc := info.ccc
79
        if cc > 0 {
80
                // Find insertion position + move elements to make room.
81
                for ; n > 0; n-- {
82
                        if b[n-1].ccc <= cc {
83
                                break
84
                        }
85
                        b[n] = b[n-1]
86
                }
87
        }
88
        rb.nrune += 1
89
        pos := uint8(rb.nbyte)
90
        rb.nbyte += utf8.UTFMax
91
        info.pos = pos
92
        b[n] = info
93
        return true
94
}
95
 
96
// insert inserts the given rune in the buffer ordered by CCC.
97
// It returns true if the buffer was large enough to hold the decomposed rune.
98
func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
99
        if info.size == 3 {
100
                if rune := src.hangul(i); rune != 0 {
101
                        return rb.decomposeHangul(rune)
102
                }
103
        }
104
        if info.hasDecomposition() {
105
                dcomp := rb.f.decompose(src, i)
106
                rb.tmpBytes = inputBytes(dcomp)
107
                for i := 0; i < len(dcomp); {
108
                        info = rb.f.info(&rb.tmpBytes, i)
109
                        pos := rb.nbyte
110
                        if !rb.insertOrdered(info) {
111
                                return false
112
                        }
113
                        end := i + int(info.size)
114
                        copy(rb.byte[pos:], dcomp[i:end])
115
                        i = end
116
                }
117
        } else {
118
                // insertOrder changes nbyte
119
                pos := rb.nbyte
120
                if !rb.insertOrdered(info) {
121
                        return false
122
                }
123
                src.copySlice(rb.byte[pos:], i, i+int(info.size))
124
        }
125
        return true
126
}
127
 
128
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
129
func (rb *reorderBuffer) appendRune(r rune) {
130
        bn := rb.nbyte
131
        sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
132
        rb.nbyte += utf8.UTFMax
133
        rb.rune[rb.nrune] = runeInfo{pos: bn, size: uint8(sz)}
134
        rb.nrune++
135
}
136
 
137
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
138
func (rb *reorderBuffer) assignRune(pos int, r rune) {
139
        bn := rb.rune[pos].pos
140
        sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
141
        rb.rune[pos] = runeInfo{pos: bn, size: uint8(sz)}
142
}
143
 
144
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
145
func (rb *reorderBuffer) runeAt(n int) rune {
146
        inf := rb.rune[n]
147
        r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
148
        return r
149
}
150
 
151
// bytesAt returns the UTF-8 encoding of the rune at position n.
152
// It is used for Hangul and recomposition.
153
func (rb *reorderBuffer) bytesAt(n int) []byte {
154
        inf := rb.rune[n]
155
        return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
156
}
157
 
158
// For Hangul we combine algorithmically, instead of using tables.
159
const (
160
        hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
161
        hangulBase0 = 0xEA
162
        hangulBase1 = 0xB0
163
        hangulBase2 = 0x80
164
 
165
        hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
166
        hangulEnd0 = 0xED
167
        hangulEnd1 = 0x9E
168
        hangulEnd2 = 0xA4
169
 
170
        jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
171
        jamoLBase0 = 0xE1
172
        jamoLBase1 = 0x84
173
        jamoLEnd   = 0x1113
174
        jamoVBase  = 0x1161
175
        jamoVEnd   = 0x1176
176
        jamoTBase  = 0x11A7
177
        jamoTEnd   = 0x11C3
178
 
179
        jamoTCount   = 28
180
        jamoVCount   = 21
181
        jamoVTCount  = 21 * 28
182
        jamoLVTCount = 19 * 21 * 28
183
)
184
 
185
// Caller must verify that len(b) >= 3.
186
func isHangul(b []byte) bool {
187
        b0 := b[0]
188
        if b0 < hangulBase0 {
189
                return false
190
        }
191
        b1 := b[1]
192
        switch {
193
        case b0 == hangulBase0:
194
                return b1 >= hangulBase1
195
        case b0 < hangulEnd0:
196
                return true
197
        case b0 > hangulEnd0:
198
                return false
199
        case b1 < hangulEnd1:
200
                return true
201
        }
202
        return b1 == hangulEnd1 && b[2] < hangulEnd2
203
}
204
 
205
// Caller must verify that len(b) >= 3.
206
func isHangulString(b string) bool {
207
        b0 := b[0]
208
        if b0 < hangulBase0 {
209
                return false
210
        }
211
        b1 := b[1]
212
        switch {
213
        case b0 == hangulBase0:
214
                return b1 >= hangulBase1
215
        case b0 < hangulEnd0:
216
                return true
217
        case b0 > hangulEnd0:
218
                return false
219
        case b1 < hangulEnd1:
220
                return true
221
        }
222
        return b1 == hangulEnd1 && b[2] < hangulEnd2
223
}
224
 
225
// Caller must ensure len(b) >= 2.
226
func isJamoVT(b []byte) bool {
227
        // True if (rune & 0xff00) == jamoLBase
228
        return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
229
}
230
 
231
func isHangulWithoutJamoT(b []byte) bool {
232
        c, _ := utf8.DecodeRune(b)
233
        c -= hangulBase
234
        return c < jamoLVTCount && c%jamoTCount == 0
235
}
236
 
237
// decomposeHangul algorithmically decomposes a Hangul rune into
238
// its Jamo components.
239
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
240
func (rb *reorderBuffer) decomposeHangul(r rune) bool {
241
        b := rb.rune[:]
242
        n := rb.nrune
243
        if n+3 > len(b) {
244
                return false
245
        }
246
        r -= hangulBase
247
        x := r % jamoTCount
248
        r /= jamoTCount
249
        rb.appendRune(jamoLBase + r/jamoVCount)
250
        rb.appendRune(jamoVBase + r%jamoVCount)
251
        if x != 0 {
252
                rb.appendRune(jamoTBase + x)
253
        }
254
        return true
255
}
256
 
257
// combineHangul algorithmically combines Jamo character components into Hangul.
258
// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
259
func (rb *reorderBuffer) combineHangul(s, i, k int) {
260
        b := rb.rune[:]
261
        bn := rb.nrune
262
        for ; i < bn; i++ {
263
                cccB := b[k-1].ccc
264
                cccC := b[i].ccc
265
                if cccB == 0 {
266
                        s = k - 1
267
                }
268
                if s != k-1 && cccB >= cccC {
269
                        // b[i] is blocked by greater-equal cccX below it
270
                        b[k] = b[i]
271
                        k++
272
                } else {
273
                        l := rb.runeAt(s) // also used to compare to hangulBase
274
                        v := rb.runeAt(i) // also used to compare to jamoT
275
                        switch {
276
                        case jamoLBase <= l && l < jamoLEnd &&
277
                                jamoVBase <= v && v < jamoVEnd:
278
                                // 11xx plus 116x to LV
279
                                rb.assignRune(s, hangulBase+
280
                                        (l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
281
                        case hangulBase <= l && l < hangulEnd &&
282
                                jamoTBase < v && v < jamoTEnd &&
283
                                ((l-hangulBase)%jamoTCount) == 0:
284
                                // ACxx plus 11Ax to LVT
285
                                rb.assignRune(s, l+v-jamoTBase)
286
                        default:
287
                                b[k] = b[i]
288
                                k++
289
                        }
290
                }
291
        }
292
        rb.nrune = k
293
}
294
 
295
// compose recombines the runes in the buffer.
296
// It should only be used to recompose a single segment, as it will not
297
// handle alternations between Hangul and non-Hangul characters correctly.
298
func (rb *reorderBuffer) compose() {
299
        // UAX #15, section X5 , including Corrigendum #5
300
        // "In any character sequence beginning with starter S, a character C is
301
        //  blocked from S if and only if there is some character B between S
302
        //  and C, and either B is a starter or it has the same or higher
303
        //  combining class as C."
304
        bn := rb.nrune
305
        if bn == 0 {
306
                return
307
        }
308
        k := 1
309
        b := rb.rune[:]
310
        for s, i := 0, 1; i < bn; i++ {
311
                if isJamoVT(rb.bytesAt(i)) {
312
                        // Redo from start in Hangul mode. Necessary to support
313
                        // U+320E..U+321E in NFKC mode.
314
                        rb.combineHangul(s, i, k)
315
                        return
316
                }
317
                ii := b[i]
318
                // We can only use combineForward as a filter if we later
319
                // get the info for the combined character. This is more
320
                // expensive than using the filter. Using combinesBackward()
321
                // is safe.
322
                if ii.combinesBackward() {
323
                        cccB := b[k-1].ccc
324
                        cccC := ii.ccc
325
                        blocked := false // b[i] blocked by starter or greater or equal CCC?
326
                        if cccB == 0 {
327
                                s = k - 1
328
                        } else {
329
                                blocked = s != k-1 && cccB >= cccC
330
                        }
331
                        if !blocked {
332
                                combined := combine(rb.runeAt(s), rb.runeAt(i))
333
                                if combined != 0 {
334
                                        rb.assignRune(s, combined)
335
                                        continue
336
                                }
337
                        }
338
                }
339
                b[k] = b[i]
340
                k++
341
        }
342
        rb.nrune = k
343
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.