URL
https://opencores.org/ocsvn/openrisc/openrisc/trunk
Subversion Repositories openrisc
[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [norm/] [normalize.go] - Rev 752
Go to most recent revision | Compare with Previous | Blame | View Log
// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.// Package norm contains types and functions for normalizing Unicode strings.package normimport "unicode/utf8"// A Form denotes a canonical representation of Unicode code points.// The Unicode-defined normalization and equivalence forms are://// NFC Unicode Normalization Form C// NFD Unicode Normalization Form D// NFKC Unicode Normalization Form KC// NFKD Unicode Normalization Form KD//// For a Form f, this documentation uses the notation f(x) to mean// the bytes or string x converted to the given form.// A position n in x is called a boundary if conversion to the form can// proceed independently on both sides:// f(x) == append(f(x[0:n]), f(x[n:])...)//// References: http://unicode.org/reports/tr15/ and// http://unicode.org/notes/tn5/.type Form intconst (NFC Form = iotaNFDNFKCNFKD)// Bytes returns f(b). May return b if f(b) = b.func (f Form) Bytes(b []byte) []byte {rb := reorderBuffer{}rb.init(f, b)n := quickSpan(&rb, 0)if n == len(b) {return b}out := make([]byte, n, len(b))copy(out, b[0:n])return doAppend(&rb, out, n)}// String returns f(s).func (f Form) String(s string) string {rb := reorderBuffer{}rb.initString(f, s)n := quickSpan(&rb, 0)if n == len(s) {return s}out := make([]byte, n, len(s))copy(out, s[0:n])return string(doAppend(&rb, out, n))}// IsNormal returns true if b == f(b).func (f Form) IsNormal(b []byte) bool {rb := reorderBuffer{}rb.init(f, b)bp := quickSpan(&rb, 0)if bp == len(b) {return true}for bp < len(b) {decomposeSegment(&rb, bp)if rb.f.composing {rb.compose()}for i := 0; i < rb.nrune; i++ {info := rb.rune[i]if bp+int(info.size) > len(b) {return false}p := info.pospe := p + info.sizefor ; p < pe; p++ {if b[bp] != rb.byte[p] {return false}bp++}}rb.reset()bp = quickSpan(&rb, bp)}return true}// IsNormalString returns true if s == f(s).func (f Form) IsNormalString(s string) bool {rb := reorderBuffer{}rb.initString(f, s)bp := quickSpan(&rb, 0)if bp == len(s) {return true}for bp < len(s) {decomposeSegment(&rb, bp)if rb.f.composing {rb.compose()}for i := 0; i < rb.nrune; i++ {info := rb.rune[i]if bp+int(info.size) > len(s) {return false}p := info.pospe := p + info.sizefor ; p < pe; p++ {if s[bp] != rb.byte[p] {return false}bp++}}rb.reset()bp = quickSpan(&rb, bp)}return true}// patchTail fixes a case where a rune may be incorrectly normalized// if it is followed by illegal continuation bytes. It returns the// patched buffer and whether there were trailing continuation bytes.func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {info, p := lastRuneStart(&rb.f, buf)if p == -1 || info.size == 0 {return buf, false}end := p + int(info.size)extra := len(buf) - endif extra > 0 {// Potentially allocating memory. However, this only// happens with ill-formed UTF-8.x := make([]byte, 0)x = append(x, buf[len(buf)-extra:]...)buf = decomposeToLastBoundary(rb, buf[:end])if rb.f.composing {rb.compose()}buf = rb.flush(buf)return append(buf, x...), true}return buf, false}func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {if rb.nsrc == i {return dst, i}end := quickSpan(rb, i)return rb.src.appendSlice(dst, i, end), end}// Append returns f(append(out, b...)).// The buffer out must be nil, empty, or equal to f(out).func (f Form) Append(out []byte, src ...byte) []byte {if len(src) == 0 {return out}rb := reorderBuffer{}rb.init(f, src)return doAppend(&rb, out, 0)}func doAppend(rb *reorderBuffer, out []byte, p int) []byte {src, n := rb.src, rb.nsrcdoMerge := len(out) > 0if q := src.skipNonStarter(p); q > p {// Move leading non-starters to destination.out = src.appendSlice(out, p, q)buf, endsInError := patchTail(rb, out)if endsInError {out = bufdoMerge = false // no need to merge, ends with illegal UTF-8} else {out = decomposeToLastBoundary(rb, buf) // force decomposition}p = q}fd := &rb.fif doMerge {var info runeInfoif p < n {info = fd.info(src, p)if p == 0 && !info.boundaryBefore() {out = decomposeToLastBoundary(rb, out)}}if info.size == 0 || info.boundaryBefore() {if fd.composing {rb.compose()}out = rb.flush(out)if info.size == 0 {// Append incomplete UTF-8 encoding.return src.appendSlice(out, p, n)}}}if rb.nrune == 0 {out, p = appendQuick(rb, out, p)}for p < n {p = decomposeSegment(rb, p)if fd.composing {rb.compose()}out = rb.flush(out)out, p = appendQuick(rb, out, p)}return out}// AppendString returns f(append(out, []byte(s))).// The buffer out must be nil, empty, or equal to f(out).func (f Form) AppendString(out []byte, src string) []byte {if len(src) == 0 {return out}rb := reorderBuffer{}rb.initString(f, src)return doAppend(&rb, out, 0)}// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).// It is not guaranteed to return the largest such n.func (f Form) QuickSpan(b []byte) int {rb := reorderBuffer{}rb.init(f, b)n := quickSpan(&rb, 0)return n}func quickSpan(rb *reorderBuffer, i int) int {var lastCC uint8var nc intlastSegStart := isrc, n := rb.src, rb.nsrcfor i < n {if j := src.skipASCII(i); i != j {i = jlastSegStart = i - 1lastCC = 0nc = 0continue}info := rb.f.info(src, i)if info.size == 0 {// include incomplete runesreturn n}cc := info.cccif rb.f.composing {if !info.isYesC() {break}} else {if !info.isYesD() {break}}if cc == 0 {lastSegStart = inc = 0} else {if nc >= maxCombiningChars {lastSegStart = ilastCC = ccnc = 1} else {if lastCC > cc {return lastSegStart}nc++}}lastCC = cci += int(info.size)}if i == n {return n}if rb.f.composing {return lastSegStart}return i}// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).// It is not guaranteed to return the largest such n.func (f Form) QuickSpanString(s string) int {rb := reorderBuffer{}rb.initString(f, s)return quickSpan(&rb, 0)}// FirstBoundary returns the position i of the first boundary in b// or -1 if b contains no boundary.func (f Form) FirstBoundary(b []byte) int {rb := reorderBuffer{}rb.init(f, b)return firstBoundary(&rb)}func firstBoundary(rb *reorderBuffer) int {src, nsrc := rb.src, rb.nsrci := src.skipNonStarter(0)if i >= nsrc {return -1}fd := &rb.finfo := fd.info(src, i)for n := 0; info.size != 0 && !info.boundaryBefore(); {i += int(info.size)if n++; n >= maxCombiningChars {return i}if i >= nsrc {if !info.boundaryAfter() {return -1}return nsrc}info = fd.info(src, i)}if info.size == 0 {return -1}return i}// FirstBoundaryInString returns the position i of the first boundary in s// or -1 if s contains no boundary.func (f Form) FirstBoundaryInString(s string) int {rb := reorderBuffer{}rb.initString(f, s)return firstBoundary(&rb)}// LastBoundary returns the position i of the last boundary in b// or -1 if b contains no boundary.func (f Form) LastBoundary(b []byte) int {return lastBoundary(formTable[f], b)}func lastBoundary(fd *formInfo, b []byte) int {i := len(b)info, p := lastRuneStart(fd, b)if p == -1 {return -1}if info.size == 0 { // ends with incomplete runeif p == 0 { // starts wtih incomplete runereturn -1}i = pinfo, p = lastRuneStart(fd, b[:i])if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starterreturn i}}if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8return i}if info.boundaryAfter() {return i}i = pfor n := 0; i >= 0 && !info.boundaryBefore(); {info, p = lastRuneStart(fd, b[:i])if n++; n >= maxCombiningChars {return len(b)}if p+int(info.size) != i {if p == -1 { // no boundary foundreturn -1}return i // boundary after an illegal UTF-8 encoding}i = p}return i}// decomposeSegment scans the first segment in src into rb.// It returns the number of bytes consumed from src.// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)// when we detect a sequence of 30+ non-starter chars.func decomposeSegment(rb *reorderBuffer, sp int) int {// Force one character to be consumed.info := rb.f.info(rb.src, sp)if info.size == 0 {return 0}for rb.insert(rb.src, sp, info) {sp += int(info.size)if sp >= rb.nsrc {break}info = rb.f.info(rb.src, sp)bound := info.boundaryBefore()if bound || info.size == 0 {break}}return sp}// lastRuneStart returns the runeInfo and position of the last// rune in buf or the zero runeInfo and -1 if no rune was found.func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {p := len(buf) - 1for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {}if p < 0 {return runeInfo{}, -1}return fd.info(inputBytes(buf), p), p}// decomposeToLastBoundary finds an open segment at the end of the buffer// and scans it into rb. Returns the buffer minus the last segment.func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {fd := &rb.finfo, i := lastRuneStart(fd, buf)if int(info.size) != len(buf)-i {// illegal trailing continuation bytesreturn buf}if info.boundaryAfter() {return buf}var add [maxBackRunes]runeInfo // stores runeInfo in reverse orderadd[0] = infopadd := 1n := 1p := len(buf) - int(info.size)for ; p >= 0 && !info.boundaryBefore(); p -= int(info.size) {info, i = lastRuneStart(fd, buf[:p])if int(info.size) != p-i {break}// Check that decomposition doesn't result in overflow.if info.hasDecomposition() {dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))for i := 0; i < len(dcomp); {inf := rb.f.info(inputBytes(dcomp), i)i += int(inf.size)n++}} else {n++}if n > maxBackRunes {break}add[padd] = infopadd++}pp := pfor padd--; padd >= 0; padd-- {info = add[padd]rb.insert(inputBytes(buf), pp, info)pp += int(info.size)}return buf[:p]}
Go to most recent revision | Compare with Previous | Blame | View Log
