OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [norm/] [normalize.go] - Rev 791

Go to most recent revision | Compare with Previous | Blame | View Log

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package norm contains types and functions for normalizing Unicode strings.
package norm

import "unicode/utf8"

// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
//   NFC   Unicode Normalization Form C
//   NFD   Unicode Normalization Form D
//   NFKC  Unicode Normalization Form KC
//   NFKD  Unicode Normalization Form KD
//
// For a Form f, this documentation uses the notation f(x) to mean
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
//   f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: http://unicode.org/reports/tr15/ and
// http://unicode.org/notes/tn5/.
type Form int

const (
        NFC Form = iota
        NFD
        NFKC
        NFKD
)

// Bytes returns f(b). May return b if f(b) = b.
func (f Form) Bytes(b []byte) []byte {
        rb := reorderBuffer{}
        rb.init(f, b)
        n := quickSpan(&rb, 0)
        if n == len(b) {
                return b
        }
        out := make([]byte, n, len(b))
        copy(out, b[0:n])
        return doAppend(&rb, out, n)
}

// String returns f(s).
func (f Form) String(s string) string {
        rb := reorderBuffer{}
        rb.initString(f, s)
        n := quickSpan(&rb, 0)
        if n == len(s) {
                return s
        }
        out := make([]byte, n, len(s))
        copy(out, s[0:n])
        return string(doAppend(&rb, out, n))
}

// IsNormal returns true if b == f(b).
func (f Form) IsNormal(b []byte) bool {
        rb := reorderBuffer{}
        rb.init(f, b)
        bp := quickSpan(&rb, 0)
        if bp == len(b) {
                return true
        }
        for bp < len(b) {
                decomposeSegment(&rb, bp)
                if rb.f.composing {
                        rb.compose()
                }
                for i := 0; i < rb.nrune; i++ {
                        info := rb.rune[i]
                        if bp+int(info.size) > len(b) {
                                return false
                        }
                        p := info.pos
                        pe := p + info.size
                        for ; p < pe; p++ {
                                if b[bp] != rb.byte[p] {
                                        return false
                                }
                                bp++
                        }
                }
                rb.reset()
                bp = quickSpan(&rb, bp)
        }
        return true
}

// IsNormalString returns true if s == f(s).
func (f Form) IsNormalString(s string) bool {
        rb := reorderBuffer{}
        rb.initString(f, s)
        bp := quickSpan(&rb, 0)
        if bp == len(s) {
                return true
        }
        for bp < len(s) {
                decomposeSegment(&rb, bp)
                if rb.f.composing {
                        rb.compose()
                }
                for i := 0; i < rb.nrune; i++ {
                        info := rb.rune[i]
                        if bp+int(info.size) > len(s) {
                                return false
                        }
                        p := info.pos
                        pe := p + info.size
                        for ; p < pe; p++ {
                                if s[bp] != rb.byte[p] {
                                        return false
                                }
                                bp++
                        }
                }
                rb.reset()
                bp = quickSpan(&rb, bp)
        }
        return true
}

// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It returns the
// patched buffer and whether there were trailing continuation bytes.
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {
        info, p := lastRuneStart(&rb.f, buf)
        if p == -1 || info.size == 0 {
                return buf, false
        }
        end := p + int(info.size)
        extra := len(buf) - end
        if extra > 0 {
                // Potentially allocating memory. However, this only
                // happens with ill-formed UTF-8.
                x := make([]byte, 0)
                x = append(x, buf[len(buf)-extra:]...)
                buf = decomposeToLastBoundary(rb, buf[:end])
                if rb.f.composing {
                        rb.compose()
                }
                buf = rb.flush(buf)
                return append(buf, x...), true
        }
        return buf, false
}

func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
        if rb.nsrc == i {
                return dst, i
        }
        end := quickSpan(rb, i)
        return rb.src.appendSlice(dst, i, end), end
}

// Append returns f(append(out, b...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
        if len(src) == 0 {
                return out
        }
        rb := reorderBuffer{}
        rb.init(f, src)
        return doAppend(&rb, out, 0)
}

func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
        src, n := rb.src, rb.nsrc
        doMerge := len(out) > 0
        if q := src.skipNonStarter(p); q > p {
                // Move leading non-starters to destination.
                out = src.appendSlice(out, p, q)
                buf, endsInError := patchTail(rb, out)
                if endsInError {
                        out = buf
                        doMerge = false // no need to merge, ends with illegal UTF-8
                } else {
                        out = decomposeToLastBoundary(rb, buf) // force decomposition
                }
                p = q
        }
        fd := &rb.f
        if doMerge {
                var info runeInfo
                if p < n {
                        info = fd.info(src, p)
                        if p == 0 && !info.boundaryBefore() {
                                out = decomposeToLastBoundary(rb, out)
                        }
                }
                if info.size == 0 || info.boundaryBefore() {
                        if fd.composing {
                                rb.compose()
                        }
                        out = rb.flush(out)
                        if info.size == 0 {
                                // Append incomplete UTF-8 encoding.
                                return src.appendSlice(out, p, n)
                        }
                }
        }
        if rb.nrune == 0 {
                out, p = appendQuick(rb, out, p)
        }
        for p < n {
                p = decomposeSegment(rb, p)
                if fd.composing {
                        rb.compose()
                }
                out = rb.flush(out)
                out, p = appendQuick(rb, out, p)
        }
        return out
}

// AppendString returns f(append(out, []byte(s))).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, src string) []byte {
        if len(src) == 0 {
                return out
        }
        rb := reorderBuffer{}
        rb.initString(f, src)
        return doAppend(&rb, out, 0)
}

// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
        rb := reorderBuffer{}
        rb.init(f, b)
        n := quickSpan(&rb, 0)
        return n
}

func quickSpan(rb *reorderBuffer, i int) int {
        var lastCC uint8
        var nc int
        lastSegStart := i
        src, n := rb.src, rb.nsrc
        for i < n {
                if j := src.skipASCII(i); i != j {
                        i = j
                        lastSegStart = i - 1
                        lastCC = 0
                        nc = 0
                        continue
                }
                info := rb.f.info(src, i)
                if info.size == 0 {
                        // include incomplete runes
                        return n
                }
                cc := info.ccc
                if rb.f.composing {
                        if !info.isYesC() {
                                break
                        }
                } else {
                        if !info.isYesD() {
                                break
                        }
                }
                if cc == 0 {
                        lastSegStart = i
                        nc = 0
                } else {
                        if nc >= maxCombiningChars {
                                lastSegStart = i
                                lastCC = cc
                                nc = 1
                        } else {
                                if lastCC > cc {
                                        return lastSegStart
                                }
                                nc++
                        }
                }
                lastCC = cc
                i += int(info.size)
        }
        if i == n {
                return n
        }
        if rb.f.composing {
                return lastSegStart
        }
        return i
}

// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int {
        rb := reorderBuffer{}
        rb.initString(f, s)
        return quickSpan(&rb, 0)
}

// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
        rb := reorderBuffer{}
        rb.init(f, b)
        return firstBoundary(&rb)
}

func firstBoundary(rb *reorderBuffer) int {
        src, nsrc := rb.src, rb.nsrc
        i := src.skipNonStarter(0)
        if i >= nsrc {
                return -1
        }
        fd := &rb.f
        info := fd.info(src, i)
        for n := 0; info.size != 0 && !info.boundaryBefore(); {
                i += int(info.size)
                if n++; n >= maxCombiningChars {
                        return i
                }
                if i >= nsrc {
                        if !info.boundaryAfter() {
                                return -1
                        }
                        return nsrc
                }
                info = fd.info(src, i)
        }
        if info.size == 0 {
                return -1
        }
        return i
}

// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
        rb := reorderBuffer{}
        rb.initString(f, s)
        return firstBoundary(&rb)
}

// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
        return lastBoundary(formTable[f], b)
}

func lastBoundary(fd *formInfo, b []byte) int {
        i := len(b)
        info, p := lastRuneStart(fd, b)
        if p == -1 {
                return -1
        }
        if info.size == 0 { // ends with incomplete rune
                if p == 0 { // starts wtih incomplete rune
                        return -1
                }
                i = p
                info, p = lastRuneStart(fd, b[:i])
                if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
                        return i
                }
        }
        if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
                return i
        }
        if info.boundaryAfter() {
                return i
        }
        i = p
        for n := 0; i >= 0 && !info.boundaryBefore(); {
                info, p = lastRuneStart(fd, b[:i])
                if n++; n >= maxCombiningChars {
                        return len(b)
                }
                if p+int(info.size) != i {
                        if p == -1 { // no boundary found
                                return -1
                        }
                        return i // boundary after an illegal UTF-8 encoding
                }
                i = p
        }
        return i
}

// decomposeSegment scans the first segment in src into rb.
// It returns the number of bytes consumed from src.
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
// when we detect a sequence of 30+ non-starter chars.
func decomposeSegment(rb *reorderBuffer, sp int) int {
        // Force one character to be consumed.
        info := rb.f.info(rb.src, sp)
        if info.size == 0 {
                return 0
        }
        for rb.insert(rb.src, sp, info) {
                sp += int(info.size)
                if sp >= rb.nsrc {
                        break
                }
                info = rb.f.info(rb.src, sp)
                bound := info.boundaryBefore()
                if bound || info.size == 0 {
                        break
                }
        }
        return sp
}

// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
        p := len(buf) - 1
        for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
        }
        if p < 0 {
                return runeInfo{}, -1
        }
        return fd.info(inputBytes(buf), p), p
}

// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. Returns the buffer minus the last segment.
func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
        fd := &rb.f
        info, i := lastRuneStart(fd, buf)
        if int(info.size) != len(buf)-i {
                // illegal trailing continuation bytes
                return buf
        }
        if info.boundaryAfter() {
                return buf
        }
        var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
        add[0] = info
        padd := 1
        n := 1
        p := len(buf) - int(info.size)
        for ; p >= 0 && !info.boundaryBefore(); p -= int(info.size) {
                info, i = lastRuneStart(fd, buf[:p])
                if int(info.size) != p-i {
                        break
                }
                // Check that decomposition doesn't result in overflow.
                if info.hasDecomposition() {
                        dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
                        for i := 0; i < len(dcomp); {
                                inf := rb.f.info(inputBytes(dcomp), i)
                                i += int(inf.size)
                                n++
                        }
                } else {
                        n++
                }
                if n > maxBackRunes {
                        break
                }
                add[padd] = info
                padd++
        }
        pp := p
        for padd--; padd >= 0; padd-- {
                info = add[padd]
                rb.insert(inputBytes(buf), pp, info)
                pp += int(info.size)
        }
        return buf[:p]
}

Go to most recent revision | Compare with Previous | Blame | View Log

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.