URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [utf8string/] [string.go] - Blame information for rev 747

Details | Compare with Previous | View Log


// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
 
// Package utf8string provides an efficient way to index strings by rune rather than by byte.
package utf8string
 
import (
        "errors"
        "unicode/utf8"
)
 
// String wraps a regular string with a small structure that provides more
// efficient indexing by code point index, as opposed to byte index.
// Scanning incrementally forwards or backwards is O(1) per index operation
// (although not as fast a range clause going forwards).  Random access is
// O(N) in the length of the string, but the overhead is less than always
// scanning from the beginning.
// If the string is ASCII, random access is O(1).
// Unlike the built-in string type, String has internal mutable state and
// is not thread-safe.
type String struct {
        str      string
        numRunes int
        // If width > 0, the rune at runePos starts at bytePos and has the specified width.
        width    int
        bytePos  int
        runePos  int
        nonASCII int // byte index of the first non-ASCII rune.
}
 
// NewString returns a new UTF-8 string with the provided contents.
func NewString(contents string) *String {
        return new(String).Init(contents)
}
 
// Init initializes an existing String to hold the provided contents.
// It returns a pointer to the initialized String.
func (s *String) Init(contents string) *String {
        s.str = contents
        s.bytePos = 0
        s.runePos = 0
        for i := 0; i < len(contents); i++ {
                if contents[i] >= utf8.RuneSelf {
                        // Not ASCII.
                        s.numRunes = utf8.RuneCountInString(contents)
                        _, s.width = utf8.DecodeRuneInString(contents)
                        s.nonASCII = i
                        return s
                }
        }
        // ASCII is simple.  Also, the empty string is ASCII.
        s.numRunes = len(contents)
        s.width = 0
        s.nonASCII = len(contents)
        return s
}
 
// String returns the contents of the String.  This method also means the
// String is directly printable by fmt.Print.
func (s *String) String() string {
        return s.str
}
 
// RuneCount returns the number of runes (Unicode code points) in the String.
func (s *String) RuneCount() int {
        return s.numRunes
}
 
// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
func (s *String) IsASCII() bool {
        return s.width == 0
}
 
// Slice returns the string sliced at rune positions [i:j].
func (s *String) Slice(i, j int) string {
        // ASCII is easy.  Let the compiler catch the indexing error if there is one.
        if j < s.nonASCII {
                return s.str[i:j]
        }
        if i < 0 || j > s.numRunes || i > j {
                panic(sliceOutOfRange)
        }
        if i == j {
                return ""
        }
        // For non-ASCII, after At(i), bytePos is always the position of the indexed character.
        var low, high int
        switch {
        case i < s.nonASCII:
                low = i
        case i == s.numRunes:
                low = len(s.str)
        default:
                s.At(i)
                low = s.bytePos
        }
        switch {
        case j == s.numRunes:
                high = len(s.str)
        default:
                s.At(j)
                high = s.bytePos
        }
        return s.str[low:high]
}
 
// At returns the rune with index i in the String.  The sequence of runes is the same
// as iterating over the contents with a "for range" clause.
func (s *String) At(i int) rune {
        // ASCII is easy.  Let the compiler catch the indexing error if there is one.
        if i < s.nonASCII {
                return rune(s.str[i])
        }
 
        // Now we do need to know the index is valid.
        if i < 0 || i >= s.numRunes {
                panic(outOfRange)
        }
 
        var r rune
 
        // Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
        // With these cases, all scans from beginning or end work in O(1) time per rune.
        switch {
 
        case i == s.runePos-1: // backing up one rune
                r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
                s.runePos = i
                s.bytePos -= s.width
                return r
        case i == s.runePos+1: // moving ahead one rune
                s.runePos = i
                s.bytePos += s.width
                fallthrough
        case i == s.runePos:
                r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
                return r
        case i == 0: // start of string
                r, s.width = utf8.DecodeRuneInString(s.str)
                s.runePos = 0
                s.bytePos = 0
                return r
 
        case i == s.numRunes-1: // last rune in string
                r, s.width = utf8.DecodeLastRuneInString(s.str)
                s.runePos = i
                s.bytePos = len(s.str) - s.width
                return r
        }
 
        // We need to do a linear scan.  There are three places to start from:
        // 1) The beginning
        // 2) bytePos/runePos.
        // 3) The end
        // Choose the closest in rune count, scanning backwards if necessary.
        forward := true
        if i < s.runePos {
                // Between beginning and pos.  Which is closer?
                // Since both i and runePos are guaranteed >= nonASCII, that's the
                // lowest location we need to start from.
                if i < (s.runePos-s.nonASCII)/2 {
                        // Scan forward from beginning
                        s.bytePos, s.runePos = s.nonASCII, s.nonASCII
                } else {
                        // Scan backwards from where we are
                        forward = false
                }
        } else {
                // Between pos and end.  Which is closer?
                if i-s.runePos < (s.numRunes-s.runePos)/2 {
                        // Scan forward from pos
                } else {
                        // Scan backwards from end
                        s.bytePos, s.runePos = len(s.str), s.numRunes
                        forward = false
                }
        }
        if forward {
                // TODO: Is it much faster to use a range loop for this scan?
                for {
                        r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
                        if s.runePos == i {
                                break
                        }
                        s.runePos++
                        s.bytePos += s.width
                }
        } else {
                for {
                        r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
                        s.runePos--
                        s.bytePos -= s.width
                        if s.runePos == i {
                                break
                        }
                }
        }
        return r
}
 
var outOfRange = errors.New("utf8.String: index out of range")
var sliceOutOfRange = errors.New("utf8.String: slice index out of range")

Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [utf8string/] [string.go] - Blame information for rev 747

Line No.	Rev	Author	Line
1	747	jeremybenn	`// Copyright 2009 The Go Authors. All rights reserved.`
2			`// Use of this source code is governed by a BSD-style`
3			`// license that can be found in the LICENSE file.`
4
5			`// Package utf8string provides an efficient way to index strings by rune rather than by byte.`
6			`package utf8string`
7
8			`import (`
9			`"errors"`
10			`"unicode/utf8"`
11			`)`
12
13			`// String wraps a regular string with a small structure that provides more`
14			`// efficient indexing by code point index, as opposed to byte index.`
15			`// Scanning incrementally forwards or backwards is O(1) per index operation`
16			`// (although not as fast a range clause going forwards). Random access is`
17			`// O(N) in the length of the string, but the overhead is less than always`
18			`// scanning from the beginning.`
19			`// If the string is ASCII, random access is O(1).`
20			`// Unlike the built-in string type, String has internal mutable state and`
21			`// is not thread-safe.`
22			`type String struct {`
23			`str string`
24			`numRunes int`
25			`// If width > 0, the rune at runePos starts at bytePos and has the specified width.`
26			`width int`
27			`bytePos int`
28			`runePos int`
29			`nonASCII int // byte index of the first non-ASCII rune.`
30			`}`
31
32			`// NewString returns a new UTF-8 string with the provided contents.`
33			`func NewString(contents string) *String {`
34			`return new(String).Init(contents)`
35			`}`
36
37			`// Init initializes an existing String to hold the provided contents.`
38			`// It returns a pointer to the initialized String.`
39			`func (s String) Init(contents string) String {`
40			`s.str = contents`
41			`s.bytePos = 0`
42			`s.runePos = 0`
43			`for i := 0; i < len(contents); i++ {`
44			`if contents[i] >= utf8.RuneSelf {`
45			`// Not ASCII.`
46			`s.numRunes = utf8.RuneCountInString(contents)`
47			`_, s.width = utf8.DecodeRuneInString(contents)`
48			`s.nonASCII = i`
49			`return s`
50			`}`
51			`}`
52			`// ASCII is simple. Also, the empty string is ASCII.`
53			`s.numRunes = len(contents)`
54			`s.width = 0`
55			`s.nonASCII = len(contents)`
56			`return s`
57			`}`
58
59			`// String returns the contents of the String. This method also means the`
60			`// String is directly printable by fmt.Print.`
61			`func (s *String) String() string {`
62			`return s.str`
63			`}`
64
65			`// RuneCount returns the number of runes (Unicode code points) in the String.`
66			`func (s *String) RuneCount() int {`
67			`return s.numRunes`
68			`}`
69
70			`// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.`
71			`func (s *String) IsASCII() bool {`
72			`return s.width == 0`
73			`}`
74
75			`// Slice returns the string sliced at rune positions [i:j].`
76			`func (s *String) Slice(i, j int) string {`
77			`// ASCII is easy. Let the compiler catch the indexing error if there is one.`
78			`if j < s.nonASCII {`
79			`return s.str[i:j]`
80			`}`
81			`if i < 0 \|\| j > s.numRunes \|\| i > j {`
82			`panic(sliceOutOfRange)`
83			`}`
84			`if i == j {`
85			`return ""`
86			`}`
87			`// For non-ASCII, after At(i), bytePos is always the position of the indexed character.`
88			`var low, high int`
89			`switch {`
90			`case i < s.nonASCII:`
91			`low = i`
92			`case i == s.numRunes:`
93			`low = len(s.str)`
94			`default:`
95			`s.At(i)`
96			`low = s.bytePos`
97			`}`
98			`switch {`
99			`case j == s.numRunes:`
100			`high = len(s.str)`
101			`default:`
102			`s.At(j)`
103			`high = s.bytePos`
104			`}`
105			`return s.str[low:high]`
106			`}`
107
108			`// At returns the rune with index i in the String. The sequence of runes is the same`
109			`// as iterating over the contents with a "for range" clause.`
110			`func (s *String) At(i int) rune {`
111			`// ASCII is easy. Let the compiler catch the indexing error if there is one.`
112			`if i < s.nonASCII {`
113			`return rune(s.str[i])`
114			`}`
115
116			`// Now we do need to know the index is valid.`
117			`if i < 0 \|\| i >= s.numRunes {`
118			`panic(outOfRange)`
119			`}`
120
121			`var r rune`
122
123			`// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.`
124			`// With these cases, all scans from beginning or end work in O(1) time per rune.`
125			`switch {`
126
127			`case i == s.runePos-1: // backing up one rune`
128			`r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])`
129			`s.runePos = i`
130			`s.bytePos -= s.width`
131			`return r`
132			`case i == s.runePos+1: // moving ahead one rune`
133			`s.runePos = i`
134			`s.bytePos += s.width`
135			`fallthrough`
136			`case i == s.runePos:`
137			`r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])`
138			`return r`
139			`case i == 0: // start of string`
140			`r, s.width = utf8.DecodeRuneInString(s.str)`
141			`s.runePos = 0`
142			`s.bytePos = 0`
143			`return r`
144
145			`case i == s.numRunes-1: // last rune in string`
146			`r, s.width = utf8.DecodeLastRuneInString(s.str)`
147			`s.runePos = i`
148			`s.bytePos = len(s.str) - s.width`
149			`return r`
150			`}`
151
152			`// We need to do a linear scan. There are three places to start from:`
153			`// 1) The beginning`
154			`// 2) bytePos/runePos.`
155			`// 3) The end`
156			`// Choose the closest in rune count, scanning backwards if necessary.`
157			`forward := true`
158			`if i < s.runePos {`
159			`// Between beginning and pos. Which is closer?`
160			`// Since both i and runePos are guaranteed >= nonASCII, that's the`
161			`// lowest location we need to start from.`
162			`if i < (s.runePos-s.nonASCII)/2 {`
163			`// Scan forward from beginning`
164			`s.bytePos, s.runePos = s.nonASCII, s.nonASCII`
165			`} else {`
166			`// Scan backwards from where we are`
167			`forward = false`
168			`}`
169			`} else {`
170			`// Between pos and end. Which is closer?`
171			`if i-s.runePos < (s.numRunes-s.runePos)/2 {`
172			`// Scan forward from pos`
173			`} else {`
174			`// Scan backwards from end`
175			`s.bytePos, s.runePos = len(s.str), s.numRunes`
176			`forward = false`
177			`}`
178			`}`
179			`if forward {`
180			`// TODO: Is it much faster to use a range loop for this scan?`
181			`for {`
182			`r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])`
183			`if s.runePos == i {`
184			`break`
185			`}`
186			`s.runePos++`
187			`s.bytePos += s.width`
188			`}`
189			`} else {`
190			`for {`
191			`r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])`
192			`s.runePos--`
193			`s.bytePos -= s.width`
194			`if s.runePos == i {`
195			`break`
196			`}`
197			`}`
198			`}`
199			`return r`
200			`}`
201
202			`var outOfRange = errors.New("utf8.String: index out of range")`
203			`var sliceOutOfRange = errors.New("utf8.String: slice index out of range")`