| 1 | 747 | jeremybenn | // Copyright 2009 The Go Authors. All rights reserved.
 | 
      
         | 2 |  |  | // Use of this source code is governed by a BSD-style
 | 
      
         | 3 |  |  | // license that can be found in the LICENSE file.
 | 
      
         | 4 |  |  |  
 | 
      
         | 5 |  |  | // Package utf8string provides an efficient way to index strings by rune rather than by byte.
 | 
      
         | 6 |  |  | package utf8string
 | 
      
         | 7 |  |  |  
 | 
      
         | 8 |  |  | import (
 | 
      
         | 9 |  |  |         "errors"
 | 
      
         | 10 |  |  |         "unicode/utf8"
 | 
      
         | 11 |  |  | )
 | 
      
         | 12 |  |  |  
 | 
      
         | 13 |  |  | // String wraps a regular string with a small structure that provides more
 | 
      
         | 14 |  |  | // efficient indexing by code point index, as opposed to byte index.
 | 
      
         | 15 |  |  | // Scanning incrementally forwards or backwards is O(1) per index operation
 | 
      
         | 16 |  |  | // (although not as fast a range clause going forwards).  Random access is
 | 
      
         | 17 |  |  | // O(N) in the length of the string, but the overhead is less than always
 | 
      
         | 18 |  |  | // scanning from the beginning.
 | 
      
         | 19 |  |  | // If the string is ASCII, random access is O(1).
 | 
      
         | 20 |  |  | // Unlike the built-in string type, String has internal mutable state and
 | 
      
         | 21 |  |  | // is not thread-safe.
 | 
      
         | 22 |  |  | type String struct {
 | 
      
         | 23 |  |  |         str      string
 | 
      
         | 24 |  |  |         numRunes int
 | 
      
         | 25 |  |  |         // If width > 0, the rune at runePos starts at bytePos and has the specified width.
 | 
      
         | 26 |  |  |         width    int
 | 
      
         | 27 |  |  |         bytePos  int
 | 
      
         | 28 |  |  |         runePos  int
 | 
      
         | 29 |  |  |         nonASCII int // byte index of the first non-ASCII rune.
 | 
      
         | 30 |  |  | }
 | 
      
         | 31 |  |  |  
 | 
      
         | 32 |  |  | // NewString returns a new UTF-8 string with the provided contents.
 | 
      
         | 33 |  |  | func NewString(contents string) *String {
 | 
      
         | 34 |  |  |         return new(String).Init(contents)
 | 
      
         | 35 |  |  | }
 | 
      
         | 36 |  |  |  
 | 
      
         | 37 |  |  | // Init initializes an existing String to hold the provided contents.
 | 
      
         | 38 |  |  | // It returns a pointer to the initialized String.
 | 
      
         | 39 |  |  | func (s *String) Init(contents string) *String {
 | 
      
         | 40 |  |  |         s.str = contents
 | 
      
         | 41 |  |  |         s.bytePos = 0
 | 
      
         | 42 |  |  |         s.runePos = 0
 | 
      
         | 43 |  |  |         for i := 0; i < len(contents); i++ {
 | 
      
         | 44 |  |  |                 if contents[i] >= utf8.RuneSelf {
 | 
      
         | 45 |  |  |                         // Not ASCII.
 | 
      
         | 46 |  |  |                         s.numRunes = utf8.RuneCountInString(contents)
 | 
      
         | 47 |  |  |                         _, s.width = utf8.DecodeRuneInString(contents)
 | 
      
         | 48 |  |  |                         s.nonASCII = i
 | 
      
         | 49 |  |  |                         return s
 | 
      
         | 50 |  |  |                 }
 | 
      
         | 51 |  |  |         }
 | 
      
         | 52 |  |  |         // ASCII is simple.  Also, the empty string is ASCII.
 | 
      
         | 53 |  |  |         s.numRunes = len(contents)
 | 
      
         | 54 |  |  |         s.width = 0
 | 
      
         | 55 |  |  |         s.nonASCII = len(contents)
 | 
      
         | 56 |  |  |         return s
 | 
      
         | 57 |  |  | }
 | 
      
         | 58 |  |  |  
 | 
      
         | 59 |  |  | // String returns the contents of the String.  This method also means the
 | 
      
         | 60 |  |  | // String is directly printable by fmt.Print.
 | 
      
         | 61 |  |  | func (s *String) String() string {
 | 
      
         | 62 |  |  |         return s.str
 | 
      
         | 63 |  |  | }
 | 
      
         | 64 |  |  |  
 | 
      
         | 65 |  |  | // RuneCount returns the number of runes (Unicode code points) in the String.
 | 
      
         | 66 |  |  | func (s *String) RuneCount() int {
 | 
      
         | 67 |  |  |         return s.numRunes
 | 
      
         | 68 |  |  | }
 | 
      
         | 69 |  |  |  
 | 
      
         | 70 |  |  | // IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
 | 
      
         | 71 |  |  | func (s *String) IsASCII() bool {
 | 
      
         | 72 |  |  |         return s.width == 0
 | 
      
         | 73 |  |  | }
 | 
      
         | 74 |  |  |  
 | 
      
         | 75 |  |  | // Slice returns the string sliced at rune positions [i:j].
 | 
      
         | 76 |  |  | func (s *String) Slice(i, j int) string {
 | 
      
         | 77 |  |  |         // ASCII is easy.  Let the compiler catch the indexing error if there is one.
 | 
      
         | 78 |  |  |         if j < s.nonASCII {
 | 
      
         | 79 |  |  |                 return s.str[i:j]
 | 
      
         | 80 |  |  |         }
 | 
      
         | 81 |  |  |         if i < 0 || j > s.numRunes || i > j {
 | 
      
         | 82 |  |  |                 panic(sliceOutOfRange)
 | 
      
         | 83 |  |  |         }
 | 
      
         | 84 |  |  |         if i == j {
 | 
      
         | 85 |  |  |                 return ""
 | 
      
         | 86 |  |  |         }
 | 
      
         | 87 |  |  |         // For non-ASCII, after At(i), bytePos is always the position of the indexed character.
 | 
      
         | 88 |  |  |         var low, high int
 | 
      
         | 89 |  |  |         switch {
 | 
      
         | 90 |  |  |         case i < s.nonASCII:
 | 
      
         | 91 |  |  |                 low = i
 | 
      
         | 92 |  |  |         case i == s.numRunes:
 | 
      
         | 93 |  |  |                 low = len(s.str)
 | 
      
         | 94 |  |  |         default:
 | 
      
         | 95 |  |  |                 s.At(i)
 | 
      
         | 96 |  |  |                 low = s.bytePos
 | 
      
         | 97 |  |  |         }
 | 
      
         | 98 |  |  |         switch {
 | 
      
         | 99 |  |  |         case j == s.numRunes:
 | 
      
         | 100 |  |  |                 high = len(s.str)
 | 
      
         | 101 |  |  |         default:
 | 
      
         | 102 |  |  |                 s.At(j)
 | 
      
         | 103 |  |  |                 high = s.bytePos
 | 
      
         | 104 |  |  |         }
 | 
      
         | 105 |  |  |         return s.str[low:high]
 | 
      
         | 106 |  |  | }
 | 
      
         | 107 |  |  |  
 | 
      
         | 108 |  |  | // At returns the rune with index i in the String.  The sequence of runes is the same
 | 
      
         | 109 |  |  | // as iterating over the contents with a "for range" clause.
 | 
      
         | 110 |  |  | func (s *String) At(i int) rune {
 | 
      
         | 111 |  |  |         // ASCII is easy.  Let the compiler catch the indexing error if there is one.
 | 
      
         | 112 |  |  |         if i < s.nonASCII {
 | 
      
         | 113 |  |  |                 return rune(s.str[i])
 | 
      
         | 114 |  |  |         }
 | 
      
         | 115 |  |  |  
 | 
      
         | 116 |  |  |         // Now we do need to know the index is valid.
 | 
      
         | 117 |  |  |         if i < 0 || i >= s.numRunes {
 | 
      
         | 118 |  |  |                 panic(outOfRange)
 | 
      
         | 119 |  |  |         }
 | 
      
         | 120 |  |  |  
 | 
      
         | 121 |  |  |         var r rune
 | 
      
         | 122 |  |  |  
 | 
      
         | 123 |  |  |         // Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
 | 
      
         | 124 |  |  |         // With these cases, all scans from beginning or end work in O(1) time per rune.
 | 
      
         | 125 |  |  |         switch {
 | 
      
         | 126 |  |  |  
 | 
      
         | 127 |  |  |         case i == s.runePos-1: // backing up one rune
 | 
      
         | 128 |  |  |                 r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
 | 
      
         | 129 |  |  |                 s.runePos = i
 | 
      
         | 130 |  |  |                 s.bytePos -= s.width
 | 
      
         | 131 |  |  |                 return r
 | 
      
         | 132 |  |  |         case i == s.runePos+1: // moving ahead one rune
 | 
      
         | 133 |  |  |                 s.runePos = i
 | 
      
         | 134 |  |  |                 s.bytePos += s.width
 | 
      
         | 135 |  |  |                 fallthrough
 | 
      
         | 136 |  |  |         case i == s.runePos:
 | 
      
         | 137 |  |  |                 r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
 | 
      
         | 138 |  |  |                 return r
 | 
      
         | 139 |  |  |         case i == 0: // start of string
 | 
      
         | 140 |  |  |                 r, s.width = utf8.DecodeRuneInString(s.str)
 | 
      
         | 141 |  |  |                 s.runePos = 0
 | 
      
         | 142 |  |  |                 s.bytePos = 0
 | 
      
         | 143 |  |  |                 return r
 | 
      
         | 144 |  |  |  
 | 
      
         | 145 |  |  |         case i == s.numRunes-1: // last rune in string
 | 
      
         | 146 |  |  |                 r, s.width = utf8.DecodeLastRuneInString(s.str)
 | 
      
         | 147 |  |  |                 s.runePos = i
 | 
      
         | 148 |  |  |                 s.bytePos = len(s.str) - s.width
 | 
      
         | 149 |  |  |                 return r
 | 
      
         | 150 |  |  |         }
 | 
      
         | 151 |  |  |  
 | 
      
         | 152 |  |  |         // We need to do a linear scan.  There are three places to start from:
 | 
      
         | 153 |  |  |         // 1) The beginning
 | 
      
         | 154 |  |  |         // 2) bytePos/runePos.
 | 
      
         | 155 |  |  |         // 3) The end
 | 
      
         | 156 |  |  |         // Choose the closest in rune count, scanning backwards if necessary.
 | 
      
         | 157 |  |  |         forward := true
 | 
      
         | 158 |  |  |         if i < s.runePos {
 | 
      
         | 159 |  |  |                 // Between beginning and pos.  Which is closer?
 | 
      
         | 160 |  |  |                 // Since both i and runePos are guaranteed >= nonASCII, that's the
 | 
      
         | 161 |  |  |                 // lowest location we need to start from.
 | 
      
         | 162 |  |  |                 if i < (s.runePos-s.nonASCII)/2 {
 | 
      
         | 163 |  |  |                         // Scan forward from beginning
 | 
      
         | 164 |  |  |                         s.bytePos, s.runePos = s.nonASCII, s.nonASCII
 | 
      
         | 165 |  |  |                 } else {
 | 
      
         | 166 |  |  |                         // Scan backwards from where we are
 | 
      
         | 167 |  |  |                         forward = false
 | 
      
         | 168 |  |  |                 }
 | 
      
         | 169 |  |  |         } else {
 | 
      
         | 170 |  |  |                 // Between pos and end.  Which is closer?
 | 
      
         | 171 |  |  |                 if i-s.runePos < (s.numRunes-s.runePos)/2 {
 | 
      
         | 172 |  |  |                         // Scan forward from pos
 | 
      
         | 173 |  |  |                 } else {
 | 
      
         | 174 |  |  |                         // Scan backwards from end
 | 
      
         | 175 |  |  |                         s.bytePos, s.runePos = len(s.str), s.numRunes
 | 
      
         | 176 |  |  |                         forward = false
 | 
      
         | 177 |  |  |                 }
 | 
      
         | 178 |  |  |         }
 | 
      
         | 179 |  |  |         if forward {
 | 
      
         | 180 |  |  |                 // TODO: Is it much faster to use a range loop for this scan?
 | 
      
         | 181 |  |  |                 for {
 | 
      
         | 182 |  |  |                         r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
 | 
      
         | 183 |  |  |                         if s.runePos == i {
 | 
      
         | 184 |  |  |                                 break
 | 
      
         | 185 |  |  |                         }
 | 
      
         | 186 |  |  |                         s.runePos++
 | 
      
         | 187 |  |  |                         s.bytePos += s.width
 | 
      
         | 188 |  |  |                 }
 | 
      
         | 189 |  |  |         } else {
 | 
      
         | 190 |  |  |                 for {
 | 
      
         | 191 |  |  |                         r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
 | 
      
         | 192 |  |  |                         s.runePos--
 | 
      
         | 193 |  |  |                         s.bytePos -= s.width
 | 
      
         | 194 |  |  |                         if s.runePos == i {
 | 
      
         | 195 |  |  |                                 break
 | 
      
         | 196 |  |  |                         }
 | 
      
         | 197 |  |  |                 }
 | 
      
         | 198 |  |  |         }
 | 
      
         | 199 |  |  |         return r
 | 
      
         | 200 |  |  | }
 | 
      
         | 201 |  |  |  
 | 
      
         | 202 |  |  | var outOfRange = errors.New("utf8.String: index out of range")
 | 
      
         | 203 |  |  | var sliceOutOfRange = errors.New("utf8.String: slice index out of range")
 |