| 1 |
747 |
jeremybenn |
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
// It takes an io.Reader providing the source, which then can be tokenized
// through repeated calls to the Scan function. For compatibility with
// existing tools, the NUL character is not allowed (implementation
// restriction).
//
// By default, a Scanner skips white space and Go comments and recognizes all
// literals as defined by the Go language specification. It may be
// customized to recognize only a subset of those literals and to recognize
// different white space characters.
//
// Basic usage pattern:
//
//	var s scanner.Scanner
//	s.Init(src)
//	tok := s.Scan()
//	for tok != scanner.EOF {
//		// do something with tok
//		tok = s.Scan()
//	}
//
|
| 26 |
|
|
package scanner
|
| 27 |
|
|
|
| 28 |
|
|
import (
|
| 29 |
|
|
"bytes"
|
| 30 |
|
|
"fmt"
|
| 31 |
|
|
"io"
|
| 32 |
|
|
"os"
|
| 33 |
|
|
"unicode"
|
| 34 |
|
|
"unicode/utf8"
|
| 35 |
|
|
)
|
| 36 |
|
|
|
| 37 |
|
|
// TODO(gri): Consider changing this to use the new (token) Position package.
|
| 38 |
|
|
|
| 39 |
|
|
// Position describes a location in the source being scanned.
// A Position counts as valid when its Line field is greater than zero.
type Position struct {
	Filename string // name of the file, if any
	Offset   int    // byte offset; the first byte has offset 0
	Line     int    // line number; the first line is line 1
	Column   int    // column number; the first column is 1 (counted in characters per line)
}
|
| 47 |
|
|
|
| 48 |
|
|
// IsValid returns true if the position is valid.
|
| 49 |
|
|
func (pos *Position) IsValid() bool { return pos.Line > 0 }
|
| 50 |
|
|
|
| 51 |
|
|
func (pos Position) String() string {
|
| 52 |
|
|
s := pos.Filename
|
| 53 |
|
|
if pos.IsValid() {
|
| 54 |
|
|
if s != "" {
|
| 55 |
|
|
s += ":"
|
| 56 |
|
|
}
|
| 57 |
|
|
s += fmt.Sprintf("%d:%d", pos.Line, pos.Column)
|
| 58 |
|
|
}
|
| 59 |
|
|
if s == "" {
|
| 60 |
|
|
s = "???"
|
| 61 |
|
|
}
|
| 62 |
|
|
return s
|
| 63 |
|
|
}
|
| 64 |
|
|
|
| 65 |
|
|
// Predefined mode bits to control recognition of tokens. For instance,
|
| 66 |
|
|
// to configure a Scanner such that it only recognizes (Go) identifiers,
|
| 67 |
|
|
// integers, and skips comments, set the Scanner's Mode field to:
|
| 68 |
|
|
//
|
| 69 |
|
|
// ScanIdents | ScanInts | SkipComments
|
| 70 |
|
|
//
|
| 71 |
|
|
const (
|
| 72 |
|
|
ScanIdents = 1 << -Ident
|
| 73 |
|
|
ScanInts = 1 << -Int
|
| 74 |
|
|
ScanFloats = 1 << -Float // includes Ints
|
| 75 |
|
|
ScanChars = 1 << -Char
|
| 76 |
|
|
ScanStrings = 1 << -String
|
| 77 |
|
|
ScanRawStrings = 1 << -RawString
|
| 78 |
|
|
ScanComments = 1 << -Comment
|
| 79 |
|
|
SkipComments = 1 << -skipComment // if set with ScanComments, comments become white space
|
| 80 |
|
|
GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
|
| 81 |
|
|
)
|
| 82 |
|
|
|
| 83 |
|
|
// Token values returned by Scan. They are all negative (EOF is -1, Ident is
// -2, and so on) so that they cannot collide with Unicode characters, which
// Scan returns as themselves.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment // only used to derive the SkipComments mode bit
)
|
| 95 |
|
|
|
| 96 |
|
|
var tokenString = map[rune]string{
|
| 97 |
|
|
EOF: "EOF",
|
| 98 |
|
|
Ident: "Ident",
|
| 99 |
|
|
Int: "Int",
|
| 100 |
|
|
Float: "Float",
|
| 101 |
|
|
Char: "Char",
|
| 102 |
|
|
String: "String",
|
| 103 |
|
|
RawString: "RawString",
|
| 104 |
|
|
Comment: "Comment",
|
| 105 |
|
|
}
|
| 106 |
|
|
|
| 107 |
|
|
// TokenString returns a (visible) string for a token or Unicode character.
|
| 108 |
|
|
func TokenString(tok rune) string {
|
| 109 |
|
|
if s, found := tokenString[tok]; found {
|
| 110 |
|
|
return s
|
| 111 |
|
|
}
|
| 112 |
|
|
return fmt.Sprintf("%q", string(tok))
|
| 113 |
|
|
}
|
| 114 |
|
|
|
| 115 |
|
|
// GoWhitespace is the value Init assigns to a Scanner's Whitespace field.
// It has one bit set for each of Go's white space characters:
// tab, newline, carriage return, and space.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '
|
| 118 |
|
|
|
| 119 |
|
|
// bufLen is the number of source bytes the Scanner's read buffer can hold
// (the buffer itself has one extra byte for a sentinel).
const bufLen = 1024 // at least utf8.UTFMax
|
| 120 |
|
|
|
| 121 |
|
|
// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
|
| 122 |
|
|
type Scanner struct {
|
| 123 |
|
|
// Input
|
| 124 |
|
|
src io.Reader
|
| 125 |
|
|
|
| 126 |
|
|
// Source buffer
|
| 127 |
|
|
srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
|
| 128 |
|
|
srcPos int // reading position (srcBuf index)
|
| 129 |
|
|
srcEnd int // source end (srcBuf index)
|
| 130 |
|
|
|
| 131 |
|
|
// Source position
|
| 132 |
|
|
srcBufOffset int // byte offset of srcBuf[0] in source
|
| 133 |
|
|
line int // line count
|
| 134 |
|
|
column int // character count
|
| 135 |
|
|
lastLineLen int // length of last line in characters (for correct column reporting)
|
| 136 |
|
|
lastCharLen int // length of last character in bytes
|
| 137 |
|
|
|
| 138 |
|
|
// Token text buffer
|
| 139 |
|
|
// Typically, token text is stored completely in srcBuf, but in general
|
| 140 |
|
|
// the token text's head may be buffered in tokBuf while the token text's
|
| 141 |
|
|
// tail is stored in srcBuf.
|
| 142 |
|
|
tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
|
| 143 |
|
|
tokPos int // token text tail position (srcBuf index); valid if >= 0
|
| 144 |
|
|
tokEnd int // token text tail end (srcBuf index)
|
| 145 |
|
|
|
| 146 |
|
|
// One character look-ahead
|
| 147 |
|
|
ch rune // character before current srcPos
|
| 148 |
|
|
|
| 149 |
|
|
// Error is called for each error encountered. If no Error
|
| 150 |
|
|
// function is set, the error is reported to os.Stderr.
|
| 151 |
|
|
Error func(s *Scanner, msg string)
|
| 152 |
|
|
|
| 153 |
|
|
// ErrorCount is incremented by one for each error encountered.
|
| 154 |
|
|
ErrorCount int
|
| 155 |
|
|
|
| 156 |
|
|
// The Mode field controls which tokens are recognized. For instance,
|
| 157 |
|
|
// to recognize Ints, set the ScanInts bit in Mode. The field may be
|
| 158 |
|
|
// changed at any time.
|
| 159 |
|
|
Mode uint
|
| 160 |
|
|
|
| 161 |
|
|
// The Whitespace field controls which characters are recognized
|
| 162 |
|
|
// as white space. To recognize a character ch <= ' ' as white space,
|
| 163 |
|
|
// set the ch'th bit in Whitespace (the Scanner's behavior is undefined
|
| 164 |
|
|
// for values ch > ' '). The field may be changed at any time.
|
| 165 |
|
|
Whitespace uint64
|
| 166 |
|
|
|
| 167 |
|
|
// Start position of most recently scanned token; set by Scan.
|
| 168 |
|
|
// Calling Init or Next invalidates the position (Line == 0).
|
| 169 |
|
|
// The Filename field is always left untouched by the Scanner.
|
| 170 |
|
|
// If an error is reported (via Error) and Position is invalid,
|
| 171 |
|
|
// the scanner is not inside a token. Call Pos to obtain an error
|
| 172 |
|
|
// position in that case.
|
| 173 |
|
|
Position
|
| 174 |
|
|
}
|
| 175 |
|
|
|
| 176 |
|
|
// Init initializes a Scanner with a new source and returns s.
|
| 177 |
|
|
// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
|
| 178 |
|
|
// and Whitespace is set to GoWhitespace.
|
| 179 |
|
|
func (s *Scanner) Init(src io.Reader) *Scanner {
|
| 180 |
|
|
s.src = src
|
| 181 |
|
|
|
| 182 |
|
|
// initialize source buffer
|
| 183 |
|
|
// (the first call to next() will fill it by calling src.Read)
|
| 184 |
|
|
s.srcBuf[0] = utf8.RuneSelf // sentinel
|
| 185 |
|
|
s.srcPos = 0
|
| 186 |
|
|
s.srcEnd = 0
|
| 187 |
|
|
|
| 188 |
|
|
// initialize source position
|
| 189 |
|
|
s.srcBufOffset = 0
|
| 190 |
|
|
s.line = 1
|
| 191 |
|
|
s.column = 0
|
| 192 |
|
|
s.lastLineLen = 0
|
| 193 |
|
|
s.lastCharLen = 0
|
| 194 |
|
|
|
| 195 |
|
|
// initialize token text buffer
|
| 196 |
|
|
// (required for first call to next()).
|
| 197 |
|
|
s.tokPos = -1
|
| 198 |
|
|
|
| 199 |
|
|
// initialize one character look-ahead
|
| 200 |
|
|
s.ch = -1 // no char read yet
|
| 201 |
|
|
|
| 202 |
|
|
// initialize public fields
|
| 203 |
|
|
s.Error = nil
|
| 204 |
|
|
s.ErrorCount = 0
|
| 205 |
|
|
s.Mode = GoTokens
|
| 206 |
|
|
s.Whitespace = GoWhitespace
|
| 207 |
|
|
s.Line = 0 // invalidate token position
|
| 208 |
|
|
|
| 209 |
|
|
return s
|
| 210 |
|
|
}
|
| 211 |
|
|
|
| 212 |
|
|
// TODO(gri): The code for next() and the internal scanner state could benefit
|
| 213 |
|
|
// from a rethink. While next() is optimized for the common ASCII
|
| 214 |
|
|
// case, the "corrections" needed for proper position tracking undo
|
| 215 |
|
|
// some of the attempts for fast-path optimization.
|
| 216 |
|
|
|
| 217 |
|
|
// next reads and returns the next Unicode character. It is designed such
|
| 218 |
|
|
// that only a minimal amount of work needs to be done in the common ASCII
|
| 219 |
|
|
// case (one test to check for both ASCII and end-of-buffer, and one test
|
| 220 |
|
|
// to check for newlines).
|
| 221 |
|
|
func (s *Scanner) next() rune {
|
| 222 |
|
|
ch, width := rune(s.srcBuf[s.srcPos]), 1
|
| 223 |
|
|
|
| 224 |
|
|
if ch >= utf8.RuneSelf {
|
| 225 |
|
|
// uncommon case: not ASCII or not enough bytes
|
| 226 |
|
|
for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
|
| 227 |
|
|
// not enough bytes: read some more, but first
|
| 228 |
|
|
// save away token text if any
|
| 229 |
|
|
if s.tokPos >= 0 {
|
| 230 |
|
|
s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
|
| 231 |
|
|
s.tokPos = 0
|
| 232 |
|
|
// s.tokEnd is set by Scan()
|
| 233 |
|
|
}
|
| 234 |
|
|
// move unread bytes to beginning of buffer
|
| 235 |
|
|
copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
|
| 236 |
|
|
s.srcBufOffset += s.srcPos
|
| 237 |
|
|
// read more bytes
|
| 238 |
|
|
// (an io.Reader must return io.EOF when it reaches
|
| 239 |
|
|
// the end of what it is reading - simply returning
|
| 240 |
|
|
// n == 0 will make this loop retry forever; but the
|
| 241 |
|
|
// error is in the reader implementation in that case)
|
| 242 |
|
|
i := s.srcEnd - s.srcPos
|
| 243 |
|
|
n, err := s.src.Read(s.srcBuf[i:bufLen])
|
| 244 |
|
|
s.srcPos = 0
|
| 245 |
|
|
s.srcEnd = i + n
|
| 246 |
|
|
s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
|
| 247 |
|
|
if err != nil {
|
| 248 |
|
|
if s.srcEnd == 0 {
|
| 249 |
|
|
if s.lastCharLen > 0 {
|
| 250 |
|
|
// previous character was not EOF
|
| 251 |
|
|
s.column++
|
| 252 |
|
|
}
|
| 253 |
|
|
s.lastCharLen = 0
|
| 254 |
|
|
return EOF
|
| 255 |
|
|
}
|
| 256 |
|
|
if err != io.EOF {
|
| 257 |
|
|
s.error(err.Error())
|
| 258 |
|
|
}
|
| 259 |
|
|
// If err == EOF, we won't be getting more
|
| 260 |
|
|
// bytes; break to avoid infinite loop. If
|
| 261 |
|
|
// err is something else, we don't know if
|
| 262 |
|
|
// we can get more bytes; thus also break.
|
| 263 |
|
|
break
|
| 264 |
|
|
}
|
| 265 |
|
|
}
|
| 266 |
|
|
// at least one byte
|
| 267 |
|
|
ch = rune(s.srcBuf[s.srcPos])
|
| 268 |
|
|
if ch >= utf8.RuneSelf {
|
| 269 |
|
|
// uncommon case: not ASCII
|
| 270 |
|
|
ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
|
| 271 |
|
|
if ch == utf8.RuneError && width == 1 {
|
| 272 |
|
|
// advance for correct error position
|
| 273 |
|
|
s.srcPos += width
|
| 274 |
|
|
s.lastCharLen = width
|
| 275 |
|
|
s.column++
|
| 276 |
|
|
s.error("illegal UTF-8 encoding")
|
| 277 |
|
|
return ch
|
| 278 |
|
|
}
|
| 279 |
|
|
}
|
| 280 |
|
|
}
|
| 281 |
|
|
|
| 282 |
|
|
// advance
|
| 283 |
|
|
s.srcPos += width
|
| 284 |
|
|
s.lastCharLen = width
|
| 285 |
|
|
s.column++
|
| 286 |
|
|
|
| 287 |
|
|
// special situations
|
| 288 |
|
|
switch ch {
|
| 289 |
|
|
case 0:
|
| 290 |
|
|
// implementation restriction for compatibility with other tools
|
| 291 |
|
|
s.error("illegal character NUL")
|
| 292 |
|
|
case '\n':
|
| 293 |
|
|
s.line++
|
| 294 |
|
|
s.lastLineLen = s.column
|
| 295 |
|
|
s.column = 0
|
| 296 |
|
|
}
|
| 297 |
|
|
|
| 298 |
|
|
return ch
|
| 299 |
|
|
}
|
| 300 |
|
|
|
| 301 |
|
|
// Next reads and returns the next Unicode character.
|
| 302 |
|
|
// It returns EOF at the end of the source. It reports
|
| 303 |
|
|
// a read error by calling s.Error, if not nil; otherwise
|
| 304 |
|
|
// it prints an error message to os.Stderr. Next does not
|
| 305 |
|
|
// update the Scanner's Position field; use Pos() to
|
| 306 |
|
|
// get the current position.
|
| 307 |
|
|
func (s *Scanner) Next() rune {
|
| 308 |
|
|
s.tokPos = -1 // don't collect token text
|
| 309 |
|
|
s.Line = 0 // invalidate token position
|
| 310 |
|
|
ch := s.Peek()
|
| 311 |
|
|
s.ch = s.next()
|
| 312 |
|
|
return ch
|
| 313 |
|
|
}
|
| 314 |
|
|
|
| 315 |
|
|
// Peek returns the next Unicode character in the source without advancing
|
| 316 |
|
|
// the scanner. It returns EOF if the scanner's position is at the last
|
| 317 |
|
|
// character of the source.
|
| 318 |
|
|
func (s *Scanner) Peek() rune {
|
| 319 |
|
|
if s.ch < 0 {
|
| 320 |
|
|
s.ch = s.next()
|
| 321 |
|
|
}
|
| 322 |
|
|
return s.ch
|
| 323 |
|
|
}
|
| 324 |
|
|
|
| 325 |
|
|
func (s *Scanner) error(msg string) {
|
| 326 |
|
|
s.ErrorCount++
|
| 327 |
|
|
if s.Error != nil {
|
| 328 |
|
|
s.Error(s, msg)
|
| 329 |
|
|
return
|
| 330 |
|
|
}
|
| 331 |
|
|
pos := s.Position
|
| 332 |
|
|
if !pos.IsValid() {
|
| 333 |
|
|
pos = s.Pos()
|
| 334 |
|
|
}
|
| 335 |
|
|
fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
|
| 336 |
|
|
}
|
| 337 |
|
|
|
| 338 |
|
|
func (s *Scanner) scanIdentifier() rune {
|
| 339 |
|
|
ch := s.next() // read character after first '_' or letter
|
| 340 |
|
|
for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
|
| 341 |
|
|
ch = s.next()
|
| 342 |
|
|
}
|
| 343 |
|
|
return ch
|
| 344 |
|
|
}
|
| 345 |
|
|
|
| 346 |
|
|
// digitVal returns the numeric value of the hexadecimal digit ch
// (0-9, a-f, A-F). For any other character it returns 16, which is
// larger than any legal digit value.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}
|
| 357 |
|
|
|
| 358 |
|
|
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
|
| 359 |
|
|
|
| 360 |
|
|
func (s *Scanner) scanMantissa(ch rune) rune {
|
| 361 |
|
|
for isDecimal(ch) {
|
| 362 |
|
|
ch = s.next()
|
| 363 |
|
|
}
|
| 364 |
|
|
return ch
|
| 365 |
|
|
}
|
| 366 |
|
|
|
| 367 |
|
|
func (s *Scanner) scanFraction(ch rune) rune {
|
| 368 |
|
|
if ch == '.' {
|
| 369 |
|
|
ch = s.scanMantissa(s.next())
|
| 370 |
|
|
}
|
| 371 |
|
|
return ch
|
| 372 |
|
|
}
|
| 373 |
|
|
|
| 374 |
|
|
func (s *Scanner) scanExponent(ch rune) rune {
|
| 375 |
|
|
if ch == 'e' || ch == 'E' {
|
| 376 |
|
|
ch = s.next()
|
| 377 |
|
|
if ch == '-' || ch == '+' {
|
| 378 |
|
|
ch = s.next()
|
| 379 |
|
|
}
|
| 380 |
|
|
ch = s.scanMantissa(ch)
|
| 381 |
|
|
}
|
| 382 |
|
|
return ch
|
| 383 |
|
|
}
|
| 384 |
|
|
|
| 385 |
|
|
func (s *Scanner) scanNumber(ch rune) (rune, rune) {
|
| 386 |
|
|
// isDecimal(ch)
|
| 387 |
|
|
if ch == '0' {
|
| 388 |
|
|
// int or float
|
| 389 |
|
|
ch = s.next()
|
| 390 |
|
|
if ch == 'x' || ch == 'X' {
|
| 391 |
|
|
// hexadecimal int
|
| 392 |
|
|
ch = s.next()
|
| 393 |
|
|
for digitVal(ch) < 16 {
|
| 394 |
|
|
ch = s.next()
|
| 395 |
|
|
}
|
| 396 |
|
|
} else {
|
| 397 |
|
|
// octal int or float
|
| 398 |
|
|
seenDecimalDigit := false
|
| 399 |
|
|
for isDecimal(ch) {
|
| 400 |
|
|
if ch > '7' {
|
| 401 |
|
|
seenDecimalDigit = true
|
| 402 |
|
|
}
|
| 403 |
|
|
ch = s.next()
|
| 404 |
|
|
}
|
| 405 |
|
|
if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
|
| 406 |
|
|
// float
|
| 407 |
|
|
ch = s.scanFraction(ch)
|
| 408 |
|
|
ch = s.scanExponent(ch)
|
| 409 |
|
|
return Float, ch
|
| 410 |
|
|
}
|
| 411 |
|
|
// octal int
|
| 412 |
|
|
if seenDecimalDigit {
|
| 413 |
|
|
s.error("illegal octal number")
|
| 414 |
|
|
}
|
| 415 |
|
|
}
|
| 416 |
|
|
return Int, ch
|
| 417 |
|
|
}
|
| 418 |
|
|
// decimal int or float
|
| 419 |
|
|
ch = s.scanMantissa(ch)
|
| 420 |
|
|
if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
|
| 421 |
|
|
// float
|
| 422 |
|
|
ch = s.scanFraction(ch)
|
| 423 |
|
|
ch = s.scanExponent(ch)
|
| 424 |
|
|
return Float, ch
|
| 425 |
|
|
}
|
| 426 |
|
|
return Int, ch
|
| 427 |
|
|
}
|
| 428 |
|
|
|
| 429 |
|
|
func (s *Scanner) scanDigits(ch rune, base, n int) rune {
|
| 430 |
|
|
for n > 0 && digitVal(ch) < base {
|
| 431 |
|
|
ch = s.next()
|
| 432 |
|
|
n--
|
| 433 |
|
|
}
|
| 434 |
|
|
if n > 0 {
|
| 435 |
|
|
s.error("illegal char escape")
|
| 436 |
|
|
}
|
| 437 |
|
|
return ch
|
| 438 |
|
|
}
|
| 439 |
|
|
|
| 440 |
|
|
func (s *Scanner) scanEscape(quote rune) rune {
|
| 441 |
|
|
ch := s.next() // read character after '/'
|
| 442 |
|
|
switch ch {
|
| 443 |
|
|
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
|
| 444 |
|
|
// nothing to do
|
| 445 |
|
|
ch = s.next()
|
| 446 |
|
|
case '0', '1', '2', '3', '4', '5', '6', '7':
|
| 447 |
|
|
ch = s.scanDigits(ch, 8, 3)
|
| 448 |
|
|
case 'x':
|
| 449 |
|
|
ch = s.scanDigits(s.next(), 16, 2)
|
| 450 |
|
|
case 'u':
|
| 451 |
|
|
ch = s.scanDigits(s.next(), 16, 4)
|
| 452 |
|
|
case 'U':
|
| 453 |
|
|
ch = s.scanDigits(s.next(), 16, 8)
|
| 454 |
|
|
default:
|
| 455 |
|
|
s.error("illegal char escape")
|
| 456 |
|
|
}
|
| 457 |
|
|
return ch
|
| 458 |
|
|
}
|
| 459 |
|
|
|
| 460 |
|
|
func (s *Scanner) scanString(quote rune) (n int) {
|
| 461 |
|
|
ch := s.next() // read character after quote
|
| 462 |
|
|
for ch != quote {
|
| 463 |
|
|
if ch == '\n' || ch < 0 {
|
| 464 |
|
|
s.error("literal not terminated")
|
| 465 |
|
|
return
|
| 466 |
|
|
}
|
| 467 |
|
|
if ch == '\\' {
|
| 468 |
|
|
ch = s.scanEscape(quote)
|
| 469 |
|
|
} else {
|
| 470 |
|
|
ch = s.next()
|
| 471 |
|
|
}
|
| 472 |
|
|
n++
|
| 473 |
|
|
}
|
| 474 |
|
|
return
|
| 475 |
|
|
}
|
| 476 |
|
|
|
| 477 |
|
|
func (s *Scanner) scanRawString() {
|
| 478 |
|
|
ch := s.next() // read character after '`'
|
| 479 |
|
|
for ch != '`' {
|
| 480 |
|
|
if ch < 0 {
|
| 481 |
|
|
s.error("literal not terminated")
|
| 482 |
|
|
return
|
| 483 |
|
|
}
|
| 484 |
|
|
ch = s.next()
|
| 485 |
|
|
}
|
| 486 |
|
|
}
|
| 487 |
|
|
|
| 488 |
|
|
func (s *Scanner) scanChar() {
|
| 489 |
|
|
if s.scanString('\'') != 1 {
|
| 490 |
|
|
s.error("illegal char literal")
|
| 491 |
|
|
}
|
| 492 |
|
|
}
|
| 493 |
|
|
|
| 494 |
|
|
func (s *Scanner) scanComment(ch rune) rune {
|
| 495 |
|
|
// ch == '/' || ch == '*'
|
| 496 |
|
|
if ch == '/' {
|
| 497 |
|
|
// line comment
|
| 498 |
|
|
ch = s.next() // read character after "//"
|
| 499 |
|
|
for ch != '\n' && ch >= 0 {
|
| 500 |
|
|
ch = s.next()
|
| 501 |
|
|
}
|
| 502 |
|
|
return ch
|
| 503 |
|
|
}
|
| 504 |
|
|
|
| 505 |
|
|
// general comment
|
| 506 |
|
|
ch = s.next() // read character after "/*"
|
| 507 |
|
|
for {
|
| 508 |
|
|
if ch < 0 {
|
| 509 |
|
|
s.error("comment not terminated")
|
| 510 |
|
|
break
|
| 511 |
|
|
}
|
| 512 |
|
|
ch0 := ch
|
| 513 |
|
|
ch = s.next()
|
| 514 |
|
|
if ch0 == '*' && ch == '/' {
|
| 515 |
|
|
ch = s.next()
|
| 516 |
|
|
break
|
| 517 |
|
|
}
|
| 518 |
|
|
}
|
| 519 |
|
|
return ch
|
| 520 |
|
|
}
|
| 521 |
|
|
|
| 522 |
|
|
// Scan reads the next token or Unicode character from source and returns it.
|
| 523 |
|
|
// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
|
| 524 |
|
|
// It returns EOF at the end of the source. It reports scanner errors (read and
|
| 525 |
|
|
// token errors) by calling s.Error, if not nil; otherwise it prints an error
|
| 526 |
|
|
// message to os.Stderr.
|
| 527 |
|
|
func (s *Scanner) Scan() rune {
|
| 528 |
|
|
ch := s.Peek()
|
| 529 |
|
|
|
| 530 |
|
|
// reset token text position
|
| 531 |
|
|
s.tokPos = -1
|
| 532 |
|
|
s.Line = 0
|
| 533 |
|
|
|
| 534 |
|
|
redo:
|
| 535 |
|
|
// skip white space
|
| 536 |
|
|
for s.Whitespace&(1<
|
| 537 |
|
|
ch = s.next()
|
| 538 |
|
|
}
|
| 539 |
|
|
|
| 540 |
|
|
// start collecting token text
|
| 541 |
|
|
s.tokBuf.Reset()
|
| 542 |
|
|
s.tokPos = s.srcPos - s.lastCharLen
|
| 543 |
|
|
|
| 544 |
|
|
// set token position
|
| 545 |
|
|
// (this is a slightly optimized version of the code in Pos())
|
| 546 |
|
|
s.Offset = s.srcBufOffset + s.tokPos
|
| 547 |
|
|
if s.column > 0 {
|
| 548 |
|
|
// common case: last character was not a '\n'
|
| 549 |
|
|
s.Line = s.line
|
| 550 |
|
|
s.Column = s.column
|
| 551 |
|
|
} else {
|
| 552 |
|
|
// last character was a '\n'
|
| 553 |
|
|
// (we cannot be at the beginning of the source
|
| 554 |
|
|
// since we have called next() at least once)
|
| 555 |
|
|
s.Line = s.line - 1
|
| 556 |
|
|
s.Column = s.lastLineLen
|
| 557 |
|
|
}
|
| 558 |
|
|
|
| 559 |
|
|
// determine token value
|
| 560 |
|
|
tok := ch
|
| 561 |
|
|
switch {
|
| 562 |
|
|
case unicode.IsLetter(ch) || ch == '_':
|
| 563 |
|
|
if s.Mode&ScanIdents != 0 {
|
| 564 |
|
|
tok = Ident
|
| 565 |
|
|
ch = s.scanIdentifier()
|
| 566 |
|
|
} else {
|
| 567 |
|
|
ch = s.next()
|
| 568 |
|
|
}
|
| 569 |
|
|
case isDecimal(ch):
|
| 570 |
|
|
if s.Mode&(ScanInts|ScanFloats) != 0 {
|
| 571 |
|
|
tok, ch = s.scanNumber(ch)
|
| 572 |
|
|
} else {
|
| 573 |
|
|
ch = s.next()
|
| 574 |
|
|
}
|
| 575 |
|
|
default:
|
| 576 |
|
|
switch ch {
|
| 577 |
|
|
case '"':
|
| 578 |
|
|
if s.Mode&ScanStrings != 0 {
|
| 579 |
|
|
s.scanString('"')
|
| 580 |
|
|
tok = String
|
| 581 |
|
|
}
|
| 582 |
|
|
ch = s.next()
|
| 583 |
|
|
case '\'':
|
| 584 |
|
|
if s.Mode&ScanChars != 0 {
|
| 585 |
|
|
s.scanChar()
|
| 586 |
|
|
tok = Char
|
| 587 |
|
|
}
|
| 588 |
|
|
ch = s.next()
|
| 589 |
|
|
case '.':
|
| 590 |
|
|
ch = s.next()
|
| 591 |
|
|
if isDecimal(ch) && s.Mode&ScanFloats != 0 {
|
| 592 |
|
|
tok = Float
|
| 593 |
|
|
ch = s.scanMantissa(ch)
|
| 594 |
|
|
ch = s.scanExponent(ch)
|
| 595 |
|
|
}
|
| 596 |
|
|
case '/':
|
| 597 |
|
|
ch = s.next()
|
| 598 |
|
|
if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
|
| 599 |
|
|
if s.Mode&SkipComments != 0 {
|
| 600 |
|
|
s.tokPos = -1 // don't collect token text
|
| 601 |
|
|
ch = s.scanComment(ch)
|
| 602 |
|
|
goto redo
|
| 603 |
|
|
}
|
| 604 |
|
|
ch = s.scanComment(ch)
|
| 605 |
|
|
tok = Comment
|
| 606 |
|
|
}
|
| 607 |
|
|
case '`':
|
| 608 |
|
|
if s.Mode&ScanRawStrings != 0 {
|
| 609 |
|
|
s.scanRawString()
|
| 610 |
|
|
tok = String
|
| 611 |
|
|
}
|
| 612 |
|
|
ch = s.next()
|
| 613 |
|
|
default:
|
| 614 |
|
|
ch = s.next()
|
| 615 |
|
|
}
|
| 616 |
|
|
}
|
| 617 |
|
|
|
| 618 |
|
|
// end of token text
|
| 619 |
|
|
s.tokEnd = s.srcPos - s.lastCharLen
|
| 620 |
|
|
|
| 621 |
|
|
s.ch = ch
|
| 622 |
|
|
return tok
|
| 623 |
|
|
}
|
| 624 |
|
|
|
| 625 |
|
|
// Pos returns the position of the character immediately after
|
| 626 |
|
|
// the character or token returned by the last call to Next or Scan.
|
| 627 |
|
|
func (s *Scanner) Pos() (pos Position) {
|
| 628 |
|
|
pos.Filename = s.Filename
|
| 629 |
|
|
pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
|
| 630 |
|
|
switch {
|
| 631 |
|
|
case s.column > 0:
|
| 632 |
|
|
// common case: last character was not a '\n'
|
| 633 |
|
|
pos.Line = s.line
|
| 634 |
|
|
pos.Column = s.column
|
| 635 |
|
|
case s.lastLineLen > 0:
|
| 636 |
|
|
// last character was a '\n'
|
| 637 |
|
|
pos.Line = s.line - 1
|
| 638 |
|
|
pos.Column = s.lastLineLen
|
| 639 |
|
|
default:
|
| 640 |
|
|
// at the beginning of the source
|
| 641 |
|
|
pos.Line = 1
|
| 642 |
|
|
pos.Column = 1
|
| 643 |
|
|
}
|
| 644 |
|
|
return
|
| 645 |
|
|
}
|
| 646 |
|
|
|
| 647 |
|
|
// TokenText returns the string corresponding to the most recently scanned token.
|
| 648 |
|
|
// Valid after calling Scan().
|
| 649 |
|
|
func (s *Scanner) TokenText() string {
|
| 650 |
|
|
if s.tokPos < 0 {
|
| 651 |
|
|
// no token text
|
| 652 |
|
|
return ""
|
| 653 |
|
|
}
|
| 654 |
|
|
|
| 655 |
|
|
if s.tokEnd < 0 {
|
| 656 |
|
|
// if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
|
| 657 |
|
|
s.tokEnd = s.tokPos
|
| 658 |
|
|
}
|
| 659 |
|
|
|
| 660 |
|
|
if s.tokBuf.Len() == 0 {
|
| 661 |
|
|
// common case: the entire token text is still in srcBuf
|
| 662 |
|
|
return string(s.srcBuf[s.tokPos:s.tokEnd])
|
| 663 |
|
|
}
|
| 664 |
|
|
|
| 665 |
|
|
// part of the token text was saved in tokBuf: save the rest in
|
| 666 |
|
|
// tokBuf as well and return its content
|
| 667 |
|
|
s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
|
| 668 |
|
|
s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
|
| 669 |
|
|
return s.tokBuf.String()
|
| 670 |
|
|
}
|