OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [text/] [scanner/] [scanner.go] - Blame information for rev 791

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2009 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
// Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
6
// It takes an io.Reader providing the source, which then can be tokenized
7
// through repeated calls to the Scan function.  For compatibility with
8
// existing tools, the NUL character is not allowed (implementation
9
// restriction).
10
//
11
// By default, a Scanner skips white space and Go comments and recognizes all
12
// literals as defined by the Go language specification.  It may be
13
// customized to recognize only a subset of those literals and to recognize
14
// different white space characters.
15
//
16
// Basic usage pattern:
17
//
18
//      var s scanner.Scanner
19
//      s.Init(src)
20
//      tok := s.Scan()
21
//      for tok != scanner.EOF {
22
//              // do something with tok
23
//              tok = s.Scan()
24
//      }
25
//
26
package scanner
27
 
28
import (
29
        "bytes"
30
        "fmt"
31
        "io"
32
        "os"
33
        "unicode"
34
        "unicode/utf8"
35
)
36
 
37
// TODO(gri): Consider changing this to use the new (token) Position package.
38
 
39
// Position describes a location in the source.
// A Position is valid if its Line is greater than zero.
type Position struct {
	Filename string // name of the source file, if any
	Offset   int    // byte offset (0-based)
	Line     int    // line number (1-based)
	Column   int    // column number (1-based), counted in characters per line
}
47
 
48
// IsValid returns true if the position is valid.
49
func (pos *Position) IsValid() bool { return pos.Line > 0 }
50
 
51
// String formats pos for display:
//
//	file:line:column   valid position with a file name
//	line:column        valid position without a file name
//	file               invalid position with a file name
//	???                invalid position without a file name
func (pos Position) String() string {
	valid := pos.IsValid()
	switch {
	case valid && pos.Filename != "":
		return fmt.Sprintf("%s:%d:%d", pos.Filename, pos.Line, pos.Column)
	case valid:
		return fmt.Sprintf("%d:%d", pos.Line, pos.Column)
	case pos.Filename != "":
		return pos.Filename
	}
	return "???"
}
64
 
65
// Predefined mode bits to control recognition of tokens. For instance,
66
// to configure a Scanner such that it only recognizes (Go) identifiers,
67
// integers, and skips comments, set the Scanner's Mode field to:
68
//
69
//      ScanIdents | ScanInts | SkipComments
70
//
71
const (
72
        ScanIdents     = 1 << -Ident
73
        ScanInts       = 1 << -Int
74
        ScanFloats     = 1 << -Float // includes Ints
75
        ScanChars      = 1 << -Char
76
        ScanStrings    = 1 << -String
77
        ScanRawStrings = 1 << -RawString
78
        ScanComments   = 1 << -Comment
79
        SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
80
        GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
81
)
82
 
83
// Scan returns either one of the tokens below or a Unicode character.
// The token values are negative (EOF is -1, Ident is -2, and so on),
// so they cannot collide with a character value.
const (
	EOF         = -(iota + 1) // end of source
	Ident                     // identifier
	Int                       // integer literal
	Float                     // floating-point literal
	Char                      // character literal
	String                    // interpreted string literal
	RawString                 // raw string literal
	Comment                   // comment
	skipComment               // internal: only used to derive the SkipComments mode bit
)
95
 
96
var tokenString = map[rune]string{
97
        EOF:       "EOF",
98
        Ident:     "Ident",
99
        Int:       "Int",
100
        Float:     "Float",
101
        Char:      "Char",
102
        String:    "String",
103
        RawString: "RawString",
104
        Comment:   "Comment",
105
}
106
 
107
// TokenString returns a (visible) string for a token or Unicode character.
108
func TokenString(tok rune) string {
109
        if s, found := tokenString[tok]; found {
110
                return s
111
        }
112
        return fmt.Sprintf("%q", string(tok))
113
}
114
 
115
// GoWhitespace is the default value for the Scanner's Whitespace field.
// It has one bit set per Go white space character: space, horizontal
// tab, newline, and carriage return.
const GoWhitespace = 1<<' ' | 1<<'\t' | 1<<'\n' | 1<<'\r'
118
 
119
// bufLen is the capacity of the source buffer in bytes;
// it must be at least utf8.UTFMax.
const bufLen = 1 << 10
120
 
121
// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
122
type Scanner struct {
123
        // Input
124
        src io.Reader
125
 
126
        // Source buffer
127
        srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
128
        srcPos int              // reading position (srcBuf index)
129
        srcEnd int              // source end (srcBuf index)
130
 
131
        // Source position
132
        srcBufOffset int // byte offset of srcBuf[0] in source
133
        line         int // line count
134
        column       int // character count
135
        lastLineLen  int // length of last line in characters (for correct column reporting)
136
        lastCharLen  int // length of last character in bytes
137
 
138
        // Token text buffer
139
        // Typically, token text is stored completely in srcBuf, but in general
140
        // the token text's head may be buffered in tokBuf while the token text's
141
        // tail is stored in srcBuf.
142
        tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
143
        tokPos int          // token text tail position (srcBuf index); valid if >= 0
144
        tokEnd int          // token text tail end (srcBuf index)
145
 
146
        // One character look-ahead
147
        ch rune // character before current srcPos
148
 
149
        // Error is called for each error encountered. If no Error
150
        // function is set, the error is reported to os.Stderr.
151
        Error func(s *Scanner, msg string)
152
 
153
        // ErrorCount is incremented by one for each error encountered.
154
        ErrorCount int
155
 
156
        // The Mode field controls which tokens are recognized. For instance,
157
        // to recognize Ints, set the ScanInts bit in Mode. The field may be
158
        // changed at any time.
159
        Mode uint
160
 
161
        // The Whitespace field controls which characters are recognized
162
        // as white space. To recognize a character ch <= ' ' as white space,
163
        // set the ch'th bit in Whitespace (the Scanner's behavior is undefined
164
        // for values ch > ' '). The field may be changed at any time.
165
        Whitespace uint64
166
 
167
        // Start position of most recently scanned token; set by Scan.
168
        // Calling Init or Next invalidates the position (Line == 0).
169
        // The Filename field is always left untouched by the Scanner.
170
        // If an error is reported (via Error) and Position is invalid,
171
        // the scanner is not inside a token. Call Pos to obtain an error
172
        // position in that case.
173
        Position
174
}
175
 
176
// Init initializes a Scanner with a new source and returns s.
177
// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
178
// and Whitespace is set to GoWhitespace.
179
func (s *Scanner) Init(src io.Reader) *Scanner {
180
        s.src = src
181
 
182
        // initialize source buffer
183
        // (the first call to next() will fill it by calling src.Read)
184
        s.srcBuf[0] = utf8.RuneSelf // sentinel
185
        s.srcPos = 0
186
        s.srcEnd = 0
187
 
188
        // initialize source position
189
        s.srcBufOffset = 0
190
        s.line = 1
191
        s.column = 0
192
        s.lastLineLen = 0
193
        s.lastCharLen = 0
194
 
195
        // initialize token text buffer
196
        // (required for first call to next()).
197
        s.tokPos = -1
198
 
199
        // initialize one character look-ahead
200
        s.ch = -1 // no char read yet
201
 
202
        // initialize public fields
203
        s.Error = nil
204
        s.ErrorCount = 0
205
        s.Mode = GoTokens
206
        s.Whitespace = GoWhitespace
207
        s.Line = 0 // invalidate token position
208
 
209
        return s
210
}
211
 
212
// TODO(gri): The code for next() and the internal scanner state could benefit
213
//            from a rethink. While next() is optimized for the common ASCII
214
//            case, the "corrections" needed for proper position tracking undo
215
//            some of the attempts for fast-path optimization.
216
 
217
// next reads and returns the next Unicode character. It is designed such
218
// that only a minimal amount of work needs to be done in the common ASCII
219
// case (one test to check for both ASCII and end-of-buffer, and one test
220
// to check for newlines).
221
func (s *Scanner) next() rune {
222
        ch, width := rune(s.srcBuf[s.srcPos]), 1
223
 
224
        if ch >= utf8.RuneSelf {
225
                // uncommon case: not ASCII or not enough bytes
226
                for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
227
                        // not enough bytes: read some more, but first
228
                        // save away token text if any
229
                        if s.tokPos >= 0 {
230
                                s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
231
                                s.tokPos = 0
232
                                // s.tokEnd is set by Scan()
233
                        }
234
                        // move unread bytes to beginning of buffer
235
                        copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
236
                        s.srcBufOffset += s.srcPos
237
                        // read more bytes
238
                        // (an io.Reader must return io.EOF when it reaches
239
                        // the end of what it is reading - simply returning
240
                        // n == 0 will make this loop retry forever; but the
241
                        // error is in the reader implementation in that case)
242
                        i := s.srcEnd - s.srcPos
243
                        n, err := s.src.Read(s.srcBuf[i:bufLen])
244
                        s.srcPos = 0
245
                        s.srcEnd = i + n
246
                        s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
247
                        if err != nil {
248
                                if s.srcEnd == 0 {
249
                                        if s.lastCharLen > 0 {
250
                                                // previous character was not EOF
251
                                                s.column++
252
                                        }
253
                                        s.lastCharLen = 0
254
                                        return EOF
255
                                }
256
                                if err != io.EOF {
257
                                        s.error(err.Error())
258
                                }
259
                                // If err == EOF, we won't be getting more
260
                                // bytes; break to avoid infinite loop. If
261
                                // err is something else, we don't know if
262
                                // we can get more bytes; thus also break.
263
                                break
264
                        }
265
                }
266
                // at least one byte
267
                ch = rune(s.srcBuf[s.srcPos])
268
                if ch >= utf8.RuneSelf {
269
                        // uncommon case: not ASCII
270
                        ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
271
                        if ch == utf8.RuneError && width == 1 {
272
                                // advance for correct error position
273
                                s.srcPos += width
274
                                s.lastCharLen = width
275
                                s.column++
276
                                s.error("illegal UTF-8 encoding")
277
                                return ch
278
                        }
279
                }
280
        }
281
 
282
        // advance
283
        s.srcPos += width
284
        s.lastCharLen = width
285
        s.column++
286
 
287
        // special situations
288
        switch ch {
289
        case 0:
290
                // implementation restriction for compatibility with other tools
291
                s.error("illegal character NUL")
292
        case '\n':
293
                s.line++
294
                s.lastLineLen = s.column
295
                s.column = 0
296
        }
297
 
298
        return ch
299
}
300
 
301
// Next reads and returns the next Unicode character.
302
// It returns EOF at the end of the source. It reports
303
// a read error by calling s.Error, if not nil; otherwise
304
// it prints an error message to os.Stderr. Next does not
305
// update the Scanner's Position field; use Pos() to
306
// get the current position.
307
func (s *Scanner) Next() rune {
308
        s.tokPos = -1 // don't collect token text
309
        s.Line = 0    // invalidate token position
310
        ch := s.Peek()
311
        s.ch = s.next()
312
        return ch
313
}
314
 
315
// Peek returns the next Unicode character in the source without advancing
316
// the scanner. It returns EOF if the scanner's position is at the last
317
// character of the source.
318
func (s *Scanner) Peek() rune {
319
        if s.ch < 0 {
320
                s.ch = s.next()
321
        }
322
        return s.ch
323
}
324
 
325
// error counts an error and reports msg. If an Error handler is
// installed it receives the message; otherwise the message is printed
// to os.Stderr, prefixed with the token position or, if that position
// is invalid, with the current position as returned by Pos.
func (s *Scanner) error(msg string) {
	s.ErrorCount++
	if handler := s.Error; handler != nil {
		handler(s, msg)
		return
	}
	where := s.Position
	if !where.IsValid() {
		where = s.Pos()
	}
	fmt.Fprintf(os.Stderr, "%s: %s\n", where, msg)
}
337
 
338
// scanIdentifier consumes the rest of an identifier whose first
// character ('_' or a letter) has already been read. It returns the
// first character that is not '_', a letter, or a digit.
func (s *Scanner) scanIdentifier() rune {
	for {
		ch := s.next()
		if ch != '_' && !unicode.IsLetter(ch) && !unicode.IsDigit(ch) {
			return ch
		}
	}
}
345
 
346
// digitVal returns the numeric value of the hexadecimal digit ch
// ('0'-'9', 'a'-'f', or 'A'-'F'). For any other character it returns
// 16, which is larger than any legal digit value.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}
357
 
358
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
359
 
360
// scanMantissa skips a run of decimal digits starting with ch and
// returns the first character that is not a decimal digit.
func (s *Scanner) scanMantissa(ch rune) rune {
	for ; isDecimal(ch); ch = s.next() {
	}
	return ch
}
366
 
367
// scanFraction consumes an optional fraction: if ch is '.', the dot and
// the decimal digits following it are read. It returns the first
// character after the fraction, or ch itself if ch is not '.'.
func (s *Scanner) scanFraction(ch rune) rune {
	if ch != '.' {
		return ch
	}
	return s.scanMantissa(s.next())
}
373
 
374
// scanExponent consumes an optional exponent: if ch is 'e' or 'E', the
// exponent marker, an optional sign, and the decimal digits following
// them are read. It returns the first character after the exponent, or
// ch itself if ch does not start an exponent.
func (s *Scanner) scanExponent(ch rune) rune {
	if ch != 'e' && ch != 'E' {
		return ch
	}
	ch = s.next()
	switch ch {
	case '-', '+':
		ch = s.next()
	}
	return s.scanMantissa(ch)
}
384
 
385
// scanNumber scans a number whose first character ch is a decimal digit
// that has already been read. It returns the token (Int or Float) and
// the first character following the number. Floats are recognized only
// if the ScanFloats bit is set in s.Mode. A number with a leading '0'
// that is neither hexadecimal nor a float is treated as octal; an error
// is reported if it contains the digits '8' or '9'.
func (s *Scanner) scanNumber(ch rune) (rune, rune) {
	// isDecimal(ch)
	if ch != '0' {
		// decimal int or float
		ch = s.scanMantissa(ch)
		if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
			// float
			ch = s.scanFraction(ch)
			ch = s.scanExponent(ch)
			return Float, ch
		}
		return Int, ch
	}

	// leading '0': hexadecimal int, octal int, or float
	ch = s.next()
	if ch == 'x' || ch == 'X' {
		// hexadecimal int
		ch = s.next()
		for digitVal(ch) < 16 {
			ch = s.next()
		}
		return Int, ch
	}

	// octal int or float; remember whether a digit occurred that is
	// not legal in an octal number
	nonOctal := false
	for ; isDecimal(ch); ch = s.next() {
		if ch > '7' {
			nonOctal = true
		}
	}
	if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
		// float
		ch = s.scanFraction(ch)
		ch = s.scanExponent(ch)
		return Float, ch
	}
	// octal int
	if nonOctal {
		s.error("illegal octal number")
	}
	return Int, ch
}
428
 
429
// scanDigits consumes up to n digits of the given base, starting with
// ch, and returns the first character that was not consumed. An error
// is reported if fewer than n digits are found.
func (s *Scanner) scanDigits(ch rune, base, n int) rune {
	missing := n
	for ; missing > 0 && digitVal(ch) < base; missing-- {
		ch = s.next()
	}
	if missing > 0 {
		s.error("illegal char escape")
	}
	return ch
}
439
 
440
func (s *Scanner) scanEscape(quote rune) rune {
441
        ch := s.next() // read character after '/'
442
        switch ch {
443
        case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
444
                // nothing to do
445
                ch = s.next()
446
        case '0', '1', '2', '3', '4', '5', '6', '7':
447
                ch = s.scanDigits(ch, 8, 3)
448
        case 'x':
449
                ch = s.scanDigits(s.next(), 16, 2)
450
        case 'u':
451
                ch = s.scanDigits(s.next(), 16, 4)
452
        case 'U':
453
                ch = s.scanDigits(s.next(), 16, 8)
454
        default:
455
                s.error("illegal char escape")
456
        }
457
        return ch
458
}
459
 
460
// scanString scans the body of a literal delimited by quote; the
// opening quote has already been read. It stops at the closing quote
// (which is read but not passed) and returns the number n of characters
// and escape sequences in the literal. If a newline or the end of the
// source is found first, an error is reported and the count so far is
// returned.
func (s *Scanner) scanString(quote rune) (n int) {
	for ch := s.next(); ch != quote; n++ {
		if ch == '\n' || ch < 0 {
			s.error("literal not terminated")
			return n
		}
		if ch == '\\' {
			ch = s.scanEscape(quote)
		} else {
			ch = s.next()
		}
	}
	return n
}
476
 
477
// scanRawString scans the body of a raw string literal; the opening
// '`' has already been read. It stops at the closing '`' and reports an
// error if the end of the source is reached first.
func (s *Scanner) scanRawString() {
	for ch := s.next(); ch != '`'; ch = s.next() {
		if ch < 0 {
			s.error("literal not terminated")
			return
		}
	}
}
487
 
488
// scanChar scans a character literal whose opening quote has already
// been read. An error is reported unless the literal consists of
// exactly one character or escape sequence.
func (s *Scanner) scanChar() {
	if n := s.scanString('\''); n != 1 {
		s.error("illegal char literal")
	}
}
493
 
494
func (s *Scanner) scanComment(ch rune) rune {
495
        // ch == '/' || ch == '*'
496
        if ch == '/' {
497
                // line comment
498
                ch = s.next() // read character after "//"
499
                for ch != '\n' && ch >= 0 {
500
                        ch = s.next()
501
                }
502
                return ch
503
        }
504
 
505
        // general comment
506
        ch = s.next() // read character after "/*"
507
        for {
508
                if ch < 0 {
509
                        s.error("comment not terminated")
510
                        break
511
                }
512
                ch0 := ch
513
                ch = s.next()
514
                if ch0 == '*' && ch == '/' {
515
                        ch = s.next()
516
                        break
517
                }
518
        }
519
        return ch
520
}
521
 
522
// Scan reads the next token or Unicode character from source and returns it.
523
// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
524
// It returns EOF at the end of the source. It reports scanner errors (read and
525
// token errors) by calling s.Error, if not nil; otherwise it prints an error
526
// message to os.Stderr.
527
func (s *Scanner) Scan() rune {
528
        ch := s.Peek()
529
 
530
        // reset token text position
531
        s.tokPos = -1
532
        s.Line = 0
533
 
534
redo:
535
        // skip white space
536
        for s.Whitespace&(1<
537
                ch = s.next()
538
        }
539
 
540
        // start collecting token text
541
        s.tokBuf.Reset()
542
        s.tokPos = s.srcPos - s.lastCharLen
543
 
544
        // set token position
545
        // (this is a slightly optimized version of the code in Pos())
546
        s.Offset = s.srcBufOffset + s.tokPos
547
        if s.column > 0 {
548
                // common case: last character was not a '\n'
549
                s.Line = s.line
550
                s.Column = s.column
551
        } else {
552
                // last character was a '\n'
553
                // (we cannot be at the beginning of the source
554
                // since we have called next() at least once)
555
                s.Line = s.line - 1
556
                s.Column = s.lastLineLen
557
        }
558
 
559
        // determine token value
560
        tok := ch
561
        switch {
562
        case unicode.IsLetter(ch) || ch == '_':
563
                if s.Mode&ScanIdents != 0 {
564
                        tok = Ident
565
                        ch = s.scanIdentifier()
566
                } else {
567
                        ch = s.next()
568
                }
569
        case isDecimal(ch):
570
                if s.Mode&(ScanInts|ScanFloats) != 0 {
571
                        tok, ch = s.scanNumber(ch)
572
                } else {
573
                        ch = s.next()
574
                }
575
        default:
576
                switch ch {
577
                case '"':
578
                        if s.Mode&ScanStrings != 0 {
579
                                s.scanString('"')
580
                                tok = String
581
                        }
582
                        ch = s.next()
583
                case '\'':
584
                        if s.Mode&ScanChars != 0 {
585
                                s.scanChar()
586
                                tok = Char
587
                        }
588
                        ch = s.next()
589
                case '.':
590
                        ch = s.next()
591
                        if isDecimal(ch) && s.Mode&ScanFloats != 0 {
592
                                tok = Float
593
                                ch = s.scanMantissa(ch)
594
                                ch = s.scanExponent(ch)
595
                        }
596
                case '/':
597
                        ch = s.next()
598
                        if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
599
                                if s.Mode&SkipComments != 0 {
600
                                        s.tokPos = -1 // don't collect token text
601
                                        ch = s.scanComment(ch)
602
                                        goto redo
603
                                }
604
                                ch = s.scanComment(ch)
605
                                tok = Comment
606
                        }
607
                case '`':
608
                        if s.Mode&ScanRawStrings != 0 {
609
                                s.scanRawString()
610
                                tok = String
611
                        }
612
                        ch = s.next()
613
                default:
614
                        ch = s.next()
615
                }
616
        }
617
 
618
        // end of token text
619
        s.tokEnd = s.srcPos - s.lastCharLen
620
 
621
        s.ch = ch
622
        return tok
623
}
624
 
625
// Pos returns the position of the character immediately after
626
// the character or token returned by the last call to Next or Scan.
627
func (s *Scanner) Pos() (pos Position) {
628
        pos.Filename = s.Filename
629
        pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
630
        switch {
631
        case s.column > 0:
632
                // common case: last character was not a '\n'
633
                pos.Line = s.line
634
                pos.Column = s.column
635
        case s.lastLineLen > 0:
636
                // last character was a '\n'
637
                pos.Line = s.line - 1
638
                pos.Column = s.lastLineLen
639
        default:
640
                // at the beginning of the source
641
                pos.Line = 1
642
                pos.Column = 1
643
        }
644
        return
645
}
646
 
647
// TokenText returns the string corresponding to the most recently scanned token.
648
// Valid after calling Scan().
649
func (s *Scanner) TokenText() string {
650
        if s.tokPos < 0 {
651
                // no token text
652
                return ""
653
        }
654
 
655
        if s.tokEnd < 0 {
656
                // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
657
                s.tokEnd = s.tokPos
658
        }
659
 
660
        if s.tokBuf.Len() == 0 {
661
                // common case: the entire token text is still in srcBuf
662
                return string(s.srcBuf[s.tokPos:s.tokEnd])
663
        }
664
 
665
        // part of the token text was saved in tokBuf: save the rest in
666
        // tokBuf as well and return its content
667
        s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
668
        s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
669
        return s.tokBuf.String()
670
}

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.