OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [regexp/] [syntax/] [parse.go] - Blame information for rev 801

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2011 The Go Authors.  All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
package syntax
6
 
7
import (
8
        "sort"
9
        "strings"
10
        "unicode"
11
        "unicode/utf8"
12
)
13
 
14
// An Error describes a failure to parse a regular expression
15
// and gives the offending expression.
16
type Error struct {
17
        Code ErrorCode
18
        Expr string
19
}
20
 
21
func (e *Error) Error() string {
22
        return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
23
}
24
 
25
// An ErrorCode describes a failure to parse a regular expression.
26
type ErrorCode string
27
 
28
const (
29
        // Unexpected error
30
        ErrInternalError ErrorCode = "regexp/syntax: internal error"
31
 
32
        // Parse errors
33
        ErrInvalidCharClass      ErrorCode = "invalid character class"
34
        ErrInvalidCharRange      ErrorCode = "invalid character class range"
35
        ErrInvalidEscape         ErrorCode = "invalid escape sequence"
36
        ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
37
        ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
38
        ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
39
        ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
40
        ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
41
        ErrMissingBracket        ErrorCode = "missing closing ]"
42
        ErrMissingParen          ErrorCode = "missing closing )"
43
        ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
44
        ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
45
)
46
 
47
func (e ErrorCode) String() string {
48
        return string(e)
49
}
50
 
51
// Flags control the behavior of the parser and record information about regexp context.
52
type Flags uint16
53
 
54
const (
55
        FoldCase      Flags = 1 << iota // case-insensitive match
56
        Literal                         // treat pattern as literal string
57
        ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
58
        DotNL                           // allow . to match newline
59
        OneLine                         // treat ^ and $ as only matching at beginning and end of text
60
        NonGreedy                       // make repetition operators default to non-greedy
61
        PerlX                           // allow Perl extensions
62
        UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
63
        WasDollar                       // regexp OpEndText was $, not \z
64
        Simple                          // regexp contains no counted repetition
65
 
66
        MatchNL = ClassNL | DotNL
67
 
68
        Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
69
        POSIX Flags = 0                                         // POSIX syntax
70
)
71
 
72
// Pseudo-ops for parsing stack.
73
const (
74
        opLeftParen = opPseudo + iota
75
        opVerticalBar
76
)
77
 
78
type parser struct {
79
        flags       Flags     // parse mode flags
80
        stack       []*Regexp // stack of parsed expressions
81
        free        *Regexp
82
        numCap      int // number of capturing groups seen
83
        wholeRegexp string
84
        tmpClass    []rune // temporary char class work space
85
}
86
 
87
func (p *parser) newRegexp(op Op) *Regexp {
88
        re := p.free
89
        if re != nil {
90
                p.free = re.Sub0[0]
91
                *re = Regexp{}
92
        } else {
93
                re = new(Regexp)
94
        }
95
        re.Op = op
96
        return re
97
}
98
 
99
func (p *parser) reuse(re *Regexp) {
100
        re.Sub0[0] = p.free
101
        p.free = re
102
}
103
 
104
// Parse stack manipulation.
105
 
106
// push pushes the regexp re onto the parse stack and returns the regexp.
107
func (p *parser) push(re *Regexp) *Regexp {
108
        if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
109
                // Single rune.
110
                if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
111
                        return nil
112
                }
113
                re.Op = OpLiteral
114
                re.Rune = re.Rune[:1]
115
                re.Flags = p.flags &^ FoldCase
116
        } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
117
                re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
118
                unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
119
                unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
120
                re.Op == OpCharClass && len(re.Rune) == 2 &&
121
                        re.Rune[0]+1 == re.Rune[1] &&
122
                        unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
123
                        unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
124
                // Case-insensitive rune like [Aa] or [Δδ].
125
                if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
126
                        return nil
127
                }
128
 
129
                // Rewrite as (case-insensitive) literal.
130
                re.Op = OpLiteral
131
                re.Rune = re.Rune[:1]
132
                re.Flags = p.flags | FoldCase
133
        } else {
134
                // Incremental concatenation.
135
                p.maybeConcat(-1, 0)
136
        }
137
 
138
        p.stack = append(p.stack, re)
139
        return re
140
}
141
 
142
// maybeConcat implements incremental concatenation
143
// of literal runes into string nodes.  The parser calls this
144
// before each push, so only the top fragment of the stack
145
// might need processing.  Since this is called before a push,
146
// the topmost literal is no longer subject to operators like *
147
// (Otherwise ab* would turn into (ab)*.)
148
// If r >= 0 and there's a node left over, maybeConcat uses it
149
// to push r with the given flags.
150
// maybeConcat reports whether r was pushed.
151
func (p *parser) maybeConcat(r rune, flags Flags) bool {
152
        n := len(p.stack)
153
        if n < 2 {
154
                return false
155
        }
156
 
157
        re1 := p.stack[n-1]
158
        re2 := p.stack[n-2]
159
        if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
160
                return false
161
        }
162
 
163
        // Push re1 into re2.
164
        re2.Rune = append(re2.Rune, re1.Rune...)
165
 
166
        // Reuse re1 if possible.
167
        if r >= 0 {
168
                re1.Rune = re1.Rune0[:1]
169
                re1.Rune[0] = r
170
                re1.Flags = flags
171
                return true
172
        }
173
 
174
        p.stack = p.stack[:n-1]
175
        p.reuse(re1)
176
        return false // did not push r
177
}
178
 
179
// newLiteral returns a new OpLiteral Regexp with the given flags
180
func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
181
        re := p.newRegexp(OpLiteral)
182
        re.Flags = flags
183
        if flags&FoldCase != 0 {
184
                r = minFoldRune(r)
185
        }
186
        re.Rune0[0] = r
187
        re.Rune = re.Rune0[:1]
188
        return re
189
}
190
 
191
// minFoldRune returns the minimum rune fold-equivalent to r.
192
func minFoldRune(r rune) rune {
193
        if r < MinFold || r > MaxFold {
194
                return r
195
        }
196
        min := r
197
        r0 := r
198
        for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
199
                if min > r {
200
                        min = r
201
                }
202
        }
203
        return min
204
}
205
 
206
// literal pushes a literal regexp for the rune r on the stack
207
// and returns that regexp.
208
func (p *parser) literal(r rune) {
209
        p.push(p.newLiteral(r, p.flags))
210
}
211
 
212
// op pushes a regexp with the given op onto the stack
213
// and returns that regexp.
214
func (p *parser) op(op Op) *Regexp {
215
        re := p.newRegexp(op)
216
        re.Flags = p.flags
217
        return p.push(re)
218
}
219
 
220
// repeat replaces the top stack element with itself repeated according to op, min, max.
221
// before is the regexp suffix starting at the repetition operator.
222
// after is the regexp suffix following after the repetition operator.
223
// repeat returns an updated 'after' and an error, if any.
224
func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
225
        flags := p.flags
226
        if p.flags&PerlX != 0 {
227
                if len(after) > 0 && after[0] == '?' {
228
                        after = after[1:]
229
                        flags ^= NonGreedy
230
                }
231
                if lastRepeat != "" {
232
                        // In Perl it is not allowed to stack repetition operators:
233
                        // a** is a syntax error, not a doubled star, and a++ means
234
                        // something else entirely, which we don't support!
235
                        return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
236
                }
237
        }
238
        n := len(p.stack)
239
        if n == 0 {
240
                return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
241
        }
242
        sub := p.stack[n-1]
243
        if sub.Op >= opPseudo {
244
                return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
245
        }
246
        re := p.newRegexp(op)
247
        re.Min = min
248
        re.Max = max
249
        re.Flags = flags
250
        re.Sub = re.Sub0[:1]
251
        re.Sub[0] = sub
252
        p.stack[n-1] = re
253
        return after, nil
254
}
255
 
256
// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
257
func (p *parser) concat() *Regexp {
258
        p.maybeConcat(-1, 0)
259
 
260
        // Scan down to find pseudo-operator | or (.
261
        i := len(p.stack)
262
        for i > 0 && p.stack[i-1].Op < opPseudo {
263
                i--
264
        }
265
        subs := p.stack[i:]
266
        p.stack = p.stack[:i]
267
 
268
        // Empty concatenation is special case.
269
        if len(subs) == 0 {
270
                return p.push(p.newRegexp(OpEmptyMatch))
271
        }
272
 
273
        return p.push(p.collapse(subs, OpConcat))
274
}
275
 
276
// alternate replaces the top of the stack (above the topmost '(') with its alternation.
277
func (p *parser) alternate() *Regexp {
278
        // Scan down to find pseudo-operator (.
279
        // There are no | above (.
280
        i := len(p.stack)
281
        for i > 0 && p.stack[i-1].Op < opPseudo {
282
                i--
283
        }
284
        subs := p.stack[i:]
285
        p.stack = p.stack[:i]
286
 
287
        // Make sure top class is clean.
288
        // All the others already are (see swapVerticalBar).
289
        if len(subs) > 0 {
290
                cleanAlt(subs[len(subs)-1])
291
        }
292
 
293
        // Empty alternate is special case
294
        // (shouldn't happen but easy to handle).
295
        if len(subs) == 0 {
296
                return p.push(p.newRegexp(OpNoMatch))
297
        }
298
 
299
        return p.push(p.collapse(subs, OpAlternate))
300
}
301
 
302
// cleanAlt cleans re for eventual inclusion in an alternation.
303
func cleanAlt(re *Regexp) {
304
        switch re.Op {
305
        case OpCharClass:
306
                re.Rune = cleanClass(&re.Rune)
307
                if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
308
                        re.Rune = nil
309
                        re.Op = OpAnyChar
310
                        return
311
                }
312
                if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
313
                        re.Rune = nil
314
                        re.Op = OpAnyCharNotNL
315
                        return
316
                }
317
                if cap(re.Rune)-len(re.Rune) > 100 {
318
                        // re.Rune will not grow any more.
319
                        // Make a copy or inline to reclaim storage.
320
                        re.Rune = append(re.Rune0[:0], re.Rune...)
321
                }
322
        }
323
}
324
 
325
// collapse returns the result of applying op to sub.
326
// If sub contains op nodes, they all get hoisted up
327
// so that there is never a concat of a concat or an
328
// alternate of an alternate.
329
func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
330
        if len(subs) == 1 {
331
                return subs[0]
332
        }
333
        re := p.newRegexp(op)
334
        re.Sub = re.Sub0[:0]
335
        for _, sub := range subs {
336
                if sub.Op == op {
337
                        re.Sub = append(re.Sub, sub.Sub...)
338
                        p.reuse(sub)
339
                } else {
340
                        re.Sub = append(re.Sub, sub)
341
                }
342
        }
343
        if op == OpAlternate {
344
                re.Sub = p.factor(re.Sub, re.Flags)
345
                if len(re.Sub) == 1 {
346
                        old := re
347
                        re = re.Sub[0]
348
                        p.reuse(old)
349
                }
350
        }
351
        return re
352
}
353
 
354
// factor factors common prefixes from the alternation list sub.
355
// It returns a replacement list that reuses the same storage and
356
// frees (passes to p.reuse) any removed *Regexps.
357
//
358
// For example,
359
//     ABC|ABD|AEF|BCX|BCY
360
// simplifies by literal prefix extraction to
361
//     A(B(C|D)|EF)|BC(X|Y)
362
// which simplifies by character class introduction to
363
//     A(B[CD]|EF)|BC[XY]
364
//
365
func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
366
        if len(sub) < 2 {
367
                return sub
368
        }
369
 
370
        // Round 1: Factor out common literal prefixes.
371
        var str []rune
372
        var strflags Flags
373
        start := 0
374
        out := sub[:0]
375
        for i := 0; i <= len(sub); i++ {
376
                // Invariant: the Regexps that were in sub[0:start] have been
377
                // used or marked for reuse, and the slice space has been reused
378
                // for out (len(out) <= start).
379
                //
380
                // Invariant: sub[start:i] consists of regexps that all begin
381
                // with str as modified by strflags.
382
                var istr []rune
383
                var iflags Flags
384
                if i < len(sub) {
385
                        istr, iflags = p.leadingString(sub[i])
386
                        if iflags == strflags {
387
                                same := 0
388
                                for same < len(str) && same < len(istr) && str[same] == istr[same] {
389
                                        same++
390
                                }
391
                                if same > 0 {
392
                                        // Matches at least one rune in current range.
393
                                        // Keep going around.
394
                                        str = str[:same]
395
                                        continue
396
                                }
397
                        }
398
                }
399
 
400
                // Found end of a run with common leading literal string:
401
                // sub[start:i] all begin with str[0:len(str)], but sub[i]
402
                // does not even begin with str[0].
403
                //
404
                // Factor out common string and append factored expression to out.
405
                if i == start {
406
                        // Nothing to do - run of length 0.
407
                } else if i == start+1 {
408
                        // Just one: don't bother factoring.
409
                        out = append(out, sub[start])
410
                } else {
411
                        // Construct factored form: prefix(suffix1|suffix2|...)
412
                        prefix := p.newRegexp(OpLiteral)
413
                        prefix.Flags = strflags
414
                        prefix.Rune = append(prefix.Rune[:0], str...)
415
 
416
                        for j := start; j < i; j++ {
417
                                sub[j] = p.removeLeadingString(sub[j], len(str))
418
                        }
419
                        suffix := p.collapse(sub[start:i], OpAlternate) // recurse
420
 
421
                        re := p.newRegexp(OpConcat)
422
                        re.Sub = append(re.Sub[:0], prefix, suffix)
423
                        out = append(out, re)
424
                }
425
 
426
                // Prepare for next iteration.
427
                start = i
428
                str = istr
429
                strflags = iflags
430
        }
431
        sub = out
432
 
433
        // Round 2: Factor out common complex prefixes,
434
        // just the first piece of each concatenation,
435
        // whatever it is.  This is good enough a lot of the time.
436
        start = 0
437
        out = sub[:0]
438
        var first *Regexp
439
        for i := 0; i <= len(sub); i++ {
440
                // Invariant: the Regexps that were in sub[0:start] have been
441
                // used or marked for reuse, and the slice space has been reused
442
                // for out (len(out) <= start).
443
                //
444
                // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
445
                var ifirst *Regexp
446
                if i < len(sub) {
447
                        ifirst = p.leadingRegexp(sub[i])
448
                        if first != nil && first.Equal(ifirst) {
449
                                continue
450
                        }
451
                }
452
 
453
                // Found end of a run with common leading regexp:
454
                // sub[start:i] all begin with first but sub[i] does not.
455
                //
456
                // Factor out common regexp and append factored expression to out.
457
                if i == start {
458
                        // Nothing to do - run of length 0.
459
                } else if i == start+1 {
460
                        // Just one: don't bother factoring.
461
                        out = append(out, sub[start])
462
                } else {
463
                        // Construct factored form: prefix(suffix1|suffix2|...)
464
                        prefix := first
465
                        for j := start; j < i; j++ {
466
                                reuse := j != start // prefix came from sub[start]
467
                                sub[j] = p.removeLeadingRegexp(sub[j], reuse)
468
                        }
469
                        suffix := p.collapse(sub[start:i], OpAlternate) // recurse
470
 
471
                        re := p.newRegexp(OpConcat)
472
                        re.Sub = append(re.Sub[:0], prefix, suffix)
473
                        out = append(out, re)
474
                }
475
 
476
                // Prepare for next iteration.
477
                start = i
478
                first = ifirst
479
        }
480
        sub = out
481
 
482
        // Round 3: Collapse runs of single literals into character classes.
483
        start = 0
484
        out = sub[:0]
485
        for i := 0; i <= len(sub); i++ {
486
                // Invariant: the Regexps that were in sub[0:start] have been
487
                // used or marked for reuse, and the slice space has been reused
488
                // for out (len(out) <= start).
489
                //
490
                // Invariant: sub[start:i] consists of regexps that are either
491
                // literal runes or character classes.
492
                if i < len(sub) && isCharClass(sub[i]) {
493
                        continue
494
                }
495
 
496
                // sub[i] is not a char or char class;
497
                // emit char class for sub[start:i]...
498
                if i == start {
499
                        // Nothing to do - run of length 0.
500
                } else if i == start+1 {
501
                        out = append(out, sub[start])
502
                } else {
503
                        // Make new char class.
504
                        // Start with most complex regexp in sub[start].
505
                        max := start
506
                        for j := start + 1; j < i; j++ {
507
                                if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
508
                                        max = j
509
                                }
510
                        }
511
                        sub[start], sub[max] = sub[max], sub[start]
512
 
513
                        for j := start + 1; j < i; j++ {
514
                                mergeCharClass(sub[start], sub[j])
515
                                p.reuse(sub[j])
516
                        }
517
                        cleanAlt(sub[start])
518
                        out = append(out, sub[start])
519
                }
520
 
521
                // ... and then emit sub[i].
522
                if i < len(sub) {
523
                        out = append(out, sub[i])
524
                }
525
                start = i + 1
526
        }
527
        sub = out
528
 
529
        // Round 4: Collapse runs of empty matches into a single empty match.
530
        start = 0
531
        out = sub[:0]
532
        for i := range sub {
533
                if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
534
                        continue
535
                }
536
                out = append(out, sub[i])
537
        }
538
        sub = out
539
 
540
        return sub
541
}
542
 
543
// leadingString returns the leading literal string that re begins with.
544
// The string refers to storage in re or its children.
545
func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
546
        if re.Op == OpConcat && len(re.Sub) > 0 {
547
                re = re.Sub[0]
548
        }
549
        if re.Op != OpLiteral {
550
                return nil, 0
551
        }
552
        return re.Rune, re.Flags & FoldCase
553
}
554
 
555
// removeLeadingString removes the first n leading runes
556
// from the beginning of re.  It returns the replacement for re.
557
func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
558
        if re.Op == OpConcat && len(re.Sub) > 0 {
559
                // Removing a leading string in a concatenation
560
                // might simplify the concatenation.
561
                sub := re.Sub[0]
562
                sub = p.removeLeadingString(sub, n)
563
                re.Sub[0] = sub
564
                if sub.Op == OpEmptyMatch {
565
                        p.reuse(sub)
566
                        switch len(re.Sub) {
567
                        case 0, 1:
568
                                // Impossible but handle.
569
                                re.Op = OpEmptyMatch
570
                                re.Sub = nil
571
                        case 2:
572
                                old := re
573
                                re = re.Sub[1]
574
                                p.reuse(old)
575
                        default:
576
                                copy(re.Sub, re.Sub[1:])
577
                                re.Sub = re.Sub[:len(re.Sub)-1]
578
                        }
579
                }
580
                return re
581
        }
582
 
583
        if re.Op == OpLiteral {
584
                re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
585
                if len(re.Rune) == 0 {
586
                        re.Op = OpEmptyMatch
587
                }
588
        }
589
        return re
590
}
591
 
592
// leadingRegexp returns the leading regexp that re begins with.
593
// The regexp refers to storage in re or its children.
594
func (p *parser) leadingRegexp(re *Regexp) *Regexp {
595
        if re.Op == OpEmptyMatch {
596
                return nil
597
        }
598
        if re.Op == OpConcat && len(re.Sub) > 0 {
599
                sub := re.Sub[0]
600
                if sub.Op == OpEmptyMatch {
601
                        return nil
602
                }
603
                return sub
604
        }
605
        return re
606
}
607
 
608
// removeLeadingRegexp removes the leading regexp in re.
609
// It returns the replacement for re.
610
// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
611
func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
612
        if re.Op == OpConcat && len(re.Sub) > 0 {
613
                if reuse {
614
                        p.reuse(re.Sub[0])
615
                }
616
                re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
617
                switch len(re.Sub) {
618
                case 0:
619
                        re.Op = OpEmptyMatch
620
                        re.Sub = nil
621
                case 1:
622
                        old := re
623
                        re = re.Sub[0]
624
                        p.reuse(old)
625
                }
626
                return re
627
        }
628
        if reuse {
629
                p.reuse(re)
630
        }
631
        return p.newRegexp(OpEmptyMatch)
632
}
633
 
634
func literalRegexp(s string, flags Flags) *Regexp {
635
        re := &Regexp{Op: OpLiteral}
636
        re.Flags = flags
637
        re.Rune = re.Rune0[:0] // use local storage for small strings
638
        for _, c := range s {
639
                if len(re.Rune) >= cap(re.Rune) {
640
                        // string is too long to fit in Rune0.  let Go handle it
641
                        re.Rune = []rune(s)
642
                        break
643
                }
644
                re.Rune = append(re.Rune, c)
645
        }
646
        return re
647
}
648
 
649
// Parsing.
650
 
651
func Parse(s string, flags Flags) (*Regexp, error) {
652
        if flags&Literal != 0 {
653
                // Trivial parser for literal string.
654
                if err := checkUTF8(s); err != nil {
655
                        return nil, err
656
                }
657
                return literalRegexp(s, flags), nil
658
        }
659
 
660
        // Otherwise, must do real work.
661
        var (
662
                p          parser
663
                err        error
664
                c          rune
665
                op         Op
666
                lastRepeat string
667
                min, max   int
668
        )
669
        p.flags = flags
670
        p.wholeRegexp = s
671
        t := s
672
        for t != "" {
673
                repeat := ""
674
        BigSwitch:
675
                switch t[0] {
676
                default:
677
                        if c, t, err = nextRune(t); err != nil {
678
                                return nil, err
679
                        }
680
                        p.literal(c)
681
 
682
                case '(':
683
                        if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
684
                                // Flag changes and non-capturing groups.
685
                                if t, err = p.parsePerlFlags(t); err != nil {
686
                                        return nil, err
687
                                }
688
                                break
689
                        }
690
                        p.numCap++
691
                        p.op(opLeftParen).Cap = p.numCap
692
                        t = t[1:]
693
                case '|':
694
                        if err = p.parseVerticalBar(); err != nil {
695
                                return nil, err
696
                        }
697
                        t = t[1:]
698
                case ')':
699
                        if err = p.parseRightParen(); err != nil {
700
                                return nil, err
701
                        }
702
                        t = t[1:]
703
                case '^':
704
                        if p.flags&OneLine != 0 {
705
                                p.op(OpBeginText)
706
                        } else {
707
                                p.op(OpBeginLine)
708
                        }
709
                        t = t[1:]
710
                case '$':
711
                        if p.flags&OneLine != 0 {
712
                                p.op(OpEndText).Flags |= WasDollar
713
                        } else {
714
                                p.op(OpEndLine)
715
                        }
716
                        t = t[1:]
717
                case '.':
718
                        if p.flags&DotNL != 0 {
719
                                p.op(OpAnyChar)
720
                        } else {
721
                                p.op(OpAnyCharNotNL)
722
                        }
723
                        t = t[1:]
724
                case '[':
725
                        if t, err = p.parseClass(t); err != nil {
726
                                return nil, err
727
                        }
728
                case '*', '+', '?':
729
                        before := t
730
                        switch t[0] {
731
                        case '*':
732
                                op = OpStar
733
                        case '+':
734
                                op = OpPlus
735
                        case '?':
736
                                op = OpQuest
737
                        }
738
                        after := t[1:]
739
                        if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
740
                                return nil, err
741
                        }
742
                        repeat = before
743
                        t = after
744
                case '{':
745
                        op = OpRepeat
746
                        before := t
747
                        min, max, after, ok := p.parseRepeat(t)
748
                        if !ok {
749
                                // If the repeat cannot be parsed, { is a literal.
750
                                p.literal('{')
751
                                t = t[1:]
752
                                break
753
                        }
754
                        if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
755
                                // Numbers were too big, or max is present and min > max.
756
                                return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
757
                        }
758
                        if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
759
                                return nil, err
760
                        }
761
                        repeat = before
762
                        t = after
763
                case '\\':
764
                        if p.flags&PerlX != 0 && len(t) >= 2 {
765
                                switch t[1] {
766
                                case 'A':
767
                                        p.op(OpBeginText)
768
                                        t = t[2:]
769
                                        break BigSwitch
770
                                case 'b':
771
                                        p.op(OpWordBoundary)
772
                                        t = t[2:]
773
                                        break BigSwitch
774
                                case 'B':
775
                                        p.op(OpNoWordBoundary)
776
                                        t = t[2:]
777
                                        break BigSwitch
778
                                case 'C':
779
                                        // any byte; not supported
780
                                        return nil, &Error{ErrInvalidEscape, t[:2]}
781
                                case 'Q':
782
                                        // \Q ... \E: the ... is always literals
783
                                        var lit string
784
                                        if i := strings.Index(t, `\E`); i < 0 {
785
                                                lit = t[2:]
786
                                                t = ""
787
                                        } else {
788
                                                lit = t[2:i]
789
                                                t = t[i+2:]
790
                                        }
791
                                        p.push(literalRegexp(lit, p.flags))
792
                                        break BigSwitch
793
                                case 'z':
794
                                        p.op(OpEndText)
795
                                        t = t[2:]
796
                                        break BigSwitch
797
                                }
798
                        }
799
 
800
                        re := p.newRegexp(OpCharClass)
801
                        re.Flags = p.flags
802
 
803
                        // Look for Unicode character group like \p{Han}
804
                        if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
805
                                r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
806
                                if err != nil {
807
                                        return nil, err
808
                                }
809
                                if r != nil {
810
                                        re.Rune = r
811
                                        t = rest
812
                                        p.push(re)
813
                                        break BigSwitch
814
                                }
815
                        }
816
 
817
                        // Perl character class escape.
818
                        if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
819
                                re.Rune = r
820
                                t = rest
821
                                p.push(re)
822
                                break BigSwitch
823
                        }
824
                        p.reuse(re)
825
 
826
                        // Ordinary single-character escape.
827
                        if c, t, err = p.parseEscape(t); err != nil {
828
                                return nil, err
829
                        }
830
                        p.literal(c)
831
                }
832
                lastRepeat = repeat
833
        }
834
 
835
        p.concat()
836
        if p.swapVerticalBar() {
837
                // pop vertical bar
838
                p.stack = p.stack[:len(p.stack)-1]
839
        }
840
        p.alternate()
841
 
842
        n := len(p.stack)
843
        if n != 1 {
844
                return nil, &Error{ErrMissingParen, s}
845
        }
846
        return p.stack[0], nil
847
}
848
 
849
// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
850
// If s is not of that form, it returns ok == false.
851
// If s has the right form but the values are too big, it returns min == -1, ok == true.
852
func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
853
        if s == "" || s[0] != '{' {
854
                return
855
        }
856
        s = s[1:]
857
        var ok1 bool
858
        if min, s, ok1 = p.parseInt(s); !ok1 {
859
                return
860
        }
861
        if s == "" {
862
                return
863
        }
864
        if s[0] != ',' {
865
                max = min
866
        } else {
867
                s = s[1:]
868
                if s == "" {
869
                        return
870
                }
871
                if s[0] == '}' {
872
                        max = -1
873
                } else if max, s, ok1 = p.parseInt(s); !ok1 {
874
                        return
875
                } else if max < 0 {
876
                        // parseInt found too big a number
877
                        min = -1
878
                }
879
        }
880
        if s == "" || s[0] != '}' {
881
                return
882
        }
883
        rest = s[1:]
884
        ok = true
885
        return
886
}
887
 
888
// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
889
// like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state.
890
// The caller must have ensured that s begins with "(?".
891
func (p *parser) parsePerlFlags(s string) (rest string, err error) {
892
        t := s
893
 
894
        // Check for named captures, first introduced in Python's regexp library.
895
        // As usual, there are three slightly different syntaxes:
896
        //
897
        //   (?Pexpr)   the original, introduced by Python
898
        //   (?expr)    the .NET alteration, adopted by Perl 5.10
899
        //   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
900
        //
901
        // Perl 5.10 gave in and implemented the Python version too,
902
        // but they claim that the last two are the preferred forms.
903
        // PCRE and languages based on it (specifically, PHP and Ruby)
904
        // support all three as well.  EcmaScript 4 uses only the Python form.
905
        //
906
        // In both the open source world (via Code Search) and the
907
        // Google source tree, (?Pname) is the dominant form,
908
        // so that's the one we implement.  One is enough.
909
        if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
910
                // Pull out name.
911
                end := strings.IndexRune(t, '>')
912
                if end < 0 {
913
                        if err = checkUTF8(t); err != nil {
914
                                return "", err
915
                        }
916
                        return "", &Error{ErrInvalidNamedCapture, s}
917
                }
918
 
919
                capture := t[:end+1] // "(?P"
920
                name := t[4:end]     // "name"
921
                if err = checkUTF8(name); err != nil {
922
                        return "", err
923
                }
924
                if !isValidCaptureName(name) {
925
                        return "", &Error{ErrInvalidNamedCapture, capture}
926
                }
927
 
928
                // Like ordinary capture, but named.
929
                p.numCap++
930
                re := p.op(opLeftParen)
931
                re.Cap = p.numCap
932
                re.Name = name
933
                return t[end+1:], nil
934
        }
935
 
936
        // Non-capturing group.  Might also twiddle Perl flags.
937
        var c rune
938
        t = t[2:] // skip (?
939
        flags := p.flags
940
        sign := +1
941
        sawFlag := false
942
Loop:
943
        for t != "" {
944
                if c, t, err = nextRune(t); err != nil {
945
                        return "", err
946
                }
947
                switch c {
948
                default:
949
                        break Loop
950
 
951
                // Flags.
952
                case 'i':
953
                        flags |= FoldCase
954
                        sawFlag = true
955
                case 'm':
956
                        flags &^= OneLine
957
                        sawFlag = true
958
                case 's':
959
                        flags |= DotNL
960
                        sawFlag = true
961
                case 'U':
962
                        flags |= NonGreedy
963
                        sawFlag = true
964
 
965
                // Switch to negation.
966
                case '-':
967
                        if sign < 0 {
968
                                break Loop
969
                        }
970
                        sign = -1
971
                        // Invert flags so that | above turn into &^ and vice versa.
972
                        // We'll invert flags again before using it below.
973
                        flags = ^flags
974
                        sawFlag = false
975
 
976
                // End of flags, starting group or not.
977
                case ':', ')':
978
                        if sign < 0 {
979
                                if !sawFlag {
980
                                        break Loop
981
                                }
982
                                flags = ^flags
983
                        }
984
                        if c == ':' {
985
                                // Open new group
986
                                p.op(opLeftParen)
987
                        }
988
                        p.flags = flags
989
                        return t, nil
990
                }
991
        }
992
 
993
        return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
994
}
995
 
996
// isValidCaptureName reports whether name
997
// is a valid capture name: [A-Za-z0-9_]+.
998
// PCRE limits names to 32 bytes.
999
// Python rejects names starting with digits.
1000
// We don't enforce either of those.
1001
func isValidCaptureName(name string) bool {
1002
        if name == "" {
1003
                return false
1004
        }
1005
        for _, c := range name {
1006
                if c != '_' && !isalnum(c) {
1007
                        return false
1008
                }
1009
        }
1010
        return true
1011
}
1012
 
1013
// parseInt parses a decimal integer.
1014
func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1015
        if s == "" || s[0] < '0' || '9' < s[0] {
1016
                return
1017
        }
1018
        // Disallow leading zeros.
1019
        if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1020
                return
1021
        }
1022
        t := s
1023
        for s != "" && '0' <= s[0] && s[0] <= '9' {
1024
                s = s[1:]
1025
        }
1026
        rest = s
1027
        ok = true
1028
        // Have digits, compute value.
1029
        t = t[:len(t)-len(s)]
1030
        for i := 0; i < len(t); i++ {
1031
                // Avoid overflow.
1032
                if n >= 1e8 {
1033
                        n = -1
1034
                        break
1035
                }
1036
                n = n*10 + int(t[i]) - '0'
1037
        }
1038
        return
1039
}
1040
 
1041
// can this be represented as a character class?
1042
// single-rune literal string, char class, ., and .|\n.
1043
func isCharClass(re *Regexp) bool {
1044
        return re.Op == OpLiteral && len(re.Rune) == 1 ||
1045
                re.Op == OpCharClass ||
1046
                re.Op == OpAnyCharNotNL ||
1047
                re.Op == OpAnyChar
1048
}
1049
 
1050
// does re match r?
1051
func matchRune(re *Regexp, r rune) bool {
1052
        switch re.Op {
1053
        case OpLiteral:
1054
                return len(re.Rune) == 1 && re.Rune[0] == r
1055
        case OpCharClass:
1056
                for i := 0; i < len(re.Rune); i += 2 {
1057
                        if re.Rune[i] <= r && r <= re.Rune[i+1] {
1058
                                return true
1059
                        }
1060
                }
1061
                return false
1062
        case OpAnyCharNotNL:
1063
                return r != '\n'
1064
        case OpAnyChar:
1065
                return true
1066
        }
1067
        return false
1068
}
1069
 
1070
// parseVerticalBar handles a | in the input.
1071
func (p *parser) parseVerticalBar() error {
1072
        p.concat()
1073
 
1074
        // The concatenation we just parsed is on top of the stack.
1075
        // If it sits above an opVerticalBar, swap it below
1076
        // (things below an opVerticalBar become an alternation).
1077
        // Otherwise, push a new vertical bar.
1078
        if !p.swapVerticalBar() {
1079
                p.op(opVerticalBar)
1080
        }
1081
 
1082
        return nil
1083
}
1084
 
1085
// mergeCharClass makes dst = dst|src.
1086
// The caller must ensure that dst.Op >= src.Op,
1087
// to reduce the amount of copying.
1088
func mergeCharClass(dst, src *Regexp) {
1089
        switch dst.Op {
1090
        case OpAnyChar:
1091
                // src doesn't add anything.
1092
        case OpAnyCharNotNL:
1093
                // src might add \n
1094
                if matchRune(src, '\n') {
1095
                        dst.Op = OpAnyChar
1096
                }
1097
        case OpCharClass:
1098
                // src is simpler, so either literal or char class
1099
                if src.Op == OpLiteral {
1100
                        dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1101
                } else {
1102
                        dst.Rune = appendClass(dst.Rune, src.Rune)
1103
                }
1104
        case OpLiteral:
1105
                // both literal
1106
                if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1107
                        break
1108
                }
1109
                dst.Op = OpCharClass
1110
                dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1111
                dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1112
        }
1113
}
1114
 
1115
// If the top of the stack is an element followed by an opVerticalBar
1116
// swapVerticalBar swaps the two and returns true.
1117
// Otherwise it returns false.
1118
func (p *parser) swapVerticalBar() bool {
1119
        // If above and below vertical bar are literal or char class,
1120
        // can merge into a single char class.
1121
        n := len(p.stack)
1122
        if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1123
                re1 := p.stack[n-1]
1124
                re3 := p.stack[n-3]
1125
                // Make re3 the more complex of the two.
1126
                if re1.Op > re3.Op {
1127
                        re1, re3 = re3, re1
1128
                        p.stack[n-3] = re3
1129
                }
1130
                mergeCharClass(re3, re1)
1131
                p.reuse(re1)
1132
                p.stack = p.stack[:n-1]
1133
                return true
1134
        }
1135
 
1136
        if n >= 2 {
1137
                re1 := p.stack[n-1]
1138
                re2 := p.stack[n-2]
1139
                if re2.Op == opVerticalBar {
1140
                        if n >= 3 {
1141
                                // Now out of reach.
1142
                                // Clean opportunistically.
1143
                                cleanAlt(p.stack[n-3])
1144
                        }
1145
                        p.stack[n-2] = re1
1146
                        p.stack[n-1] = re2
1147
                        return true
1148
                }
1149
        }
1150
        return false
1151
}
1152
 
1153
// parseRightParen handles a ) in the input.
1154
func (p *parser) parseRightParen() error {
1155
        p.concat()
1156
        if p.swapVerticalBar() {
1157
                // pop vertical bar
1158
                p.stack = p.stack[:len(p.stack)-1]
1159
        }
1160
        p.alternate()
1161
 
1162
        n := len(p.stack)
1163
        if n < 2 {
1164
                return &Error{ErrInternalError, ""}
1165
        }
1166
        re1 := p.stack[n-1]
1167
        re2 := p.stack[n-2]
1168
        p.stack = p.stack[:n-2]
1169
        if re2.Op != opLeftParen {
1170
                return &Error{ErrMissingParen, p.wholeRegexp}
1171
        }
1172
        // Restore flags at time of paren.
1173
        p.flags = re2.Flags
1174
        if re2.Cap == 0 {
1175
                // Just for grouping.
1176
                p.push(re1)
1177
        } else {
1178
                re2.Op = OpCapture
1179
                re2.Sub = re2.Sub0[:1]
1180
                re2.Sub[0] = re1
1181
                p.push(re2)
1182
        }
1183
        return nil
1184
}
1185
 
1186
// parseEscape parses an escape sequence at the beginning of s
1187
// and returns the rune.
1188
func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1189
        t := s[1:]
1190
        if t == "" {
1191
                return 0, "", &Error{ErrTrailingBackslash, ""}
1192
        }
1193
        c, t, err := nextRune(t)
1194
        if err != nil {
1195
                return 0, "", err
1196
        }
1197
 
1198
Switch:
1199
        switch c {
1200
        default:
1201
                if c < utf8.RuneSelf && !isalnum(c) {
1202
                        // Escaped non-word characters are always themselves.
1203
                        // PCRE is not quite so rigorous: it accepts things like
1204
                        // \q, but we don't.  We once rejected \_, but too many
1205
                        // programs and people insist on using it, so allow \_.
1206
                        return c, t, nil
1207
                }
1208
 
1209
        // Octal escapes.
1210
        case '1', '2', '3', '4', '5', '6', '7':
1211
                // Single non-zero digit is a backreference; not supported
1212
                if t == "" || t[0] < '0' || t[0] > '7' {
1213
                        break
1214
                }
1215
                fallthrough
1216
        case '0':
1217
                // Consume up to three octal digits; already have one.
1218
                r = c - '0'
1219
                for i := 1; i < 3; i++ {
1220
                        if t == "" || t[0] < '0' || t[0] > '7' {
1221
                                break
1222
                        }
1223
                        r = r*8 + rune(t[0]) - '0'
1224
                        t = t[1:]
1225
                }
1226
                return r, t, nil
1227
 
1228
        // Hexadecimal escapes.
1229
        case 'x':
1230
                if t == "" {
1231
                        break
1232
                }
1233
                if c, t, err = nextRune(t); err != nil {
1234
                        return 0, "", err
1235
                }
1236
                if c == '{' {
1237
                        // Any number of digits in braces.
1238
                        // Perl accepts any text at all; it ignores all text
1239
                        // after the first non-hex digit.  We require only hex digits,
1240
                        // and at least one.
1241
                        nhex := 0
1242
                        r = 0
1243
                        for {
1244
                                if t == "" {
1245
                                        break Switch
1246
                                }
1247
                                if c, t, err = nextRune(t); err != nil {
1248
                                        return 0, "", err
1249
                                }
1250
                                if c == '}' {
1251
                                        break
1252
                                }
1253
                                v := unhex(c)
1254
                                if v < 0 {
1255
                                        break Switch
1256
                                }
1257
                                r = r*16 + v
1258
                                if r > unicode.MaxRune {
1259
                                        break Switch
1260
                                }
1261
                                nhex++
1262
                        }
1263
                        if nhex == 0 {
1264
                                break Switch
1265
                        }
1266
                        return r, t, nil
1267
                }
1268
 
1269
                // Easy case: two hex digits.
1270
                x := unhex(c)
1271
                if c, t, err = nextRune(t); err != nil {
1272
                        return 0, "", err
1273
                }
1274
                y := unhex(c)
1275
                if x < 0 || y < 0 {
1276
                        break
1277
                }
1278
                return x*16 + y, t, nil
1279
 
1280
        // C escapes.  There is no case 'b', to avoid misparsing
1281
        // the Perl word-boundary \b as the C backspace \b
1282
        // when in POSIX mode.  In Perl, /\b/ means word-boundary
1283
        // but /[\b]/ means backspace.  We don't support that.
1284
        // If you want a backspace, embed a literal backspace
1285
        // character or use \x08.
1286
        case 'a':
1287
                return '\a', t, err
1288
        case 'f':
1289
                return '\f', t, err
1290
        case 'n':
1291
                return '\n', t, err
1292
        case 'r':
1293
                return '\r', t, err
1294
        case 't':
1295
                return '\t', t, err
1296
        case 'v':
1297
                return '\v', t, err
1298
        }
1299
        return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1300
}
1301
 
1302
// parseClassChar parses a character class character at the beginning of s
1303
// and returns it.
1304
func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1305
        if s == "" {
1306
                return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1307
        }
1308
 
1309
        // Allow regular escape sequences even though
1310
        // many need not be escaped in this context.
1311
        if s[0] == '\\' {
1312
                return p.parseEscape(s)
1313
        }
1314
 
1315
        return nextRune(s)
1316
}
1317
 
1318
type charGroup struct {
1319
        sign  int
1320
        class []rune
1321
}
1322
 
1323
// parsePerlClassEscape parses a leading Perl character class escape like \d
1324
// from the beginning of s.  If one is present, it appends the characters to r
1325
// and returns the new slice r and the remainder of the string.
1326
func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1327
        if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1328
                return
1329
        }
1330
        g := perlGroup[s[0:2]]
1331
        if g.sign == 0 {
1332
                return
1333
        }
1334
        return p.appendGroup(r, g), s[2:]
1335
}
1336
 
1337
// parseNamedClass parses a leading POSIX named character class like [:alnum:]
1338
// from the beginning of s.  If one is present, it appends the characters to r
1339
// and returns the new slice r and the remainder of the string.
1340
func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1341
        if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1342
                return
1343
        }
1344
 
1345
        i := strings.Index(s[2:], ":]")
1346
        if i < 0 {
1347
                return
1348
        }
1349
        i += 2
1350
        name, s := s[0:i+2], s[i+2:]
1351
        g := posixGroup[name]
1352
        if g.sign == 0 {
1353
                return nil, "", &Error{ErrInvalidCharRange, name}
1354
        }
1355
        return p.appendGroup(r, g), s, nil
1356
}
1357
 
1358
func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1359
        if p.flags&FoldCase == 0 {
1360
                if g.sign < 0 {
1361
                        r = appendNegatedClass(r, g.class)
1362
                } else {
1363
                        r = appendClass(r, g.class)
1364
                }
1365
        } else {
1366
                tmp := p.tmpClass[:0]
1367
                tmp = appendFoldedClass(tmp, g.class)
1368
                p.tmpClass = tmp
1369
                tmp = cleanClass(&p.tmpClass)
1370
                if g.sign < 0 {
1371
                        r = appendNegatedClass(r, tmp)
1372
                } else {
1373
                        r = appendClass(r, tmp)
1374
                }
1375
        }
1376
        return r
1377
}
1378
 
1379
var anyTable = &unicode.RangeTable{
1380
        R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
1381
        R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
1382
}
1383
 
1384
// unicodeTable returns the unicode.RangeTable identified by name
1385
// and the table of additional fold-equivalent code points.
1386
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1387
        // Special case: "Any" means any.
1388
        if name == "Any" {
1389
                return anyTable, anyTable
1390
        }
1391
        if t := unicode.Categories[name]; t != nil {
1392
                return t, unicode.FoldCategory[name]
1393
        }
1394
        if t := unicode.Scripts[name]; t != nil {
1395
                return t, unicode.FoldScript[name]
1396
        }
1397
        return nil, nil
1398
}
1399
 
1400
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
1401
// from the beginning of s.  If one is present, it appends the characters to r
1402
// and returns the new slice r and the remainder of the string.
1403
func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1404
        if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1405
                return
1406
        }
1407
 
1408
        // Committed to parse or return error.
1409
        sign := +1
1410
        if s[1] == 'P' {
1411
                sign = -1
1412
        }
1413
        t := s[2:]
1414
        c, t, err := nextRune(t)
1415
        if err != nil {
1416
                return
1417
        }
1418
        var seq, name string
1419
        if c != '{' {
1420
                // Single-letter name.
1421
                seq = s[:len(s)-len(t)]
1422
                name = seq[2:]
1423
        } else {
1424
                // Name is in braces.
1425
                end := strings.IndexRune(s, '}')
1426
                if end < 0 {
1427
                        if err = checkUTF8(s); err != nil {
1428
                                return
1429
                        }
1430
                        return nil, "", &Error{ErrInvalidCharRange, s}
1431
                }
1432
                seq, t = s[:end+1], s[end+1:]
1433
                name = s[3:end]
1434
                if err = checkUTF8(name); err != nil {
1435
                        return
1436
                }
1437
        }
1438
 
1439
        // Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1440
        if name != "" && name[0] == '^' {
1441
                sign = -sign
1442
                name = name[1:]
1443
        }
1444
 
1445
        tab, fold := unicodeTable(name)
1446
        if tab == nil {
1447
                return nil, "", &Error{ErrInvalidCharRange, seq}
1448
        }
1449
 
1450
        if p.flags&FoldCase == 0 || fold == nil {
1451
                if sign > 0 {
1452
                        r = appendTable(r, tab)
1453
                } else {
1454
                        r = appendNegatedTable(r, tab)
1455
                }
1456
        } else {
1457
                // Merge and clean tab and fold in a temporary buffer.
1458
                // This is necessary for the negative case and just tidy
1459
                // for the positive case.
1460
                tmp := p.tmpClass[:0]
1461
                tmp = appendTable(tmp, tab)
1462
                tmp = appendTable(tmp, fold)
1463
                p.tmpClass = tmp
1464
                tmp = cleanClass(&p.tmpClass)
1465
                if sign > 0 {
1466
                        r = appendClass(r, tmp)
1467
                } else {
1468
                        r = appendNegatedClass(r, tmp)
1469
                }
1470
        }
1471
        return r, t, nil
1472
}
1473
 
1474
// parseClass parses a character class at the beginning of s
1475
// and pushes it onto the parse stack.
1476
func (p *parser) parseClass(s string) (rest string, err error) {
1477
        t := s[1:] // chop [
1478
        re := p.newRegexp(OpCharClass)
1479
        re.Flags = p.flags
1480
        re.Rune = re.Rune0[:0]
1481
 
1482
        sign := +1
1483
        if t != "" && t[0] == '^' {
1484
                sign = -1
1485
                t = t[1:]
1486
 
1487
                // If character class does not match \n, add it here,
1488
                // so that negation later will do the right thing.
1489
                if p.flags&ClassNL == 0 {
1490
                        re.Rune = append(re.Rune, '\n', '\n')
1491
                }
1492
        }
1493
 
1494
        class := re.Rune
1495
        first := true // ] and - are okay as first char in class
1496
        for t == "" || t[0] != ']' || first {
1497
                // POSIX: - is only okay unescaped as first or last in class.
1498
                // Perl: - is okay anywhere.
1499
                if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1500
                        _, size := utf8.DecodeRuneInString(t[1:])
1501
                        return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1502
                }
1503
                first = false
1504
 
1505
                // Look for POSIX [:alnum:] etc.
1506
                if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1507
                        nclass, nt, err := p.parseNamedClass(t, class)
1508
                        if err != nil {
1509
                                return "", err
1510
                        }
1511
                        if nclass != nil {
1512
                                class, t = nclass, nt
1513
                                continue
1514
                        }
1515
                }
1516
 
1517
                // Look for Unicode character group like \p{Han}.
1518
                nclass, nt, err := p.parseUnicodeClass(t, class)
1519
                if err != nil {
1520
                        return "", err
1521
                }
1522
                if nclass != nil {
1523
                        class, t = nclass, nt
1524
                        continue
1525
                }
1526
 
1527
                // Look for Perl character class symbols (extension).
1528
                if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1529
                        class, t = nclass, nt
1530
                        continue
1531
                }
1532
 
1533
                // Single character or simple range.
1534
                rng := t
1535
                var lo, hi rune
1536
                if lo, t, err = p.parseClassChar(t, s); err != nil {
1537
                        return "", err
1538
                }
1539
                hi = lo
1540
                // [a-] means (a|-) so check for final ].
1541
                if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1542
                        t = t[1:]
1543
                        if hi, t, err = p.parseClassChar(t, s); err != nil {
1544
                                return "", err
1545
                        }
1546
                        if hi < lo {
1547
                                rng = rng[:len(rng)-len(t)]
1548
                                return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1549
                        }
1550
                }
1551
                if p.flags&FoldCase == 0 {
1552
                        class = AppendRange(class, lo, hi)
1553
                } else {
1554
                        class = appendFoldedRange(class, lo, hi)
1555
                }
1556
        }
1557
        t = t[1:] // chop ]
1558
 
1559
        // Use &re.Rune instead of &class to avoid allocation.
1560
        re.Rune = class
1561
        class = cleanClass(&re.Rune)
1562
        if sign < 0 {
1563
                class = negateClass(class)
1564
        }
1565
        re.Rune = class
1566
        p.push(re)
1567
        return t, nil
1568
}
1569
 
1570
// cleanClass sorts the ranges (pairs of elements of r),
1571
// merges them, and eliminates duplicates.
1572
func cleanClass(rp *[]rune) []rune {
1573
 
1574
        // Sort by lo increasing, hi decreasing to break ties.
1575
        sort.Sort(ranges{rp})
1576
 
1577
        r := *rp
1578
        if len(r) < 2 {
1579
                return r
1580
        }
1581
 
1582
        // Merge abutting, overlapping.
1583
        w := 2 // write index
1584
        for i := 2; i < len(r); i += 2 {
1585
                lo, hi := r[i], r[i+1]
1586
                if lo <= r[w-1]+1 {
1587
                        // merge with previous range
1588
                        if hi > r[w-1] {
1589
                                r[w-1] = hi
1590
                        }
1591
                        continue
1592
                }
1593
                // new disjoint range
1594
                r[w] = lo
1595
                r[w+1] = hi
1596
                w += 2
1597
        }
1598
 
1599
        return r[:w]
1600
}
1601
 
1602
// appendLiteral returns the result of appending the literal x to the class r.
1603
func appendLiteral(r []rune, x rune, flags Flags) []rune {
1604
        if flags&FoldCase != 0 {
1605
                return appendFoldedRange(r, x, x)
1606
        }
1607
        return AppendRange(r, x, x)
1608
}
1609
 
1610
// appendRange returns the result of appending the range lo-hi to the class r.
1611
func AppendRange(r []rune, lo, hi rune) []rune {
1612
        // Expand last range or next to last range if it overlaps or abuts.
1613
        // Checking two ranges helps when appending case-folded
1614
        // alphabets, so that one range can be expanding A-Z and the
1615
        // other expanding a-z.
1616
        n := len(r)
1617
        for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
1618
                if n >= i {
1619
                        rlo, rhi := r[n-i], r[n-i+1]
1620
                        if lo <= rhi+1 && rlo <= hi+1 {
1621
                                if lo < rlo {
1622
                                        r[n-i] = lo
1623
                                }
1624
                                if hi > rhi {
1625
                                        r[n-i+1] = hi
1626
                                }
1627
                                return r
1628
                        }
1629
                }
1630
        }
1631
 
1632
        return append(r, lo, hi)
1633
}
1634
 
1635
const (
1636
        // minimum and maximum runes involved in folding.
1637
        // checked during test.
1638
        MinFold = 0x0041
1639
        MaxFold = 0x1044f
1640
)
1641
 
1642
// appendFoldedRange returns the result of appending the range lo-hi
1643
// and its case folding-equivalent runes to the class r.
1644
func appendFoldedRange(r []rune, lo, hi rune) []rune {
1645
        // Optimizations.
1646
        if lo <= MinFold && hi >= MaxFold {
1647
                // Range is full: folding can't add more.
1648
                return AppendRange(r, lo, hi)
1649
        }
1650
        if hi < MinFold || lo > MaxFold {
1651
                // Range is outside folding possibilities.
1652
                return AppendRange(r, lo, hi)
1653
        }
1654
        if lo < MinFold {
1655
                // [lo, MinFold-1] needs no folding.
1656
                r = AppendRange(r, lo, MinFold-1)
1657
                lo = MinFold
1658
        }
1659
        if hi > MaxFold {
1660
                // [MaxFold+1, hi] needs no folding.
1661
                r = AppendRange(r, MaxFold+1, hi)
1662
                hi = MaxFold
1663
        }
1664
 
1665
        // Brute force.  Depend on AppendRange to coalesce ranges on the fly.
1666
        for c := lo; c <= hi; c++ {
1667
                r = AppendRange(r, c, c)
1668
                f := unicode.SimpleFold(c)
1669
                for f != c {
1670
                        r = AppendRange(r, f, f)
1671
                        f = unicode.SimpleFold(f)
1672
                }
1673
        }
1674
        return r
1675
}
1676
 
1677
// appendClass returns the result of appending the class x to the class r.
1678
// It assume x is clean.
1679
func appendClass(r []rune, x []rune) []rune {
1680
        for i := 0; i < len(x); i += 2 {
1681
                r = AppendRange(r, x[i], x[i+1])
1682
        }
1683
        return r
1684
}
1685
 
1686
// appendFolded returns the result of appending the case folding of the class x to the class r.
1687
func appendFoldedClass(r []rune, x []rune) []rune {
1688
        for i := 0; i < len(x); i += 2 {
1689
                r = appendFoldedRange(r, x[i], x[i+1])
1690
        }
1691
        return r
1692
}
1693
 
1694
// appendNegatedClass returns the result of appending the negation of the class x to the class r.
1695
// It assumes x is clean.
1696
func appendNegatedClass(r []rune, x []rune) []rune {
1697
        nextLo := '\u0000'
1698
        for i := 0; i < len(x); i += 2 {
1699
                lo, hi := x[i], x[i+1]
1700
                if nextLo <= lo-1 {
1701
                        r = AppendRange(r, nextLo, lo-1)
1702
                }
1703
                nextLo = hi + 1
1704
        }
1705
        if nextLo <= unicode.MaxRune {
1706
                r = AppendRange(r, nextLo, unicode.MaxRune)
1707
        }
1708
        return r
1709
}
1710
 
1711
// appendTable returns the result of appending x to the class r.
1712
func appendTable(r []rune, x *unicode.RangeTable) []rune {
1713
        for _, xr := range x.R16 {
1714
                lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1715
                if stride == 1 {
1716
                        r = AppendRange(r, lo, hi)
1717
                        continue
1718
                }
1719
                for c := lo; c <= hi; c += stride {
1720
                        r = AppendRange(r, c, c)
1721
                }
1722
        }
1723
        for _, xr := range x.R32 {
1724
                lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1725
                if stride == 1 {
1726
                        r = AppendRange(r, lo, hi)
1727
                        continue
1728
                }
1729
                for c := lo; c <= hi; c += stride {
1730
                        r = AppendRange(r, c, c)
1731
                }
1732
        }
1733
        return r
1734
}
1735
 
1736
// appendNegatedTable returns the result of appending the negation of x to the class r.
1737
func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
1738
        nextLo := '\u0000' // lo end of next class to add
1739
        for _, xr := range x.R16 {
1740
                lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1741
                if stride == 1 {
1742
                        if nextLo <= lo-1 {
1743
                                r = AppendRange(r, nextLo, lo-1)
1744
                        }
1745
                        nextLo = hi + 1
1746
                        continue
1747
                }
1748
                for c := lo; c <= hi; c += stride {
1749
                        if nextLo <= c-1 {
1750
                                r = AppendRange(r, nextLo, c-1)
1751
                        }
1752
                        nextLo = c + 1
1753
                }
1754
        }
1755
        for _, xr := range x.R32 {
1756
                lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1757
                if stride == 1 {
1758
                        if nextLo <= lo-1 {
1759
                                r = AppendRange(r, nextLo, lo-1)
1760
                        }
1761
                        nextLo = hi + 1
1762
                        continue
1763
                }
1764
                for c := lo; c <= hi; c += stride {
1765
                        if nextLo <= c-1 {
1766
                                r = AppendRange(r, nextLo, c-1)
1767
                        }
1768
                        nextLo = c + 1
1769
                }
1770
        }
1771
        if nextLo <= unicode.MaxRune {
1772
                r = AppendRange(r, nextLo, unicode.MaxRune)
1773
        }
1774
        return r
1775
}
1776
 
1777
// negateClass overwrites r and returns r's negation.
1778
// It assumes the class r is already clean.
1779
func negateClass(r []rune) []rune {
1780
        nextLo := '\u0000' // lo end of next class to add
1781
        w := 0             // write index
1782
        for i := 0; i < len(r); i += 2 {
1783
                lo, hi := r[i], r[i+1]
1784
                if nextLo <= lo-1 {
1785
                        r[w] = nextLo
1786
                        r[w+1] = lo - 1
1787
                        w += 2
1788
                }
1789
                nextLo = hi + 1
1790
        }
1791
        r = r[:w]
1792
        if nextLo <= unicode.MaxRune {
1793
                // It's possible for the negation to have one more
1794
                // range - this one - than the original class, so use append.
1795
                r = append(r, nextLo, unicode.MaxRune)
1796
        }
1797
        return r
1798
}
1799
 
1800
// ranges implements sort.Interface on a []rune.
1801
// The choice of receiver type definition is strange
1802
// but avoids an allocation since we already have
1803
// a *[]rune.
1804
type ranges struct {
1805
        p *[]rune
1806
}
1807
 
1808
func (ra ranges) Less(i, j int) bool {
1809
        p := *ra.p
1810
        i *= 2
1811
        j *= 2
1812
        return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1813
}
1814
 
1815
func (ra ranges) Len() int {
1816
        return len(*ra.p) / 2
1817
}
1818
 
1819
func (ra ranges) Swap(i, j int) {
1820
        p := *ra.p
1821
        i *= 2
1822
        j *= 2
1823
        p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1824
}
1825
 
1826
func checkUTF8(s string) error {
1827
        for s != "" {
1828
                rune, size := utf8.DecodeRuneInString(s)
1829
                if rune == utf8.RuneError && size == 1 {
1830
                        return &Error{Code: ErrInvalidUTF8, Expr: s}
1831
                }
1832
                s = s[size:]
1833
        }
1834
        return nil
1835
}
1836
 
1837
func nextRune(s string) (c rune, t string, err error) {
1838
        c, size := utf8.DecodeRuneInString(s)
1839
        if c == utf8.RuneError && size == 1 {
1840
                return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1841
        }
1842
        return c, s[size:], nil
1843
}
1844
 
1845
func isalnum(c rune) bool {
1846
        return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1847
}
1848
 
1849
func unhex(c rune) rune {
1850
        if '0' <= c && c <= '9' {
1851
                return c - '0'
1852
        }
1853
        if 'a' <= c && c <= 'f' {
1854
                return c - 'a' + 10
1855
        }
1856
        if 'A' <= c && c <= 'F' {
1857
                return c - 'A' + 10
1858
        }
1859
        return -1
1860
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.