URL
https://opencores.org/ocsvn/openrisc/openrisc/trunk
Subversion Repositories openrisc
[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [regexp/] [syntax/] [parse.go] - Rev 747
Compare with Previous | Blame | View Log
// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package syntaximport ("sort""strings""unicode""unicode/utf8")// An Error describes a failure to parse a regular expression// and gives the offending expression.type Error struct {Code ErrorCodeExpr string}func (e *Error) Error() string {return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"}// An ErrorCode describes a failure to parse a regular expression.type ErrorCode stringconst (// Unexpected errorErrInternalError ErrorCode = "regexp/syntax: internal error"// Parse errorsErrInvalidCharClass ErrorCode = "invalid character class"ErrInvalidCharRange ErrorCode = "invalid character class range"ErrInvalidEscape ErrorCode = "invalid escape sequence"ErrInvalidNamedCapture ErrorCode = "invalid named capture"ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"ErrInvalidRepeatSize ErrorCode = "invalid repeat count"ErrInvalidUTF8 ErrorCode = "invalid UTF-8"ErrMissingBracket ErrorCode = "missing closing ]"ErrMissingParen ErrorCode = "missing closing )"ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression")func (e ErrorCode) String() string {return string(e)}// Flags control the behavior of the parser and record information about regexp context.type Flags uint16const (FoldCase Flags = 1 << iota // case-insensitive matchLiteral // treat pattern as literal stringClassNL // allow character classes like [^a-z] and [[:space:]] to match newlineDotNL // allow . to match newlineOneLine // treat ^ and $ as only matching at beginning and end of textNonGreedy // make repetition operators default to non-greedyPerlX // allow Perl extensionsUnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negationWasDollar // regexp OpEndText was $, not \zSimple // regexp contains no counted repetitionMatchNL = ClassNL | DotNLPerl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possiblePOSIX Flags = 0 // POSIX syntax)// Pseudo-ops for parsing stack.const (opLeftParen = opPseudo + iotaopVerticalBar)type parser struct {flags Flags // parse mode flagsstack []*Regexp // stack of parsed expressionsfree *RegexpnumCap int // number of capturing groups seenwholeRegexp stringtmpClass []rune // temporary char class work space}func (p *parser) newRegexp(op Op) *Regexp {re := p.freeif re != nil {p.free = re.Sub0[0]*re = Regexp{}} else {re = new(Regexp)}re.Op = opreturn re}func (p *parser) reuse(re *Regexp) {re.Sub0[0] = p.freep.free = re}// Parse stack manipulation.// push pushes the regexp re onto the parse stack and returns the regexp.func (p *parser) push(re *Regexp) *Regexp {if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {// Single rune.if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {return nil}re.Op = OpLiteralre.Rune = re.Rune[:1]re.Flags = p.flags &^ FoldCase} else if re.Op == OpCharClass && len(re.Rune) == 4 &&re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||re.Op == OpCharClass && len(re.Rune) == 2 &&re.Rune[0]+1 == re.Rune[1] &&unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {// Case-insensitive rune like [Aa] or [Δδ].if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {return nil}// Rewrite as (case-insensitive) literal.re.Op = OpLiteralre.Rune = re.Rune[:1]re.Flags = p.flags | FoldCase} else {// Incremental concatenation.p.maybeConcat(-1, 0)}p.stack = append(p.stack, re)return re}// maybeConcat implements incremental concatenation// of literal runes into string nodes. The parser calls this// before each push, so only the top fragment of the stack// might need processing. Since this is called before a push,// the topmost literal is no longer subject to operators like *// (Otherwise ab* would turn into (ab)*.)// If r >= 0 and there's a node left over, maybeConcat uses it// to push r with the given flags.// maybeConcat reports whether r was pushed.func (p *parser) maybeConcat(r rune, flags Flags) bool {n := len(p.stack)if n < 2 {return false}re1 := p.stack[n-1]re2 := p.stack[n-2]if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {return false}// Push re1 into re2.re2.Rune = append(re2.Rune, re1.Rune...)// Reuse re1 if possible.if r >= 0 {re1.Rune = re1.Rune0[:1]re1.Rune[0] = rre1.Flags = flagsreturn true}p.stack = p.stack[:n-1]p.reuse(re1)return false // did not push r}// newLiteral returns a new OpLiteral Regexp with the given flagsfunc (p *parser) newLiteral(r rune, flags Flags) *Regexp {re := p.newRegexp(OpLiteral)re.Flags = flagsif flags&FoldCase != 0 {r = minFoldRune(r)}re.Rune0[0] = rre.Rune = re.Rune0[:1]return re}// minFoldRune returns the minimum rune fold-equivalent to r.func minFoldRune(r rune) rune {if r < MinFold || r > MaxFold {return r}min := rr0 := rfor r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {if min > r {min = r}}return min}// literal pushes a literal regexp for the rune r on the stack// and returns that regexp.func (p *parser) literal(r rune) {p.push(p.newLiteral(r, p.flags))}// op pushes a regexp with the given op onto the stack// and returns that regexp.func (p *parser) op(op Op) *Regexp {re := p.newRegexp(op)re.Flags = p.flagsreturn p.push(re)}// repeat replaces the top stack element with itself repeated according to op, min, max.// before is the regexp suffix starting at the repetition operator.// after is the regexp suffix following after the repetition operator.// repeat returns an updated 'after' and an error, if any.func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {flags := p.flagsif p.flags&PerlX != 0 {if len(after) > 0 && after[0] == '?' {after = after[1:]flags ^= NonGreedy}if lastRepeat != "" {// In Perl it is not allowed to stack repetition operators:// a** is a syntax error, not a doubled star, and a++ means// something else entirely, which we don't support!return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}}}n := len(p.stack)if n == 0 {return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}}sub := p.stack[n-1]if sub.Op >= opPseudo {return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}}re := p.newRegexp(op)re.Min = minre.Max = maxre.Flags = flagsre.Sub = re.Sub0[:1]re.Sub[0] = subp.stack[n-1] = rereturn after, nil}// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.func (p *parser) concat() *Regexp {p.maybeConcat(-1, 0)// Scan down to find pseudo-operator | or (.i := len(p.stack)for i > 0 && p.stack[i-1].Op < opPseudo {i--}subs := p.stack[i:]p.stack = p.stack[:i]// Empty concatenation is special case.if len(subs) == 0 {return p.push(p.newRegexp(OpEmptyMatch))}return p.push(p.collapse(subs, OpConcat))}// alternate replaces the top of the stack (above the topmost '(') with its alternation.func (p *parser) alternate() *Regexp {// Scan down to find pseudo-operator (.// There are no | above (.i := len(p.stack)for i > 0 && p.stack[i-1].Op < opPseudo {i--}subs := p.stack[i:]p.stack = p.stack[:i]// Make sure top class is clean.// All the others already are (see swapVerticalBar).if len(subs) > 0 {cleanAlt(subs[len(subs)-1])}// Empty alternate is special case// (shouldn't happen but easy to handle).if len(subs) == 0 {return p.push(p.newRegexp(OpNoMatch))}return p.push(p.collapse(subs, OpAlternate))}// cleanAlt cleans re for eventual inclusion in an alternation.func cleanAlt(re *Regexp) {switch re.Op {case OpCharClass:re.Rune = cleanClass(&re.Rune)if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {re.Rune = nilre.Op = OpAnyCharreturn}if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {re.Rune = nilre.Op = OpAnyCharNotNLreturn}if cap(re.Rune)-len(re.Rune) > 100 {// re.Rune will not grow any more.// Make a copy or inline to reclaim storage.re.Rune = append(re.Rune0[:0], re.Rune...)}}}// collapse returns the result of applying op to sub.// If sub contains op nodes, they all get hoisted up// so that there is never a concat of a concat or an// alternate of an alternate.func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {if len(subs) == 1 {return subs[0]}re := p.newRegexp(op)re.Sub = re.Sub0[:0]for _, sub := range subs {if sub.Op == op {re.Sub = append(re.Sub, sub.Sub...)p.reuse(sub)} else {re.Sub = append(re.Sub, sub)}}if op == OpAlternate {re.Sub = p.factor(re.Sub, re.Flags)if len(re.Sub) == 1 {old := rere = re.Sub[0]p.reuse(old)}}return re}// factor factors common prefixes from the alternation list sub.// It returns a replacement list that reuses the same storage and// frees (passes to p.reuse) any removed *Regexps.//// For example,// ABC|ABD|AEF|BCX|BCY// simplifies by literal prefix extraction to// A(B(C|D)|EF)|BC(X|Y)// which simplifies by character class introduction to// A(B[CD]|EF)|BC[XY]//func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {if len(sub) < 2 {return sub}// Round 1: Factor out common literal prefixes.var str []runevar strflags Flagsstart := 0out := sub[:0]for i := 0; i <= len(sub); i++ {// Invariant: the Regexps that were in sub[0:start] have been// used or marked for reuse, and the slice space has been reused// for out (len(out) <= start).//// Invariant: sub[start:i] consists of regexps that all begin// with str as modified by strflags.var istr []runevar iflags Flagsif i < len(sub) {istr, iflags = p.leadingString(sub[i])if iflags == strflags {same := 0for same < len(str) && same < len(istr) && str[same] == istr[same] {same++}if same > 0 {// Matches at least one rune in current range.// Keep going around.str = str[:same]continue}}}// Found end of a run with common leading literal string:// sub[start:i] all begin with str[0:len(str)], but sub[i]// does not even begin with str[0].//// Factor out common string and append factored expression to out.if i == start {// Nothing to do - run of length 0.} else if i == start+1 {// Just one: don't bother factoring.out = append(out, sub[start])} else {// Construct factored form: prefix(suffix1|suffix2|...)prefix := p.newRegexp(OpLiteral)prefix.Flags = strflagsprefix.Rune = append(prefix.Rune[:0], str...)for j := start; j < i; j++ {sub[j] = p.removeLeadingString(sub[j], len(str))}suffix := p.collapse(sub[start:i], OpAlternate) // recursere := p.newRegexp(OpConcat)re.Sub = append(re.Sub[:0], prefix, suffix)out = append(out, re)}// Prepare for next iteration.start = istr = istrstrflags = iflags}sub = out// Round 2: Factor out common complex prefixes,// just the first piece of each concatenation,// whatever it is. This is good enough a lot of the time.start = 0out = sub[:0]var first *Regexpfor i := 0; i <= len(sub); i++ {// Invariant: the Regexps that were in sub[0:start] have been// used or marked for reuse, and the slice space has been reused// for out (len(out) <= start).//// Invariant: sub[start:i] consists of regexps that all begin with ifirst.var ifirst *Regexpif i < len(sub) {ifirst = p.leadingRegexp(sub[i])if first != nil && first.Equal(ifirst) {continue}}// Found end of a run with common leading regexp:// sub[start:i] all begin with first but sub[i] does not.//// Factor out common regexp and append factored expression to out.if i == start {// Nothing to do - run of length 0.} else if i == start+1 {// Just one: don't bother factoring.out = append(out, sub[start])} else {// Construct factored form: prefix(suffix1|suffix2|...)prefix := firstfor j := start; j < i; j++ {reuse := j != start // prefix came from sub[start]sub[j] = p.removeLeadingRegexp(sub[j], reuse)}suffix := p.collapse(sub[start:i], OpAlternate) // recursere := p.newRegexp(OpConcat)re.Sub = append(re.Sub[:0], prefix, suffix)out = append(out, re)}// Prepare for next iteration.start = ifirst = ifirst}sub = out// Round 3: Collapse runs of single literals into character classes.start = 0out = sub[:0]for i := 0; i <= len(sub); i++ {// Invariant: the Regexps that were in sub[0:start] have been// used or marked for reuse, and the slice space has been reused// for out (len(out) <= start).//// Invariant: sub[start:i] consists of regexps that are either// literal runes or character classes.if i < len(sub) && isCharClass(sub[i]) {continue}// sub[i] is not a char or char class;// emit char class for sub[start:i]...if i == start {// Nothing to do - run of length 0.} else if i == start+1 {out = append(out, sub[start])} else {// Make new char class.// Start with most complex regexp in sub[start].max := startfor j := start + 1; j < i; j++ {if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {max = j}}sub[start], sub[max] = sub[max], sub[start]for j := start + 1; j < i; j++ {mergeCharClass(sub[start], sub[j])p.reuse(sub[j])}cleanAlt(sub[start])out = append(out, sub[start])}// ... and then emit sub[i].if i < len(sub) {out = append(out, sub[i])}start = i + 1}sub = out// Round 4: Collapse runs of empty matches into a single empty match.start = 0out = sub[:0]for i := range sub {if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {continue}out = append(out, sub[i])}sub = outreturn sub}// leadingString returns the leading literal string that re begins with.// The string refers to storage in re or its children.func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {if re.Op == OpConcat && len(re.Sub) > 0 {re = re.Sub[0]}if re.Op != OpLiteral {return nil, 0}return re.Rune, re.Flags & FoldCase}// removeLeadingString removes the first n leading runes// from the beginning of re. It returns the replacement for re.func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {if re.Op == OpConcat && len(re.Sub) > 0 {// Removing a leading string in a concatenation// might simplify the concatenation.sub := re.Sub[0]sub = p.removeLeadingString(sub, n)re.Sub[0] = subif sub.Op == OpEmptyMatch {p.reuse(sub)switch len(re.Sub) {case 0, 1:// Impossible but handle.re.Op = OpEmptyMatchre.Sub = nilcase 2:old := rere = re.Sub[1]p.reuse(old)default:copy(re.Sub, re.Sub[1:])re.Sub = re.Sub[:len(re.Sub)-1]}}return re}if re.Op == OpLiteral {re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]if len(re.Rune) == 0 {re.Op = OpEmptyMatch}}return re}// leadingRegexp returns the leading regexp that re begins with.// The regexp refers to storage in re or its children.func (p *parser) leadingRegexp(re *Regexp) *Regexp {if re.Op == OpEmptyMatch {return nil}if re.Op == OpConcat && len(re.Sub) > 0 {sub := re.Sub[0]if sub.Op == OpEmptyMatch {return nil}return sub}return re}// removeLeadingRegexp removes the leading regexp in re.// It returns the replacement for re.// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {if re.Op == OpConcat && len(re.Sub) > 0 {if reuse {p.reuse(re.Sub[0])}re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]switch len(re.Sub) {case 0:re.Op = OpEmptyMatchre.Sub = nilcase 1:old := rere = re.Sub[0]p.reuse(old)}return re}if reuse {p.reuse(re)}return p.newRegexp(OpEmptyMatch)}func literalRegexp(s string, flags Flags) *Regexp {re := &Regexp{Op: OpLiteral}re.Flags = flagsre.Rune = re.Rune0[:0] // use local storage for small stringsfor _, c := range s {if len(re.Rune) >= cap(re.Rune) {// string is too long to fit in Rune0. let Go handle itre.Rune = []rune(s)break}re.Rune = append(re.Rune, c)}return re}// Parsing.func Parse(s string, flags Flags) (*Regexp, error) {if flags&Literal != 0 {// Trivial parser for literal string.if err := checkUTF8(s); err != nil {return nil, err}return literalRegexp(s, flags), nil}// Otherwise, must do real work.var (p parsererr errorc runeop OplastRepeat stringmin, max int)p.flags = flagsp.wholeRegexp = st := sfor t != "" {repeat := ""BigSwitch:switch t[0] {default:if c, t, err = nextRune(t); err != nil {return nil, err}p.literal(c)case '(':if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {// Flag changes and non-capturing groups.if t, err = p.parsePerlFlags(t); err != nil {return nil, err}break}p.numCap++p.op(opLeftParen).Cap = p.numCapt = t[1:]case '|':if err = p.parseVerticalBar(); err != nil {return nil, err}t = t[1:]case ')':if err = p.parseRightParen(); err != nil {return nil, err}t = t[1:]case '^':if p.flags&OneLine != 0 {p.op(OpBeginText)} else {p.op(OpBeginLine)}t = t[1:]case '$':if p.flags&OneLine != 0 {p.op(OpEndText).Flags |= WasDollar} else {p.op(OpEndLine)}t = t[1:]case '.':if p.flags&DotNL != 0 {p.op(OpAnyChar)} else {p.op(OpAnyCharNotNL)}t = t[1:]case '[':if t, err = p.parseClass(t); err != nil {return nil, err}case '*', '+', '?':before := tswitch t[0] {case '*':op = OpStarcase '+':op = OpPluscase '?':op = OpQuest}after := t[1:]if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {return nil, err}repeat = beforet = aftercase '{':op = OpRepeatbefore := tmin, max, after, ok := p.parseRepeat(t)if !ok {// If the repeat cannot be parsed, { is a literal.p.literal('{')t = t[1:]break}if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {// Numbers were too big, or max is present and min > max.return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}}if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {return nil, err}repeat = beforet = aftercase '\\':if p.flags&PerlX != 0 && len(t) >= 2 {switch t[1] {case 'A':p.op(OpBeginText)t = t[2:]break BigSwitchcase 'b':p.op(OpWordBoundary)t = t[2:]break BigSwitchcase 'B':p.op(OpNoWordBoundary)t = t[2:]break BigSwitchcase 'C':// any byte; not supportedreturn nil, &Error{ErrInvalidEscape, t[:2]}case 'Q':// \Q ... \E: the ... is always literalsvar lit stringif i := strings.Index(t, `\E`); i < 0 {lit = t[2:]t = ""} else {lit = t[2:i]t = t[i+2:]}p.push(literalRegexp(lit, p.flags))break BigSwitchcase 'z':p.op(OpEndText)t = t[2:]break BigSwitch}}re := p.newRegexp(OpCharClass)re.Flags = p.flags// Look for Unicode character group like \p{Han}if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])if err != nil {return nil, err}if r != nil {re.Rune = rt = restp.push(re)break BigSwitch}}// Perl character class escape.if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {re.Rune = rt = restp.push(re)break BigSwitch}p.reuse(re)// Ordinary single-character escape.if c, t, err = p.parseEscape(t); err != nil {return nil, err}p.literal(c)}lastRepeat = repeat}p.concat()if p.swapVerticalBar() {// pop vertical barp.stack = p.stack[:len(p.stack)-1]}p.alternate()n := len(p.stack)if n != 1 {return nil, &Error{ErrMissingParen, s}}return p.stack[0], nil}// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.// If s is not of that form, it returns ok == false.// If s has the right form but the values are too big, it returns min == -1, ok == true.func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {if s == "" || s[0] != '{' {return}s = s[1:]var ok1 boolif min, s, ok1 = p.parseInt(s); !ok1 {return}if s == "" {return}if s[0] != ',' {max = min} else {s = s[1:]if s == "" {return}if s[0] == '}' {max = -1} else if max, s, ok1 = p.parseInt(s); !ok1 {return} else if max < 0 {// parseInt found too big a numbermin = -1}}if s == "" || s[0] != '}' {return}rest = s[1:]ok = truereturn}// parsePerlFlags parses a Perl flag setting or non-capturing group or both,// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.// The caller must have ensured that s begins with "(?".func (p *parser) parsePerlFlags(s string) (rest string, err error) {t := s// Check for named captures, first introduced in Python's regexp library.// As usual, there are three slightly different syntaxes://// (?P<name>expr) the original, introduced by Python// (?<name>expr) the .NET alteration, adopted by Perl 5.10// (?'name'expr) another .NET alteration, adopted by Perl 5.10//// Perl 5.10 gave in and implemented the Python version too,// but they claim that the last two are the preferred forms.// PCRE and languages based on it (specifically, PHP and Ruby)// support all three as well. EcmaScript 4 uses only the Python form.//// In both the open source world (via Code Search) and the// Google source tree, (?P<expr>name) is the dominant form,// so that's the one we implement. One is enough.if len(t) > 4 && t[2] == 'P' && t[3] == '<' {// Pull out name.end := strings.IndexRune(t, '>')if end < 0 {if err = checkUTF8(t); err != nil {return "", err}return "", &Error{ErrInvalidNamedCapture, s}}capture := t[:end+1] // "(?P<name>"name := t[4:end] // "name"if err = checkUTF8(name); err != nil {return "", err}if !isValidCaptureName(name) {return "", &Error{ErrInvalidNamedCapture, capture}}// Like ordinary capture, but named.p.numCap++re := p.op(opLeftParen)re.Cap = p.numCapre.Name = namereturn t[end+1:], nil}// Non-capturing group. Might also twiddle Perl flags.var c runet = t[2:] // skip (?flags := p.flagssign := +1sawFlag := falseLoop:for t != "" {if c, t, err = nextRune(t); err != nil {return "", err}switch c {default:break Loop// Flags.case 'i':flags |= FoldCasesawFlag = truecase 'm':flags &^= OneLinesawFlag = truecase 's':flags |= DotNLsawFlag = truecase 'U':flags |= NonGreedysawFlag = true// Switch to negation.case '-':if sign < 0 {break Loop}sign = -1// Invert flags so that | above turn into &^ and vice versa.// We'll invert flags again before using it below.flags = ^flagssawFlag = false// End of flags, starting group or not.case ':', ')':if sign < 0 {if !sawFlag {break Loop}flags = ^flags}if c == ':' {// Open new groupp.op(opLeftParen)}p.flags = flagsreturn t, nil}}return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}}// isValidCaptureName reports whether name// is a valid capture name: [A-Za-z0-9_]+.// PCRE limits names to 32 bytes.// Python rejects names starting with digits.// We don't enforce either of those.func isValidCaptureName(name string) bool {if name == "" {return false}for _, c := range name {if c != '_' && !isalnum(c) {return false}}return true}// parseInt parses a decimal integer.func (p *parser) parseInt(s string) (n int, rest string, ok bool) {if s == "" || s[0] < '0' || '9' < s[0] {return}// Disallow leading zeros.if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {return}t := sfor s != "" && '0' <= s[0] && s[0] <= '9' {s = s[1:]}rest = sok = true// Have digits, compute value.t = t[:len(t)-len(s)]for i := 0; i < len(t); i++ {// Avoid overflow.if n >= 1e8 {n = -1break}n = n*10 + int(t[i]) - '0'}return}// can this be represented as a character class?// single-rune literal string, char class, ., and .|\n.func isCharClass(re *Regexp) bool {return re.Op == OpLiteral && len(re.Rune) == 1 ||re.Op == OpCharClass ||re.Op == OpAnyCharNotNL ||re.Op == OpAnyChar}// does re match r?func matchRune(re *Regexp, r rune) bool {switch re.Op {case OpLiteral:return len(re.Rune) == 1 && re.Rune[0] == rcase OpCharClass:for i := 0; i < len(re.Rune); i += 2 {if re.Rune[i] <= r && r <= re.Rune[i+1] {return true}}return falsecase OpAnyCharNotNL:return r != '\n'case OpAnyChar:return true}return false}// parseVerticalBar handles a | in the input.func (p *parser) parseVerticalBar() error {p.concat()// The concatenation we just parsed is on top of the stack.// If it sits above an opVerticalBar, swap it below// (things below an opVerticalBar become an alternation).// Otherwise, push a new vertical bar.if !p.swapVerticalBar() {p.op(opVerticalBar)}return nil}// mergeCharClass makes dst = dst|src.// The caller must ensure that dst.Op >= src.Op,// to reduce the amount of copying.func mergeCharClass(dst, src *Regexp) {switch dst.Op {case OpAnyChar:// src doesn't add anything.case OpAnyCharNotNL:// src might add \nif matchRune(src, '\n') {dst.Op = OpAnyChar}case OpCharClass:// src is simpler, so either literal or char classif src.Op == OpLiteral {dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)} else {dst.Rune = appendClass(dst.Rune, src.Rune)}case OpLiteral:// both literalif src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {break}dst.Op = OpCharClassdst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)}}// If the top of the stack is an element followed by an opVerticalBar// swapVerticalBar swaps the two and returns true.// Otherwise it returns false.func (p *parser) swapVerticalBar() bool {// If above and below vertical bar are literal or char class,// can merge into a single char class.n := len(p.stack)if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {re1 := p.stack[n-1]re3 := p.stack[n-3]// Make re3 the more complex of the two.if re1.Op > re3.Op {re1, re3 = re3, re1p.stack[n-3] = re3}mergeCharClass(re3, re1)p.reuse(re1)p.stack = p.stack[:n-1]return true}if n >= 2 {re1 := p.stack[n-1]re2 := p.stack[n-2]if re2.Op == opVerticalBar {if n >= 3 {// Now out of reach.// Clean opportunistically.cleanAlt(p.stack[n-3])}p.stack[n-2] = re1p.stack[n-1] = re2return true}}return false}// parseRightParen handles a ) in the input.func (p *parser) parseRightParen() error {p.concat()if p.swapVerticalBar() {// pop vertical barp.stack = p.stack[:len(p.stack)-1]}p.alternate()n := len(p.stack)if n < 2 {return &Error{ErrInternalError, ""}}re1 := p.stack[n-1]re2 := p.stack[n-2]p.stack = p.stack[:n-2]if re2.Op != opLeftParen {return &Error{ErrMissingParen, p.wholeRegexp}}// Restore flags at time of paren.p.flags = re2.Flagsif re2.Cap == 0 {// Just for grouping.p.push(re1)} else {re2.Op = OpCapturere2.Sub = re2.Sub0[:1]re2.Sub[0] = re1p.push(re2)}return nil}// parseEscape parses an escape sequence at the beginning of s// and returns the rune.func (p *parser) parseEscape(s string) (r rune, rest string, err error) {t := s[1:]if t == "" {return 0, "", &Error{ErrTrailingBackslash, ""}}c, t, err := nextRune(t)if err != nil {return 0, "", err}Switch:switch c {default:if c < utf8.RuneSelf && !isalnum(c) {// Escaped non-word characters are always themselves.// PCRE is not quite so rigorous: it accepts things like// \q, but we don't. We once rejected \_, but too many// programs and people insist on using it, so allow \_.return c, t, nil}// Octal escapes.case '1', '2', '3', '4', '5', '6', '7':// Single non-zero digit is a backreference; not supportedif t == "" || t[0] < '0' || t[0] > '7' {break}fallthroughcase '0':// Consume up to three octal digits; already have one.r = c - '0'for i := 1; i < 3; i++ {if t == "" || t[0] < '0' || t[0] > '7' {break}r = r*8 + rune(t[0]) - '0't = t[1:]}return r, t, nil// Hexadecimal escapes.case 'x':if t == "" {break}if c, t, err = nextRune(t); err != nil {return 0, "", err}if c == '{' {// Any number of digits in braces.// Perl accepts any text at all; it ignores all text// after the first non-hex digit. We require only hex digits,// and at least one.nhex := 0r = 0for {if t == "" {break Switch}if c, t, err = nextRune(t); err != nil {return 0, "", err}if c == '}' {break}v := unhex(c)if v < 0 {break Switch}r = r*16 + vif r > unicode.MaxRune {break Switch}nhex++}if nhex == 0 {break Switch}return r, t, nil}// Easy case: two hex digits.x := unhex(c)if c, t, err = nextRune(t); err != nil {return 0, "", err}y := unhex(c)if x < 0 || y < 0 {break}return x*16 + y, t, nil// C escapes. There is no case 'b', to avoid misparsing// the Perl word-boundary \b as the C backspace \b// when in POSIX mode. In Perl, /\b/ means word-boundary// but /[\b]/ means backspace. We don't support that.// If you want a backspace, embed a literal backspace// character or use \x08.case 'a':return '\a', t, errcase 'f':return '\f', t, errcase 'n':return '\n', t, errcase 'r':return '\r', t, errcase 't':return '\t', t, errcase 'v':return '\v', t, err}return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}}// parseClassChar parses a character class character at the beginning of s// and returns it.func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {if s == "" {return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}}// Allow regular escape sequences even though// many need not be escaped in this context.if s[0] == '\\' {return p.parseEscape(s)}return nextRune(s)}type charGroup struct {sign intclass []rune}// parsePerlClassEscape parses a leading Perl character class escape like \d// from the beginning of s. If one is present, it appends the characters to r// and returns the new slice r and the remainder of the string.func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {return}g := perlGroup[s[0:2]]if g.sign == 0 {return}return p.appendGroup(r, g), s[2:]}// parseNamedClass parses a leading POSIX named character class like [:alnum:]// from the beginning of s. If one is present, it appends the characters to r// and returns the new slice r and the remainder of the string.func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {if len(s) < 2 || s[0] != '[' || s[1] != ':' {return}i := strings.Index(s[2:], ":]")if i < 0 {return}i += 2name, s := s[0:i+2], s[i+2:]g := posixGroup[name]if g.sign == 0 {return nil, "", &Error{ErrInvalidCharRange, name}}return p.appendGroup(r, g), s, nil}func (p *parser) appendGroup(r []rune, g charGroup) []rune {if p.flags&FoldCase == 0 {if g.sign < 0 {r = appendNegatedClass(r, g.class)} else {r = appendClass(r, g.class)}} else {tmp := p.tmpClass[:0]tmp = appendFoldedClass(tmp, g.class)p.tmpClass = tmptmp = cleanClass(&p.tmpClass)if g.sign < 0 {r = appendNegatedClass(r, tmp)} else {r = appendClass(r, tmp)}}return r}var anyTable = &unicode.RangeTable{R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},}// unicodeTable returns the unicode.RangeTable identified by name// and the table of additional fold-equivalent code points.func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {// Special case: "Any" means any.if name == "Any" {return anyTable, anyTable}if t := unicode.Categories[name]; t != nil {return t, unicode.FoldCategory[name]}if t := unicode.Scripts[name]; t != nil {return t, unicode.FoldScript[name]}return nil, nil}// parseUnicodeClass parses a leading Unicode character class like \p{Han}// from the beginning of s. If one is present, it appends the characters to r// and returns the new slice r and the remainder of the string.func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {return}// Committed to parse or return error.sign := +1if s[1] == 'P' {sign = -1}t := s[2:]c, t, err := nextRune(t)if err != nil {return}var seq, name stringif c != '{' {// Single-letter name.seq = s[:len(s)-len(t)]name = seq[2:]} else {// Name is in braces.end := strings.IndexRune(s, '}')if end < 0 {if err = checkUTF8(s); err != nil {return}return nil, "", &Error{ErrInvalidCharRange, s}}seq, t = s[:end+1], s[end+1:]name = s[3:end]if err = checkUTF8(name); err != nil {return}}// Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.if name != "" && name[0] == '^' {sign = -signname = name[1:]}tab, fold := unicodeTable(name)if tab == nil {return nil, "", &Error{ErrInvalidCharRange, seq}}if p.flags&FoldCase == 0 || fold == nil {if sign > 0 {r = appendTable(r, tab)} else {r = appendNegatedTable(r, tab)}} else {// Merge and clean tab and fold in a temporary buffer.// This is necessary for the negative case and just tidy// for the positive case.tmp := p.tmpClass[:0]tmp = appendTable(tmp, tab)tmp = appendTable(tmp, fold)p.tmpClass = tmptmp = cleanClass(&p.tmpClass)if sign > 0 {r = appendClass(r, tmp)} else {r = appendNegatedClass(r, tmp)}}return r, t, nil}// parseClass parses a character class at the beginning of s// and pushes it onto the parse stack.func (p *parser) parseClass(s string) (rest string, err error) {t := s[1:] // chop [re := p.newRegexp(OpCharClass)re.Flags = p.flagsre.Rune = re.Rune0[:0]sign := +1if t != "" && t[0] == '^' {sign = -1t = t[1:]// If character class does not match \n, add it here,// so that negation later will do the right thing.if p.flags&ClassNL == 0 {re.Rune = append(re.Rune, '\n', '\n')}}class := re.Runefirst := true // ] and - are okay as first char in classfor t == "" || t[0] != ']' || first {// POSIX: - is only okay unescaped as first or last in class.// Perl: - is okay anywhere.if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {_, size := utf8.DecodeRuneInString(t[1:])return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}}first = false// Look for POSIX [:alnum:] etc.if len(t) > 2 && t[0] == '[' && t[1] == ':' {nclass, nt, err := p.parseNamedClass(t, class)if err != nil {return "", err}if nclass != nil {class, t = nclass, ntcontinue}}// Look for Unicode character group like \p{Han}.nclass, nt, err := p.parseUnicodeClass(t, class)if err != nil {return "", err}if nclass != nil {class, t = nclass, ntcontinue}// Look for Perl character class symbols (extension).if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {class, t = nclass, ntcontinue}// Single character or simple range.rng := tvar lo, hi runeif lo, t, err = p.parseClassChar(t, s); err != nil {return "", err}hi = lo// [a-] means (a|-) so check for final ].if len(t) >= 2 && t[0] == '-' && t[1] != ']' {t = t[1:]if hi, t, err = p.parseClassChar(t, s); err != nil {return "", err}if hi < lo {rng = rng[:len(rng)-len(t)]return "", &Error{Code: ErrInvalidCharRange, Expr: rng}}}if p.flags&FoldCase == 0 {class = AppendRange(class, lo, hi)} else {class = appendFoldedRange(class, lo, hi)}}t = t[1:] // chop ]// Use &re.Rune instead of &class to avoid allocation.re.Rune = classclass = cleanClass(&re.Rune)if sign < 0 {class = negateClass(class)}re.Rune = classp.push(re)return t, nil}// cleanClass sorts the ranges (pairs of elements of r),// merges them, and eliminates duplicates.func cleanClass(rp *[]rune) []rune {// Sort by lo increasing, hi decreasing to break ties.sort.Sort(ranges{rp})r := *rpif len(r) < 2 {return r}// Merge abutting, overlapping.w := 2 // write indexfor i := 2; i < len(r); i += 2 {lo, hi := r[i], r[i+1]if lo <= r[w-1]+1 {// merge with previous rangeif hi > r[w-1] {r[w-1] = hi}continue}// new disjoint ranger[w] = lor[w+1] = hiw += 2}return r[:w]}// appendLiteral returns the result of appending the literal x to the class r.func appendLiteral(r []rune, x rune, flags Flags) []rune {if flags&FoldCase != 0 {return appendFoldedRange(r, x, x)}return AppendRange(r, x, x)}// appendRange returns the result of appending the range lo-hi to the class r.func AppendRange(r []rune, lo, hi rune) []rune {// Expand last range or next to last range if it overlaps or abuts.// Checking two ranges helps when appending case-folded// alphabets, so that one range can be expanding A-Z and the// other expanding a-z.n := len(r)for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4if n >= i {rlo, rhi := r[n-i], r[n-i+1]if lo <= rhi+1 && rlo <= hi+1 {if lo < rlo {r[n-i] = lo}if hi > rhi {r[n-i+1] = hi}return r}}}return append(r, lo, hi)}const (// minimum and maximum runes involved in folding.// checked during test.MinFold = 0x0041MaxFold = 0x1044f)// appendFoldedRange returns the result of appending the range lo-hi// and its case folding-equivalent runes to the class r.func appendFoldedRange(r []rune, lo, hi rune) []rune {// Optimizations.if lo <= MinFold && hi >= MaxFold {// Range is full: folding can't add more.return AppendRange(r, lo, hi)}if hi < MinFold || lo > MaxFold {// Range is outside folding possibilities.return AppendRange(r, lo, hi)}if lo < MinFold {// [lo, MinFold-1] needs no folding.r = AppendRange(r, lo, MinFold-1)lo = MinFold}if hi > MaxFold {// [MaxFold+1, hi] needs no folding.r = AppendRange(r, MaxFold+1, hi)hi = MaxFold}// Brute force. Depend on AppendRange to coalesce ranges on the fly.for c := lo; c <= hi; c++ {r = AppendRange(r, c, c)f := unicode.SimpleFold(c)for f != c {r = AppendRange(r, f, f)f = unicode.SimpleFold(f)}}return r}// appendClass returns the result of appending the class x to the class r.// It assume x is clean.func appendClass(r []rune, x []rune) []rune {for i := 0; i < len(x); i += 2 {r = AppendRange(r, x[i], x[i+1])}return r}// appendFolded returns the result of appending the case folding of the class x to the class r.func appendFoldedClass(r []rune, x []rune) []rune {for i := 0; i < len(x); i += 2 {r = appendFoldedRange(r, x[i], x[i+1])}return r}// appendNegatedClass returns the result of appending the negation of the class x to the class r.// It assumes x is clean.func appendNegatedClass(r []rune, x []rune) []rune {nextLo := '\u0000'for i := 0; i < len(x); i += 2 {lo, hi := x[i], x[i+1]if nextLo <= lo-1 {r = AppendRange(r, nextLo, lo-1)}nextLo = hi + 1}if nextLo <= unicode.MaxRune {r = AppendRange(r, nextLo, unicode.MaxRune)}return r}// appendTable returns the result of appending x to the class r.func appendTable(r []rune, x *unicode.RangeTable) []rune {for _, xr := range x.R16 {lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)if stride == 1 {r = AppendRange(r, lo, hi)continue}for c := lo; c <= hi; c += stride {r = AppendRange(r, c, c)}}for _, xr := range x.R32 {lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)if stride == 1 {r = AppendRange(r, lo, hi)continue}for c := lo; c <= hi; c += stride {r = AppendRange(r, c, c)}}return r}// appendNegatedTable returns the result of appending the negation of x to the class r.func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {nextLo := '\u0000' // lo end of next class to addfor _, xr := range x.R16 {lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)if stride == 1 {if nextLo <= lo-1 {r = AppendRange(r, nextLo, lo-1)}nextLo = hi + 1continue}for c := lo; c <= hi; c += stride {if nextLo <= c-1 {r = AppendRange(r, nextLo, c-1)}nextLo = c + 1}}for _, xr := range x.R32 {lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)if stride == 1 {if nextLo <= lo-1 {r = AppendRange(r, nextLo, lo-1)}nextLo = hi + 1continue}for c := lo; c <= hi; c += stride {if nextLo <= c-1 {r = AppendRange(r, nextLo, c-1)}nextLo = c + 1}}if nextLo <= unicode.MaxRune {r = AppendRange(r, nextLo, unicode.MaxRune)}return r}// negateClass overwrites r and returns r's negation.// It assumes the class r is already clean.func negateClass(r []rune) []rune {nextLo := '\u0000' // lo end of next class to addw := 0 // write indexfor i := 0; i < len(r); i += 2 {lo, hi := r[i], r[i+1]if nextLo <= lo-1 {r[w] = nextLor[w+1] = lo - 1w += 2}nextLo = hi + 1}r = r[:w]if nextLo <= unicode.MaxRune {// It's possible for the negation to have one more// range - this one - than the original class, so use append.r = append(r, nextLo, unicode.MaxRune)}return r}// ranges implements sort.Interface on a []rune.// The choice of receiver type definition is strange// but avoids an allocation since we already have// a *[]rune.type ranges struct {p *[]rune}func (ra ranges) Less(i, j int) bool {p := *ra.pi *= 2j *= 2return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]}func (ra ranges) Len() int {return len(*ra.p) / 2}func (ra ranges) Swap(i, j int) {p := *ra.pi *= 2j *= 2p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]}func checkUTF8(s string) error {for s != "" {rune, size := utf8.DecodeRuneInString(s)if rune == utf8.RuneError && size == 1 {return &Error{Code: ErrInvalidUTF8, Expr: s}}s = s[size:]}return nil}func nextRune(s string) (c rune, t string, err error) {c, size := utf8.DecodeRuneInString(s)if c == utf8.RuneError && size == 1 {return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}}return c, s[size:], nil}func isalnum(c rune) bool {return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'}func unhex(c rune) rune {if '0' <= c && c <= '9' {return c - '0'}if 'a' <= c && c <= 'f' {return c - 'a' + 10}if 'A' <= c && c <= 'F' {return c - 'A' + 10}return -1}
