URL https://opencores.org/ocsvn/openrisc/openrisc/trunk
Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [old/] [regexp/] [regexp.go] - Rev 809

Go to most recent revision | Compare with Previous | Blame | View Log
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package regexp implements a simple regular expression library.
//
// The syntax of the regular expressions accepted is:
//
//      regexp:
//              concatenation { '|' concatenation }
//      concatenation:
//              { closure }
//      closure:
//              term [ '*' | '+' | '?' ]
//      term:
//              '^'
//              '$'
//              '.'
//              character
//              '[' [ '^' ] { character-range } ']'
//              '(' regexp ')'
//      character-range:
//              character [ '-' character ]
//
// All characters are UTF-8-encoded code points.  Backslashes escape special
// characters, including inside character classes.  The standard Go character
// escapes are also recognized: \a \b \f \n \r \t \v.
//
// There are 16 methods of Regexp that match a regular expression and identify
// the matched text.  Their names are matched by this regular expression:
//
//      Find(All)?(String)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression.  Empty matches abutting a preceding
// match are ignored.  The return value is a slice containing the successive
// return values of the corresponding non-'All' routine.  These routines take
// an extra integer argument, n; if n >= 0, the function returns at most n
// matches/submatches.
//
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression.  Submatches are matches of
// parenthesized subexpressions within the regular expression, numbered from
// left to right in order of opening parenthesis.  Submatch 0 is the match of
// the entire expression, submatch 1 the match of the first parenthesized
// subexpression, and so on.
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
// the nth submatch.  The pair for n==0 identifies the match of the entire
// expression.  If 'Index' is not present, the match is identified by the
// text of the match/submatch.  If an index is negative, it means that
// subexpression did not match any string in the input.
//
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
//
//      MatchReader, FindReaderIndex, FindReaderSubmatchIndex
//
// This set may grow.  Note that regular expression matches may need to
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
//
// (There are a few other methods that do not match this pattern.)
//
package regexp

import (
        "bytes"
        "io"
        "strings"
        "unicode/utf8"
)

var debug = false

// Error is the local type for a parsing error.
type Error string

func (e Error) Error() string {
        return string(e)
}

// Error codes returned by failures to parse an expression.
var (
        ErrInternal            = Error("regexp: internal error")
        ErrUnmatchedLpar       = Error("regexp: unmatched '('")
        ErrUnmatchedRpar       = Error("regexp: unmatched ')'")
        ErrUnmatchedLbkt       = Error("regexp: unmatched '['")
        ErrUnmatchedRbkt       = Error("regexp: unmatched ']'")
        ErrBadRange            = Error("regexp: bad range in character class")
        ErrExtraneousBackslash = Error("regexp: extraneous backslash")
        ErrBadClosure          = Error("regexp: repeated closure (**, ++, etc.)")
        ErrBareClosure         = Error("regexp: closure applies to nothing")
        ErrBadBackslash        = Error("regexp: illegal backslash escape")
)

const (
        iStart     = iota // beginning of program
        iEnd              // end of program: success
        iBOT              // '^' beginning of text
        iEOT              // '$' end of text
        iChar             // 'a' regular character
        iCharClass        // [a-z] character class
        iAny              // '.' any character including newline
        iNotNL            // [^\n] special case: any character but newline
        iBra              // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right
        iAlt              // '|' alternation
        iNop              // do nothing; makes it easy to link without patching
)

// An instruction executed by the NFA
type instr struct {
        kind  int    // the type of this instruction: iChar, iAny, etc.
        index int    // used only in debugging; could be eliminated
        next  *instr // the instruction to execute after this one
        // Special fields valid only for some items.
        char   rune       // iChar
        braNum int        // iBra, iEbra
        cclass *charClass // iCharClass
        left   *instr     // iAlt, other branch
}

func (i *instr) print() {
        switch i.kind {
        case iStart:
                print("start")
        case iEnd:
                print("end")
        case iBOT:
                print("bot")
        case iEOT:
                print("eot")
        case iChar:
                print("char ", string(i.char))
        case iCharClass:
                i.cclass.print()
        case iAny:
                print("any")
        case iNotNL:
                print("notnl")
        case iBra:
                if i.braNum&1 == 0 {
                        print("bra", i.braNum/2)
                } else {
                        print("ebra", i.braNum/2)
                }
        case iAlt:
                print("alt(", i.left.index, ")")
        case iNop:
                print("nop")
        }
}

// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
        expr        string // the original expression
        prefix      string // initial plain text string
        prefixBytes []byte // initial plain text bytes
        inst        []*instr
        start       *instr // first instruction of machine
        prefixStart *instr // where to start if there is a prefix
        nbra        int    // number of brackets in expression, for subexpressions
}

type charClass struct {
        negate bool // is character class negated? ([^a-z])
        // slice of int, stored pairwise: [a-z] is (a,z); x is (x,x):
        ranges     []rune
        cmin, cmax rune
}

func (cclass *charClass) print() {
        print("charclass")
        if cclass.negate {
                print(" (negated)")
        }
        for i := 0; i < len(cclass.ranges); i += 2 {
                l := cclass.ranges[i]
                r := cclass.ranges[i+1]
                if l == r {
                        print(" [", string(l), "]")
                } else {
                        print(" [", string(l), "-", string(r), "]")
                }
        }
}

func (cclass *charClass) addRange(a, b rune) {
        // range is a through b inclusive
        cclass.ranges = append(cclass.ranges, a, b)
        if a < cclass.cmin {
                cclass.cmin = a
        }
        if b > cclass.cmax {
                cclass.cmax = b
        }
}

func (cclass *charClass) matches(c rune) bool {
        if c < cclass.cmin || c > cclass.cmax {
                return cclass.negate
        }
        ranges := cclass.ranges
        for i := 0; i < len(ranges); i = i + 2 {
                if ranges[i] <= c && c <= ranges[i+1] {
                        return !cclass.negate
                }
        }
        return cclass.negate
}

func newCharClass() *instr {
        i := &instr{kind: iCharClass}
        i.cclass = new(charClass)
        i.cclass.ranges = make([]rune, 0, 4)
        i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1
        i.cclass.cmax = -1
        return i
}

func (re *Regexp) add(i *instr) *instr {
        i.index = len(re.inst)
        re.inst = append(re.inst, i)
        return i
}

type parser struct {
        re    *Regexp
        nlpar int // number of unclosed lpars
        pos   int
        ch    rune
}

func (p *parser) error(err Error) {
        panic(err)
}

const endOfText = -1

func (p *parser) c() rune { return p.ch }

func (p *parser) nextc() rune {
        if p.pos >= len(p.re.expr) {
                p.ch = endOfText
        } else {
                c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
                p.ch = c
                p.pos += w
        }
        return p.ch
}

func newParser(re *Regexp) *parser {
        p := new(parser)
        p.re = re
        p.nextc() // load p.ch
        return p
}

func special(c rune) bool {
        for _, r := range `\.+*?()|[]^$` {
                if c == r {
                        return true
                }
        }
        return false
}

func ispunct(c rune) bool {
        for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" {
                if c == r {
                        return true
                }
        }
        return false
}

var escapes = []byte("abfnrtv")
var escaped = []byte("\a\b\f\n\r\t\v")

func escape(c rune) int {
        for i, b := range escapes {
                if rune(b) == c {
                        return i
                }
        }
        return -1
}

func (p *parser) checkBackslash() rune {
        c := p.c()
        if c == '\\' {
                c = p.nextc()
                switch {
                case c == endOfText:
                        p.error(ErrExtraneousBackslash)
                case ispunct(c):
                        // c is as delivered
                case escape(c) >= 0:
                        c = rune(escaped[escape(c)])
                default:
                        p.error(ErrBadBackslash)
                }
        }
        return c
}

func (p *parser) charClass() *instr {
        i := newCharClass()
        cc := i.cclass
        if p.c() == '^' {
                cc.negate = true
                p.nextc()
        }
        left := rune(-1)
        for {
                switch c := p.c(); c {
                case ']', endOfText:
                        if left >= 0 {
                                p.error(ErrBadRange)
                        }
                        // Is it [^\n]?
                        if cc.negate && len(cc.ranges) == 2 &&
                                cc.ranges[0] == '\n' && cc.ranges[1] == '\n' {
                                nl := &instr{kind: iNotNL}
                                p.re.add(nl)
                                return nl
                        }
                        // Special common case: "[a]" -> "a"
                        if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] {
                                c := &instr{kind: iChar, char: cc.ranges[0]}
                                p.re.add(c)
                                return c
                        }
                        p.re.add(i)
                        return i
                case '-': // do this before backslash processing
                        p.error(ErrBadRange)
                default:
                        c = p.checkBackslash()
                        p.nextc()
                        switch {
                        case left < 0: // first of pair
                                if p.c() == '-' { // range
                                        p.nextc()
                                        left = c
                                } else { // single char
                                        cc.addRange(c, c)
                                }
                        case left <= c: // second of pair
                                cc.addRange(left, c)
                                left = -1
                        default:
                                p.error(ErrBadRange)
                        }
                }
        }
        panic("unreachable")
}

func (p *parser) term() (start, end *instr) {
        switch c := p.c(); c {
        case '|', endOfText:
                return nil, nil
        case '*', '+', '?':
                p.error(ErrBareClosure)
        case ')':
                if p.nlpar == 0 {
                        p.error(ErrUnmatchedRpar)
                }
                return nil, nil
        case ']':
                p.error(ErrUnmatchedRbkt)
        case '^':
                p.nextc()
                start = p.re.add(&instr{kind: iBOT})
                return start, start
        case '$':
                p.nextc()
                start = p.re.add(&instr{kind: iEOT})
                return start, start
        case '.':
                p.nextc()
                start = p.re.add(&instr{kind: iAny})
                return start, start
        case '[':
                p.nextc()
                start = p.charClass()
                if p.c() != ']' {
                        p.error(ErrUnmatchedLbkt)
                }
                p.nextc()
                return start, start
        case '(':
                p.nextc()
                p.nlpar++
                p.re.nbra++ // increment first so first subexpr is \1
                nbra := p.re.nbra
                start, end = p.regexp()
                if p.c() != ')' {
                        p.error(ErrUnmatchedLpar)
                }
                p.nlpar--
                p.nextc()
                bra := &instr{kind: iBra, braNum: 2 * nbra}
                p.re.add(bra)
                ebra := &instr{kind: iBra, braNum: 2*nbra + 1}
                p.re.add(ebra)
                if start == nil {
                        if end == nil {
                                p.error(ErrInternal)
                                return
                        }
                        start = ebra
                } else {
                        end.next = ebra
                }
                bra.next = start
                return bra, ebra
        default:
                c = p.checkBackslash()
                p.nextc()
                start = &instr{kind: iChar, char: c}
                p.re.add(start)
                return start, start
        }
        panic("unreachable")
}

func (p *parser) closure() (start, end *instr) {
        start, end = p.term()
        if start == nil {
                return
        }
        switch p.c() {
        case '*':
                // (start,end)*:
                alt := &instr{kind: iAlt}
                p.re.add(alt)
                end.next = alt   // after end, do alt
                alt.left = start // alternate brach: return to start
                start = alt      // alt becomes new (start, end)
                end = alt
        case '+':
                // (start,end)+:
                alt := &instr{kind: iAlt}
                p.re.add(alt)
                end.next = alt   // after end, do alt
                alt.left = start // alternate brach: return to start
                end = alt        // start is unchanged; end is alt
        case '?':
                // (start,end)?:
                alt := &instr{kind: iAlt}
                p.re.add(alt)
                nop := &instr{kind: iNop}
                p.re.add(nop)
                alt.left = start // alternate branch is start
                alt.next = nop   // follow on to nop
                end.next = nop   // after end, go to nop
                start = alt      // start is now alt
                end = nop        // end is nop pointed to by both branches
        default:
                return
        }
        switch p.nextc() {
        case '*', '+', '?':
                p.error(ErrBadClosure)
        }
        return
}

func (p *parser) concatenation() (start, end *instr) {
        for {
                nstart, nend := p.closure()
                switch {
                case nstart == nil: // end of this concatenation
                        if start == nil { // this is the empty string
                                nop := p.re.add(&instr{kind: iNop})
                                return nop, nop
                        }
                        return
                case start == nil: // this is first element of concatenation
                        start, end = nstart, nend
                default:
                        end.next = nstart
                        end = nend
                }
        }
        panic("unreachable")
}

func (p *parser) regexp() (start, end *instr) {
        start, end = p.concatenation()
        for {
                switch p.c() {
                default:
                        return
                case '|':
                        p.nextc()
                        nstart, nend := p.concatenation()
                        alt := &instr{kind: iAlt}
                        p.re.add(alt)
                        alt.left = start
                        alt.next = nstart
                        nop := &instr{kind: iNop}
                        p.re.add(nop)
                        end.next = nop
                        nend.next = nop
                        start, end = alt, nop
                }
        }
        panic("unreachable")
}

func unNop(i *instr) *instr {
        for i.kind == iNop {
                i = i.next
        }
        return i
}

func (re *Regexp) eliminateNops() {
        for _, inst := range re.inst {
                if inst.kind == iEnd {
                        continue
                }
                inst.next = unNop(inst.next)
                if inst.kind == iAlt {
                        inst.left = unNop(inst.left)
                }
        }
}

func (re *Regexp) dump() {
        print("prefix <", re.prefix, ">\n")
        for _, inst := range re.inst {
                print(inst.index, ": ")
                inst.print()
                if inst.kind != iEnd {
                        print(" -> ", inst.next.index)
                }
                print("\n")
        }
}

func (re *Regexp) doParse() {
        p := newParser(re)
        start := &instr{kind: iStart}
        re.add(start)
        s, e := p.regexp()
        start.next = s
        re.start = start
        e.next = re.add(&instr{kind: iEnd})

        if debug {
                re.dump()
                println()
        }

        re.eliminateNops()
        if debug {
                re.dump()
                println()
        }
        re.setPrefix()
        if debug {
                re.dump()
                println()
        }
}

// Extract regular text from the beginning of the pattern,
// possibly after a leading iBOT.
// That text can be used by doExecute to speed up matching.
func (re *Regexp) setPrefix() {
        var b []byte
        var utf = make([]byte, utf8.UTFMax)
        var inst *instr
        // First instruction is start; skip that.  Also skip any initial iBOT.
        inst = re.inst[0].next
        for inst.kind == iBOT {
                inst = inst.next
        }
Loop:
        for ; inst.kind != iEnd; inst = inst.next {
                // stop if this is not a char
                if inst.kind != iChar {
                        break
                }
                // stop if this char can be followed by a match for an empty string,
                // which includes closures, ^, and $.
                switch inst.next.kind {
                case iBOT, iEOT, iAlt:
                        break Loop
                }
                n := utf8.EncodeRune(utf, inst.char)
                b = append(b, utf[0:n]...)
        }
        // point prefixStart instruction to first non-CHAR after prefix
        re.prefixStart = inst
        re.prefixBytes = b
        re.prefix = string(b)
}

// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
        return re.expr
}

// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text.
func Compile(str string) (regexp *Regexp, error error) {
        regexp = new(Regexp)
        // doParse will panic if there is a parse error.
        defer func() {
                if e := recover(); e != nil {
                        regexp = nil
                        error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception
                }
        }()
        regexp.expr = str
        regexp.inst = make([]*instr, 0, 10)
        regexp.doParse()
        return
}

// MustCompile is like Compile but panics if the expression cannot be parsed.
// It simplifies safe initialization of global variables holding compiled regular
// expressions.
func MustCompile(str string) *Regexp {
        regexp, error := Compile(str)
        if error != nil {
                panic(`regexp: compiling "` + str + `": ` + error.Error())
        }
        return regexp
}

// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
func (re *Regexp) NumSubexp() int { return re.nbra }

// The match arena allows us to reduce the garbage generated by tossing
// match vectors away as we execute.  Matches are ref counted and returned
// to a free list when no longer active.  Increases a simple benchmark by 22X.
type matchArena struct {
        head  *matchVec
        len   int // length of match vector
        pos   int
        atBOT bool // whether we're at beginning of text
        atEOT bool // whether we're at end of text
}

type matchVec struct {
        m    []int // pairs of bracketing submatches. 0th is start,end
        ref  int
        next *matchVec
}

func (a *matchArena) new() *matchVec {
        if a.head == nil {
                const N = 10
                block := make([]matchVec, N)
                for i := 0; i < N; i++ {
                        b := &block[i]
                        b.next = a.head
                        a.head = b
                }
        }
        m := a.head
        a.head = m.next
        m.ref = 0
        if m.m == nil {
                m.m = make([]int, a.len)
        }
        return m
}

func (a *matchArena) free(m *matchVec) {
        m.ref--
        if m.ref == 0 {
                m.next = a.head
                a.head = m
        }
}

func (a *matchArena) copy(m *matchVec) *matchVec {
        m1 := a.new()
        copy(m1.m, m.m)
        return m1
}

func (a *matchArena) noMatch() *matchVec {
        m := a.new()
        for i := range m.m {
                m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
        }
        m.ref = 1
        return m
}

type state struct {
        inst     *instr // next instruction to execute
        prefixed bool   // this match began with a fixed prefix
        match    *matchVec
}

// Append new state to to-do list.  Leftmost-longest wins so avoid
// adding a state that's already active.  The matchVec will be inc-ref'ed
// if it is assigned to a state.
func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state {
        switch inst.kind {
        case iBOT:
                if a.atBOT {
                        s = a.addState(s, inst.next, prefixed, match)
                }
                return s
        case iEOT:
                if a.atEOT {
                        s = a.addState(s, inst.next, prefixed, match)
                }
                return s
        case iBra:
                match.m[inst.braNum] = a.pos
                s = a.addState(s, inst.next, prefixed, match)
                return s
        }
        l := len(s)
        // States are inserted in order so it's sufficient to see if we have the same
        // instruction; no need to see if existing match is earlier (it is).
        for i := 0; i < l; i++ {
                if s[i].inst == inst {
                        return s
                }
        }
        s = append(s, state{inst, prefixed, match})
        match.ref++
        if inst.kind == iAlt {
                s = a.addState(s, inst.left, prefixed, a.copy(match))
                // give other branch a copy of this match vector
                s = a.addState(s, inst.next, prefixed, a.copy(match))
        }
        return s
}

// input abstracts different representations of the input text. It provides
// one-character lookahead.
type input interface {
        step(pos int) (r rune, width int) // advance one rune
        canCheckPrefix() bool             // can we look ahead without losing info?
        hasPrefix(re *Regexp) bool
        index(re *Regexp, pos int) int
}

// inputString scans a string.
type inputString struct {
        str string
}

func newInputString(str string) *inputString {
        return &inputString{str: str}
}

func (i *inputString) step(pos int) (rune, int) {
        if pos < len(i.str) {
                return utf8.DecodeRuneInString(i.str[pos:len(i.str)])
        }
        return endOfText, 0
}

func (i *inputString) canCheckPrefix() bool {
        return true
}

func (i *inputString) hasPrefix(re *Regexp) bool {
        return strings.HasPrefix(i.str, re.prefix)
}

func (i *inputString) index(re *Regexp, pos int) int {
        return strings.Index(i.str[pos:], re.prefix)
}

// inputBytes scans a byte slice.
type inputBytes struct {
        str []byte
}

func newInputBytes(str []byte) *inputBytes {
        return &inputBytes{str: str}
}

func (i *inputBytes) step(pos int) (rune, int) {
        if pos < len(i.str) {
                return utf8.DecodeRune(i.str[pos:len(i.str)])
        }
        return endOfText, 0
}

func (i *inputBytes) canCheckPrefix() bool {
        return true
}

func (i *inputBytes) hasPrefix(re *Regexp) bool {
        return bytes.HasPrefix(i.str, re.prefixBytes)
}

func (i *inputBytes) index(re *Regexp, pos int) int {
        return bytes.Index(i.str[pos:], re.prefixBytes)
}

// inputReader scans a RuneReader.
type inputReader struct {
        r     io.RuneReader
        atEOT bool
        pos   int
}

func newInputReader(r io.RuneReader) *inputReader {
        return &inputReader{r: r}
}

func (i *inputReader) step(pos int) (rune, int) {
        if !i.atEOT && pos != i.pos {
                return endOfText, 0

        }
        r, w, err := i.r.ReadRune()
        if err != nil {
                i.atEOT = true
                return endOfText, 0
        }
        i.pos += w
        return r, w
}

func (i *inputReader) canCheckPrefix() bool {
        return false
}

func (i *inputReader) hasPrefix(re *Regexp) bool {
        return false
}

func (i *inputReader) index(re *Regexp, pos int) int {
        return -1
}

// Search match starting from pos bytes into the input.
func (re *Regexp) doExecute(i input, pos int) []int {
        var s [2][]state
        s[0] = make([]state, 0, 10)
        s[1] = make([]state, 0, 10)
        in, out := 0, 1
        var final state
        found := false
        anchored := re.inst[0].next.kind == iBOT
        if anchored && pos > 0 {
                return nil
        }
        // fast check for initial plain substring
        if i.canCheckPrefix() && re.prefix != "" {
                advance := 0
                if anchored {
                        if !i.hasPrefix(re) {
                                return nil
                        }
                } else {
                        advance = i.index(re, pos)
                        if advance == -1 {
                                return nil
                        }
                }
                pos += advance
        }
        // We look one character ahead so we can match $, which checks whether
        // we are at EOT.
        nextChar, nextWidth := i.step(pos)
        arena := &matchArena{
                len:   2 * (re.nbra + 1),
                pos:   pos,
                atBOT: pos == 0,
                atEOT: nextChar == endOfText,
        }
        for c, startPos := rune(0), pos; c != endOfText; {
                if !found && (pos == startPos || !anchored) {
                        // prime the pump if we haven't seen a match yet
                        match := arena.noMatch()
                        match.m[0] = pos
                        s[out] = arena.addState(s[out], re.start.next, false, match)
                        arena.free(match) // if addState saved it, ref was incremented
                } else if len(s[out]) == 0 {
                        // machine has completed
                        break
                }
                in, out = out, in // old out state is new in state
                // clear out old state
                old := s[out]
                for _, state := range old {
                        arena.free(state.match)
                }
                s[out] = old[0:0] // truncate state vector
                c = nextChar
                thisPos := pos
                pos += nextWidth
                nextChar, nextWidth = i.step(pos)
                arena.atEOT = nextChar == endOfText
                arena.atBOT = false
                arena.pos = pos
                for _, st := range s[in] {
                        switch st.inst.kind {
                        case iBOT:
                        case iEOT:
                        case iChar:
                                if c == st.inst.char {
                                        s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
                                }
                        case iCharClass:
                                if st.inst.cclass.matches(c) {
                                        s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
                                }
                        case iAny:
                                if c != endOfText {
                                        s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
                                }
                        case iNotNL:
                                if c != endOfText && c != '\n' {
                                        s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
                                }
                        case iBra:
                        case iAlt:
                        case iEnd:
                                // choose leftmost longest
                                if !found || // first
                                        st.match.m[0] < final.match.m[0] || // leftmost
                                        (st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest
                                        if final.match != nil {
                                                arena.free(final.match)
                                        }
                                        final = st
                                        final.match.ref++
                                        final.match.m[1] = thisPos
                                }
                                found = true
                        default:
                                st.inst.print()
                                panic("unknown instruction in execute")
                        }
                }
        }
        if final.match == nil {
                return nil
        }
        // if match found, back up start of match by width of prefix.
        if final.prefixed && len(final.match.m) > 0 {
                final.match.m[0] -= len(re.prefix)
        }
        return final.match.m
}

// LiteralPrefix returns a literal string that must begin any match
// of the regular expression re.  It returns the boolean true if the
// literal string comprises the entire regular expression.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
        c := make([]rune, len(re.inst)-2) // minus start and end.
        // First instruction is start; skip that.
        i := 0
        for inst := re.inst[0].next; inst.kind != iEnd; inst = inst.next {
                // stop if this is not a char
                if inst.kind != iChar {
                        return string(c[:i]), false
                }
                c[i] = inst.char
                i++
        }
        return string(c[:i]), true
}

// MatchReader returns whether the Regexp matches the text read by the
// RuneReader.  The return value is a boolean: true for match, false for no
// match.
func (re *Regexp) MatchReader(r io.RuneReader) bool {
        return len(re.doExecute(newInputReader(r), 0)) > 0
}

// MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 }

// Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 }

// MatchReader checks whether a textual regular expression matches the text
// read by the RuneReader.  More complicated queries need to use Compile and
// the full Regexp interface.
func MatchReader(pattern string, r io.RuneReader) (matched bool, error error) {
        re, err := Compile(pattern)
        if err != nil {
                return false, err
        }
        return re.MatchReader(r), nil
}

// MatchString checks whether a textual regular expression
// matches a string.  More complicated queries need
// to use Compile and the full Regexp interface.
func MatchString(pattern string, s string) (matched bool, error error) {
        re, err := Compile(pattern)
        if err != nil {
                return false, err
        }
        return re.MatchString(s), nil
}

// Match checks whether a textual regular expression
// matches a byte slice.  More complicated queries need
// to use Compile and the full Regexp interface.
func Match(pattern string, b []byte) (matched bool, error error) {
        re, err := Compile(pattern)
        if err != nil {
                return false, err
        }
        return re.Match(b), nil
}

// ReplaceAllString returns a copy of src in which all matches for the Regexp
// have been replaced by repl.  No support is provided for expressions
// (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllString(src, repl string) string {
        return re.ReplaceAllStringFunc(src, func(string) string { return repl })
}

// ReplaceAllStringFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of of function repl (whose
// first argument is the matched string).  No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
        lastMatchEnd := 0 // end position of the most recent match
        searchPos := 0    // position where we next look for a match
        buf := new(bytes.Buffer)
        for searchPos <= len(src) {
                a := re.doExecute(newInputString(src), searchPos)
                if len(a) == 0 {
                        break // no more matches
                }

                // Copy the unmatched characters before this match.
                io.WriteString(buf, src[lastMatchEnd:a[0]])

                // Now insert a copy of the replacement string, but not for a
                // match of the empty string immediately after another match.
                // (Otherwise, we get double replacement for patterns that
                // match both empty and nonempty strings.)
                if a[1] > lastMatchEnd || a[0] == 0 {
                        io.WriteString(buf, repl(src[a[0]:a[1]]))
                }
                lastMatchEnd = a[1]

                // Advance past this match; always advance at least one character.
                _, width := utf8.DecodeRuneInString(src[searchPos:])
                if searchPos+width > a[1] {
                        searchPos += width
                } else if searchPos+1 > a[1] {
                        // This clause is only needed at the end of the input
                        // string.  In that case, DecodeRuneInString returns width=0.
                        searchPos++
                } else {
                        searchPos = a[1]
                }
        }

        // Copy the unmatched characters after the last match.
        io.WriteString(buf, src[lastMatchEnd:])

        return buf.String()
}

// ReplaceAll returns a copy of src in which all matches for the Regexp
// have been replaced by repl.  No support is provided for expressions
// (e.g. \1 or $1) in the replacement text.
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
        return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
}

// ReplaceAllFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of of function repl (whose
// first argument is the matched []byte).  No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
        lastMatchEnd := 0 // end position of the most recent match
        searchPos := 0    // position where we next look for a match
        buf := new(bytes.Buffer)
        for searchPos <= len(src) {
                a := re.doExecute(newInputBytes(src), searchPos)
                if len(a) == 0 {
                        break // no more matches
                }

                // Copy the unmatched characters before this match.
                buf.Write(src[lastMatchEnd:a[0]])

                // Now insert a copy of the replacement string, but not for a
                // match of the empty string immediately after another match.
                // (Otherwise, we get double replacement for patterns that
                // match both empty and nonempty strings.)
                if a[1] > lastMatchEnd || a[0] == 0 {
                        buf.Write(repl(src[a[0]:a[1]]))
                }
                lastMatchEnd = a[1]

                // Advance past this match; always advance at least one character.
                _, width := utf8.DecodeRune(src[searchPos:])
                if searchPos+width > a[1] {
                        searchPos += width
                } else if searchPos+1 > a[1] {
                        // This clause is only needed at the end of the input
                        // string.  In that case, DecodeRuneInString returns width=0.
                        searchPos++
                } else {
                        searchPos = a[1]
                }
        }

        // Copy the unmatched characters after the last match.
        buf.Write(src[lastMatchEnd:])

        return buf.Bytes()
}

// QuoteMeta returns a string that quotes all regular expression metacharacters
// inside the argument text; the returned string is a regular expression matching
// the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
func QuoteMeta(s string) string {
        b := make([]byte, 2*len(s))

        // A byte loop is correct because all metacharacters are ASCII.
        j := 0
        for i := 0; i < len(s); i++ {
                if special(rune(s[i])) {
                        b[j] = '\\'
                        j++
                }
                b[j] = s[i]
                j++
        }
        return string(b[0:j])
}

// Find matches in slice b if b is non-nil, otherwise find matches in string s.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
        var end int
        if b == nil {
                end = len(s)
        } else {
                end = len(b)
        }

        for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
                var in input
                if b == nil {
                        in = newInputString(s)
                } else {
                        in = newInputBytes(b)
                }
                matches := re.doExecute(in, pos)
                if len(matches) == 0 {
                        break
                }

                accept := true
                if matches[1] == pos {
                        // We've found an empty match.
                        if matches[0] == prevMatchEnd {
                                // We don't allow an empty match right
                                // after a previous match, so ignore it.
                                accept = false
                        }
                        var width int
                        // TODO: use step()
                        if b == nil {
                                _, width = utf8.DecodeRuneInString(s[pos:end])
                        } else {
                                _, width = utf8.DecodeRune(b[pos:end])
                        }
                        if width > 0 {
                                pos += width
                        } else {
                                pos = end + 1
                        }
                } else {
                        pos = matches[1]
                }
                prevMatchEnd = matches[1]

                if accept {
                        deliver(matches)
                        i++
                }
        }
}

// Find returns a slice holding the text of the leftmost match in b of the regular expression.
// A return value of nil indicates no match.
func (re *Regexp) Find(b []byte) []byte {
        a := re.doExecute(newInputBytes(b), 0)
        if a == nil {
                return nil
        }
        return b[a[0]:a[1]]
}

// FindIndex returns a two-element slice of integers defining the location of
// the leftmost match in b of the regular expression.  The match itself is at
// b[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindIndex(b []byte) (loc []int) {
        a := re.doExecute(newInputBytes(b), 0)
        if a == nil {
                return nil
        }
        return a[0:2]
}

// FindString returns a string holding the text of the leftmost match in s of the regular
// expression.  If there is no match, the return value is an empty string,
// but it will also be empty if the regular expression successfully matches
// an empty string.  Use FindStringIndex or FindStringSubmatch if it is
// necessary to distinguish these cases.
func (re *Regexp) FindString(s string) string {
        a := re.doExecute(newInputString(s), 0)
        if a == nil {
                return ""
        }
        return s[a[0]:a[1]]
}

// FindStringIndex returns a two-element slice of integers defining the
// location of the leftmost match in s of the regular expression.  The match
// itself is at s[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindStringIndex(s string) []int {
        a := re.doExecute(newInputString(s), 0)
        if a == nil {
                return nil
        }
        return a[0:2]
}

// FindReaderIndex returns a two-element slice of integers defining the
// location of the leftmost match of the regular expression in text read from
// the RuneReader.  The match itself is at s[loc[0]:loc[1]].  A return
// value of nil indicates no match.
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
        a := re.doExecute(newInputReader(r), 0)
        if a == nil {
                return nil
        }
        return a[0:2]
}

// FindSubmatch returns a slice of slices holding the text of the leftmost
// match of the regular expression in b and the matches, if any, of its
// subexpressions, as defined by the 'Submatch' descriptions in the package
// comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
        a := re.doExecute(newInputBytes(b), 0)
        if a == nil {
                return nil
        }
        ret := make([][]byte, len(a)/2)
        for i := range ret {
                if a[2*i] >= 0 {
                        ret[i] = b[a[2*i]:a[2*i+1]]
                }
        }
        return ret
}

// FindSubmatchIndex returns a slice holding the index pairs identifying the
// leftmost match of the regular expression in b and the matches, if any, of
// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
        return re.doExecute(newInputBytes(b), 0)
}

// FindStringSubmatch returns a slice of strings holding the text of the
// leftmost match of the regular expression in s and the matches, if any, of
// its subexpressions, as defined by the 'Submatch' description in the
// package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatch(s string) []string {
        a := re.doExecute(newInputString(s), 0)
        if a == nil {
                return nil
        }
        ret := make([]string, len(a)/2)
        for i := range ret {
                if a[2*i] >= 0 {
                        ret[i] = s[a[2*i]:a[2*i+1]]
                }
        }
        return ret
}

// FindStringSubmatchIndex returns a slice holding the index pairs
// identifying the leftmost match of the regular expression in s and the
// matches, if any, of its subexpressions, as defined by the 'Submatch' and
// 'Index' descriptions in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
        return re.doExecute(newInputString(s), 0)
}

// FindReaderSubmatchIndex returns a slice holding the index pairs
// identifying the leftmost match of the regular expression of text read by
// the RuneReader, and the matches, if any, of its subexpressions, as defined
// by the 'Submatch' and 'Index' descriptions in the package comment.  A
// return value of nil indicates no match.
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
        return re.doExecute(newInputReader(r), 0)
}

const startSize = 10 // The size at which to start a slice in the 'All' routines.

// FindAll is the 'All' version of Find; it returns a slice of all successive
// matches of the expression, as defined by the 'All' description in the
// package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
        if n < 0 {
                n = len(b) + 1
        }
        result := make([][]byte, 0, startSize)
        re.allMatches("", b, n, func(match []int) {
                result = append(result, b[match[0]:match[1]])
        })
        if len(result) == 0 {
                return nil
        }
        return result
}

// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
        if n < 0 {
                n = len(b) + 1
        }
        result := make([][]int, 0, startSize)
        re.allMatches("", b, n, func(match []int) {
                result = append(result, match[0:2])
        })
        if len(result) == 0 {
                return nil
        }
        return result
}

// FindAllString is the 'All' version of FindString; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllString(s string, n int) []string {
        if n < 0 {
                n = len(s) + 1
        }
        result := make([]string, 0, startSize)
        re.allMatches(s, nil, n, func(match []int) {
                result = append(result, s[match[0]:match[1]])
        })
        if len(result) == 0 {
                return nil
        }
        return result
}

// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
// slice of all successive matches of the expression, as defined by the 'All'
// description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
        if n < 0 {
                n = len(s) + 1
        }
        result := make([][]int, 0, startSize)
        re.allMatches(s, nil, n, func(match []int) {
                result = append(result, match[0:2])
        })
        if len(result) == 0 {
                return nil
        }
        return result
}

// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
// of all successive matches of the expression, as defined by the 'All'
// description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
        if n < 0 {
                n = len(b) + 1
        }
        result := make([][][]byte, 0, startSize)
        re.allMatches("", b, n, func(match []int) {
                slice := make([][]byte, len(match)/2)
                for j := range slice {
                        if match[2*j] >= 0 {
                                slice[j] = b[match[2*j]:match[2*j+1]]
                        }
                }
                result = append(result, slice)
        })
        if len(result) == 0 {
                return nil
        }
        return result
}

// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
// a slice of all successive matches of the expression, as defined by the
// 'All' description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
        if n < 0 {
                n = len(b) + 1
        }
        result := make([][]int, 0, startSize)
        re.allMatches("", b, n, func(match []int) {
                result = append(result, match)
        })
        if len(result) == 0 {
                return nil
        }
        return result
}

// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
// returns a slice of all successive matches of the expression, as defined by
// the 'All' description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
        if n < 0 {
                n = len(s) + 1
        }
        result := make([][]string, 0, startSize)
        re.allMatches(s, nil, n, func(match []int) {
                slice := make([]string, len(match)/2)
                for j := range slice {
                        if match[2*j] >= 0 {
                                slice[j] = s[match[2*j]:match[2*j+1]]
                        }
                }
                result = append(result, slice)
        })
        if len(result) == 0 {
                return nil
        }
        return result
}

// FindAllStringSubmatchIndex is the 'All' version of
// FindStringSubmatchIndex; it returns a slice of all successive matches of
// the expression, as defined by the 'All' description in the package
// comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
        if n < 0 {
                n = len(s) + 1
        }
        result := make([][]int, 0, startSize)
        re.allMatches(s, nil, n, func(match []int) {
                result = append(result, match)
        })
        if len(result) == 0 {
                return nil
        }
        return result
}
Go to most recent revision | Compare with Previous | Blame | View Log
Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [old/] [regexp/] [regexp.go] - Rev 809