// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package regexp

import (
        "bufio"
        "compress/bzip2"
        "fmt"
        "io"
        "math/rand"
        "os"
        "path/filepath"
        "regexp/syntax"
        "strconv"
        "strings"
        "testing"
        "unicode/utf8"
)

// TestRE2Search and TestRE2Exhaustive test this package's regexp API
// against test cases considered during RE2's exhaustive tests, which run
// all possible
// regexps over a given set of atoms and operators, up to a given
// complexity, over all possible strings over a given alphabet,
// up to a given size.  Rather than try to link with RE2, we read a
// log file containing the test cases and the expected matches.
// The log file, re2.txt, is generated by running 'make exhaustive-log'
// in the open source RE2 distribution.  http://code.google.com/p/re2/
//
// The test file format is a sequence of stanzas like:
//
//      strings
//      "abc"
//      "123x"
//      regexps
//      "[a-z]+"
//      0-3;0-3
//      -;-
//      "([0-9])([0-9])([0-9])"
//      -;-
//      -;0-3 0-1 1-2 2-3
//
// The stanza begins by defining a set of strings, quoted
// using Go double-quote syntax, one per line.  Then the
// regexps section gives a sequence of regexps to run on
// the strings.  In the block that follows a regexp, each line
// gives the semicolon-separated match results of running
// the regexp on the corresponding string.
// Each match result is either a single -, meaning no match, or a
// space-separated sequence of pairs giving the match and
// submatch indices.  An unmatched subexpression formats
// its pair as a single - (not illustrated above).  For now
// each regexp run produces two match results, one for a
// ``full match'' that restricts the regexp to matching the entire
// string or nothing, and one for a ``partial match'' that gives
// the leftmost first match found in the string.
//
// Lines beginning with # are comments.  Lines beginning with
// a capital letter are test names printed during RE2's test suite
// and are echoed into t but otherwise ignored.
//
// At time of writing, re2.txt is 32 MB but compresses to 760 kB,
// so we store it bzip2-compressed in the repository and decompress it on the fly.
//
func TestRE2Search(t *testing.T) {
        testRE2(t, "testdata/re2-search.txt")
}

func TestRE2Exhaustive(t *testing.T) {
        if testing.Short() {
                t.Log("skipping TestRE2Exhaustive during short test")
                return
        }
        testRE2(t, "testdata/re2-exhaustive.txt.bz2")
}
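
// TestRE2ParseResultExample is an illustrative, self-contained check (added
// for exposition; the name and inputs are not part of the RE2 log) of the
// match-result syntax documented above: a single - means no match, and each
// lo-hi pair gives the indices of the match and its submatches.
func TestRE2ParseResultExample(t *testing.T) {
        // "-" is reported as a nil slice, i.e. no match at all.
        if got := parseResult(t, "example", 0, "-"); got != nil {
                t.Errorf(`parseResult("-") = %v, want nil`, got)
        }
        // "0-3 0-1 1-2 2-3" is the sample result from the comment above:
        // the whole match spans [0,3) and the three submatches are
        // [0,1), [1,2), and [2,3).
        want := []int{0, 3, 0, 1, 1, 2, 2, 3}
        if got := parseResult(t, "example", 0, "0-3 0-1 1-2 2-3"); !same(got, want) {
                t.Errorf(`parseResult("0-3 0-1 1-2 2-3") = %v, want %v`, got, want)
        }
}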

func testRE2(t *testing.T, file string) {
        f, err := os.Open(file)
        if err != nil {
                t.Fatal(err)
        }
        defer f.Close()
        var txt io.Reader
        if strings.HasSuffix(file, ".bz2") {
                z := bzip2.NewReader(f)
                txt = z
                file = file[:len(file)-len(".bz2")] // for error messages
        } else {
                txt = f
        }
        lineno := 0
        r := bufio.NewReader(txt)
        var (
                str       []string
                input     []string
                inStrings bool
                re        *Regexp
                refull    *Regexp
                nfail     int
                ncase     int
        )
        for {
                line, err := r.ReadString('\n')
                if err != nil {
                        if err == io.EOF {
                                break
                        }
                        t.Fatalf("%s:%d: %v", file, lineno, err)
                }
                line = line[:len(line)-1] // chop \n
                lineno++
                switch {
                case line == "":
                        t.Fatalf("%s:%d: unexpected blank line", file, lineno)
                case line[0] == '#':
                        continue
                case 'A' <= line[0] && line[0] <= 'Z':
                        // Test name.
                        t.Logf("%s\n", line)
                        continue
                case line == "strings":
                        str = str[:0]
                        inStrings = true
                case line == "regexps":
                        inStrings = false
                case line[0] == '"':
                        q, err := strconv.Unquote(line)
                        if err != nil {
                                // Fatal because we'll get out of sync.
                                t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
                        }
                        if inStrings {
                                str = append(str, q)
                                continue
                        }
                        // Is a regexp.
                        if len(input) != 0 {
                                t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
                        }
                        re, err = tryCompile(q)
                        if err != nil {
                                if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
                                        // We don't and likely never will support \C; keep going.
                                        continue
                                }
                                t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
                                if nfail++; nfail >= 100 {
                                        t.Fatalf("stopping after %d errors", nfail)
                                }
                                continue
                        }
                        full := `\A(?:` + q + `)\z`
                        refull, err = tryCompile(full)
                        if err != nil {
                                // Fatal because q worked, so this should always work.
                                t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
                        }
                        input = str
                case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
                        // A sequence of match results.
                        ncase++
                        if re == nil {
                                // Failed to compile: skip results.
                                continue
                        }
                        if len(input) == 0 {
                                t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
                        }
                        var text string
                        text, input = input[0], input[1:]
                        if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
                                // RE2's \B considers every byte position,
                                // so it sees 'not word boundary' in the
                                // middle of UTF-8 sequences.  This package
                                // only considers the positions between runes,
                                // so it disagrees.  Skip those cases.
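                                // For example, between the two bytes of a
                                // multi-byte rune neither byte is a word
                                // character, so RE2 reports a \B there, while
                                // this package never considers that offset.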
                                continue
                        }
                        res := strings.Split(line, ";")
                        if len(res) != len(run) {
                                t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
                        }
                        for i := range res {
                                have, suffix := run[i](re, refull, text)
                                want := parseResult(t, file, lineno, res[i])
                                if !same(have, want) {
                                        t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
                                        if nfail++; nfail >= 100 {
                                                t.Fatalf("stopping after %d errors", nfail)
                                        }
                                        continue
                                }
                                b, suffix := match[i](re, refull, text)
                                if b != (want != nil) {
                                        t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
                                        if nfail++; nfail >= 100 {
                                                t.Fatalf("stopping after %d errors", nfail)
                                        }
                                        continue
                                }
                        }

                default:
                        t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
                }
        }
        if len(input) != 0 {
                t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
        }
        t.Logf("%d cases tested", ncase)
}

var run = []func(*Regexp, *Regexp, string) ([]int, string){
        runFull,
        runPartial,
        runFullLongest,
        runPartialLongest,
}

func runFull(re, refull *Regexp, text string) ([]int, string) {
        refull.longest = false
        return refull.FindStringSubmatchIndex(text), "[full]"
}

func runPartial(re, refull *Regexp, text string) ([]int, string) {
        re.longest = false
        return re.FindStringSubmatchIndex(text), ""
}

func runFullLongest(re, refull *Regexp, text string) ([]int, string) {
        refull.longest = true
        return refull.FindStringSubmatchIndex(text), "[full,longest]"
}

func runPartialLongest(re, refull *Regexp, text string) ([]int, string) {
        re.longest = true
        return re.FindStringSubmatchIndex(text), "[longest]"
}

var match = []func(*Regexp, *Regexp, string) (bool, string){
        matchFull,
        matchPartial,
        matchFullLongest,
        matchPartialLongest,
}

func matchFull(re, refull *Regexp, text string) (bool, string) {
        refull.longest = false
        return refull.MatchString(text), "[full]"
}

func matchPartial(re, refull *Regexp, text string) (bool, string) {
        re.longest = false
        return re.MatchString(text), ""
}

func matchFullLongest(re, refull *Regexp, text string) (bool, string) {
        refull.longest = true
        return refull.MatchString(text), "[full,longest]"
}

func matchPartialLongest(re, refull *Regexp, text string) (bool, string) {
        re.longest = true
        return re.MatchString(text), "[longest]"
}

func isSingleBytes(s string) bool {
        for _, c := range s {
                if c >= utf8.RuneSelf {
                        return false
                }
        }
        return true
}

func tryCompile(s string) (re *Regexp, err error) {
        // Protect against panic during Compile.
        defer func() {
                if r := recover(); r != nil {
                        err = fmt.Errorf("panic: %v", r)
                }
        }()
        return Compile(s)
}

func parseResult(t *testing.T, file string, lineno int, res string) []int {
        // A single - indicates no match.
        if res == "-" {
                return nil
        }
        // Otherwise, a space-separated list of pairs.
        n := 1
        for j := 0; j < len(res); j++ {
                if res[j] == ' ' {
                        n++
                }
        }
        out := make([]int, 2*n)
        i := 0
        n = 0
        for j := 0; j <= len(res); j++ {
                if j == len(res) || res[j] == ' ' {
                        // Process a single pair.  - means no submatch.
                        pair := res[i:j]
                        if pair == "-" {
                                out[n] = -1
                                out[n+1] = -1
                        } else {
                                k := strings.Index(pair, "-")
                                if k < 0 {
                                        t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
                                }
                                lo, err1 := strconv.Atoi(pair[:k])
                                hi, err2 := strconv.Atoi(pair[k+1:])
                                if err1 != nil || err2 != nil || lo > hi {
                                        t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
                                }
                                out[n] = lo
                                out[n+1] = hi
                        }
                        n += 2
                        i = j + 1
                }
        }
        return out
}

func same(x, y []int) bool {
        if len(x) != len(y) {
                return false
        }
        for i, xi := range x {
                if xi != y[i] {
                        return false
                }
        }
        return true
}

// TestFowler runs this package's regexp API against the
// POSIX regular expression tests collected by Glenn Fowler
// at http://www2.research.att.com/~gsf/testregex/.
func TestFowler(t *testing.T) {
        files, err := filepath.Glob("testdata/*.dat")
        if err != nil {
                t.Fatal(err)
        }
        for _, file := range files {
                t.Log(file)
                testFowler(t, file)
        }
}

var notab = MustCompilePOSIX(`[^\t]+`)

func testFowler(t *testing.T, file string) {
        f, err := os.Open(file)
        if err != nil {
                t.Error(err)
                return
        }
        defer f.Close()
        b := bufio.NewReader(f)
        lineno := 0
        lastRegexp := ""
Reading:
        for {
                lineno++
                line, err := b.ReadString('\n')
                if err != nil {
                        if err != io.EOF {
                                t.Errorf("%s:%d: %v", file, lineno, err)
                        }
                        break Reading
                }

                // http://www2.research.att.com/~gsf/man/man1/testregex.html
                //
                // INPUT FORMAT
                //   Input lines may be blank, a comment beginning with #, or a test
                //   specification. A specification is five fields separated by one
                //   or more tabs. NULL denotes the empty string and NIL denotes the
                //   0 pointer.
                if line[0] == '#' || line[0] == '\n' {
                        continue Reading
                }
                line = line[:len(line)-1]
                field := notab.FindAllString(line, -1)
                for i, f := range field {
                        if f == "NULL" {
                                field[i] = ""
                        }
                        if f == "NIL" {
                                t.Logf("%s:%d: skip: %s", file, lineno, line)
                                continue Reading
                        }
                }
                if len(field) == 0 {
                        continue Reading
                }

                //   Field 1: the regex(3) flags to apply, one character per REG_feature
                //   flag. The test is skipped if REG_feature is not supported by the
                //   implementation. If the first character is not [BEASKLP] then the
                //   specification is a global control line. One or more of [BEASKLP] may be
                //   specified; the test will be repeated for each mode.
                // 
                //     B        basic                   BRE     (grep, ed, sed)
                //     E        REG_EXTENDED            ERE     (egrep)
                //     A        REG_AUGMENTED           ARE     (egrep with negation)
                //     S        REG_SHELL               SRE     (sh glob)
                //     K        REG_SHELL|REG_AUGMENTED KRE     (ksh glob)
                //     L        REG_LITERAL             LRE     (fgrep)
                // 
                //     a        REG_LEFT|REG_RIGHT      implicit ^...$
                //     b        REG_NOTBOL              lhs does not match ^
                //     c        REG_COMMENT             ignore space and #...\n
                //     d        REG_SHELL_DOT           explicit leading . match
                //     e        REG_NOTEOL              rhs does not match $
                //     f        REG_MULTIPLE            multiple \n separated patterns
                //     g        FNM_LEADING_DIR         testfnmatch only -- match until /
                //     h        REG_MULTIREF            multiple digit backref
                //     i        REG_ICASE               ignore case
                //     j        REG_SPAN                . matches \n
                //     k        REG_ESCAPE              \ to escape [...] delimiter
                //     l        REG_LEFT                implicit ^...
                //     m        REG_MINIMAL             minimal match
                //     n        REG_NEWLINE             explicit \n match
                //     o        REG_ENCLOSED            (|&) magic inside [@|&](...)
                //     p        REG_SHELL_PATH          explicit / match
                //     q        REG_DELIMITED           delimited pattern
                //     r        REG_RIGHT               implicit ...$
                //     s        REG_SHELL_ESCAPED       \ not special
                //     t        REG_MUSTDELIM           all delimiters must be specified
                //     u        standard unspecified behavior -- errors not counted
                //     v        REG_CLASS_ESCAPE        \ special inside [...]
                //     w        REG_NOSUB               no subexpression match array
                //     x        REG_LENIENT             let some errors slide
                //     y        REG_LEFT                regexec() implicit ^...
                //     z        REG_NULL                NULL subexpressions ok
                //     $                                expand C \c escapes in fields 2 and 3
                //     /                                field 2 is a regsubcomp() expression
                //     =                                field 3 is a regdecomp() expression
                // 
                //   Field 1 control lines:
                // 
                //     C                set LC_COLLATE and LC_CTYPE to locale in field 2
                // 
                //     ?test ...        output field 5 if passed and != EXPECTED, silent otherwise
                //     &test ...        output field 5 if current and previous passed
                //     |test ...        output field 5 if current passed and previous failed
                //     ; ...    output field 2 if previous failed
                //     {test ...        skip if failed until }
                //     }                end of skip
                // 
                //     : comment                comment copied as output NOTE
                //     :comment:test    :comment: ignored
                //     N[OTE] comment   comment copied as output NOTE
                //     T[EST] comment   comment
                // 
                //     number           use number for nmatch (20 by default)
                flag := field[0]
                switch flag[0] {
                case '?', '&', '|', ';', '{', '}':
                        // Ignore all the control operators.
                        // Just run everything.
                        flag = flag[1:]
                        if flag == "" {
                                continue Reading
                        }
                case ':':
                        i := strings.Index(flag[1:], ":")
                        if i < 0 {
                                t.Logf("skip: %s", line)
                                continue Reading
                        }
                        flag = flag[1+i+1:]
                case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
                        t.Logf("skip: %s", line)
                        continue Reading
                }

                // Can check field count now that we've handled the myriad comment formats.
                if len(field) < 4 {
                        t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
                        continue Reading
                }

                // Expand C escapes (a.k.a. Go escapes).
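                // For example, a field containing the two characters \ and n
                // becomes a single newline byte after strconv.Unquote.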
                if strings.Contains(flag, "$") {
                        f := `"` + field[1] + `"`
                        if field[1], err = strconv.Unquote(f); err != nil {
                                t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
                        }
                        f = `"` + field[2] + `"`
                        if field[2], err = strconv.Unquote(f); err != nil {
                                t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
                        }
                }

                //   Field 2: the regular expression pattern; SAME uses the pattern from
                //     the previous specification.
                // 
                if field[1] == "SAME" {
                        field[1] = lastRegexp
                }
                lastRegexp = field[1]

                //   Field 3: the string to match.
                text := field[2]

                //   Field 4: the test outcome...
                ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
                if !ok {
                        t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
                        continue Reading
                }

                //   Field 5: optional comment appended to the report.

        Testing:
                // Run test once for each specified capital letter mode that we support.
                for _, c := range flag {
                        pattern := field[1]
                        syn := syntax.POSIX | syntax.ClassNL
                        switch c {
                        default:
                                continue Testing
                        case 'E':
                                // extended regexp (what we support)
                        case 'L':
                                // literal
                                pattern = QuoteMeta(pattern)
                        }

                        for _, c := range flag {
                                switch c {
                                case 'i':
                                        syn |= syntax.FoldCase
                                }
                        }

                        re, err := compile(pattern, syn, true)
                        if err != nil {
                                if shouldCompile {
                                        t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
                                }
                                continue Testing
                        }
                        if !shouldCompile {
                                t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
                                continue Testing
                        }
                        match := re.MatchString(text)
                        if match != shouldMatch {
                                t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
                                continue Testing
                        }
                        have := re.FindStringSubmatchIndex(text)
                        if (len(have) > 0) != match {
                                t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have)
                                continue Testing
                        }
                        if len(have) > len(pos) {
                                have = have[:len(pos)]
                        }
                        if !same(have, pos) {
                                t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
                        }
                }
        }
}
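
// TestFowlerFieldSplitExample is an illustrative sketch (the sample line is
// made up for exposition) of the field splitting described above: notab keeps
// the maximal runs of non-tab characters, so a specification line splits into
// its tab-separated fields.
func TestFowlerFieldSplitExample(t *testing.T) {
        line := "E\ta+\taaa\t(0,3)" // hypothetical spec: flags, regexp, text, outcome
        field := notab.FindAllString(line, -1)
        want := []string{"E", "a+", "aaa", "(0,3)"}
        if len(field) != len(want) {
                t.Fatalf("FindAllString(%q) = %v, want %v", line, field, want)
        }
        for i := range want {
                if field[i] != want[i] {
                        t.Errorf("field[%d] = %q, want %q", i, field[i], want[i])
                }
        }
}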

func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
        //   Field 4: the test outcome. This is either one of the posix error
        //     codes (with REG_ omitted) or the match array, a list of (m,n)
        //     entries with m and n being first and last+1 positions in the
        //     field 3 string, or NULL if REG_NOSUB is in effect and success
        //     is expected. BADPAT is acceptable in place of any regcomp(3)
        //     error code. The match[] array is initialized to (-2,-2) before
        //     each test. All array elements from 0 to nmatch-1 must be specified
        //     in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
        //     Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
        //     matched (?{...}) expression, where x is the text enclosed by {...},
        //     o is the expression ordinal counting from 1, and n is the length of
        //     the unmatched portion of the subject string. If x starts with a
        //     number then that is the return value of re_execf(), otherwise 0 is
        //     returned.
        switch {
        case s == "":
                // Match with no position information.
                ok = true
                compiled = true
                matched = true
                return
        case s == "NOMATCH":
                // Match failure.
                ok = true
                compiled = true
                matched = false
                return
        case 'A' <= s[0] && s[0] <= 'Z':
                // All the other error codes are compile errors.
                ok = true
                compiled = false
                return
        }
        compiled = true

        var x []int
        for s != "" {
                var end byte = ')'
                if len(x)%2 == 0 {
                        if s[0] != '(' {
                                ok = false
                                return
                        }
                        s = s[1:]
                        end = ','
                }
                i := 0
                for i < len(s) && s[i] != end {
                        i++
                }
                if i == 0 || i == len(s) {
                        ok = false
                        return
                }
                var v = -1
                var err error
                if s[:i] != "?" {
                        v, err = strconv.Atoi(s[:i])
                        if err != nil {
                                ok = false
                                return
                        }
                }
                x = append(x, v)
                s = s[i+1:]
        }
        if len(x)%2 != 0 {
                ok = false
                return
        }
        ok = true
        matched = true
        pos = x
        return
}
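
// TestParseFowlerResultExample is an illustrative check (the inputs are made
// up, following the outcome syntax documented above) of parseFowlerResult:
// "(0,3)(?,?)" is a match spanning [0,3) with an unspecified subexpression
// (reported as -1,-1), and "NOMATCH" compiles but fails to match.
func TestParseFowlerResultExample(t *testing.T) {
        ok, compiled, matched, pos := parseFowlerResult("(0,3)(?,?)")
        if !ok || !compiled || !matched || !same(pos, []int{0, 3, -1, -1}) {
                t.Errorf("parseFowlerResult((0,3)(?,?)) = %v, %v, %v, %v", ok, compiled, matched, pos)
        }
        if ok, compiled, matched, _ := parseFowlerResult("NOMATCH"); !ok || !compiled || matched {
                t.Errorf("parseFowlerResult(NOMATCH) = %v, %v, %v", ok, compiled, matched)
        }
}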

var text []byte

func makeText(n int) []byte {
        if len(text) >= n {
                return text[:n]
        }
        text = make([]byte, n)
        for i := range text {
                if rand.Intn(30) == 0 {
                        text[i] = '\n'
                } else {
                        text[i] = byte(rand.Intn(0x7E+1-0x20) + 0x20)
                }
        }
        return text
}

func benchmark(b *testing.B, re string, n int) {
        r := MustCompile(re)
        t := makeText(n)
        b.ResetTimer()
        b.SetBytes(int64(n))
        for i := 0; i < b.N; i++ {
                if r.Match(t) {
                        b.Fatal("match!")
                }
        }
}

const (
        easy0  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
        easy1  = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
        medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
        hard   = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
        parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" +
                "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
)

func BenchmarkMatchEasy0_32(b *testing.B)   { benchmark(b, easy0, 32<<0) }
func BenchmarkMatchEasy0_1K(b *testing.B)   { benchmark(b, easy0, 1<<10) }
func BenchmarkMatchEasy0_32K(b *testing.B)  { benchmark(b, easy0, 32<<10) }
func BenchmarkMatchEasy0_1M(b *testing.B)   { benchmark(b, easy0, 1<<20) }
func BenchmarkMatchEasy0_32M(b *testing.B)  { benchmark(b, easy0, 32<<20) }
func BenchmarkMatchEasy1_32(b *testing.B)   { benchmark(b, easy1, 32<<0) }
func BenchmarkMatchEasy1_1K(b *testing.B)   { benchmark(b, easy1, 1<<10) }
func BenchmarkMatchEasy1_32K(b *testing.B)  { benchmark(b, easy1, 32<<10) }
func BenchmarkMatchEasy1_1M(b *testing.B)   { benchmark(b, easy1, 1<<20) }
func BenchmarkMatchEasy1_32M(b *testing.B)  { benchmark(b, easy1, 32<<20) }
func BenchmarkMatchMedium_32(b *testing.B)  { benchmark(b, medium, 32<<0) }
func BenchmarkMatchMedium_1K(b *testing.B)  { benchmark(b, medium, 1<<10) }
func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) }
func BenchmarkMatchMedium_1M(b *testing.B)  { benchmark(b, medium, 1<<20) }
func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) }
func BenchmarkMatchHard_32(b *testing.B)    { benchmark(b, hard, 32<<0) }
func BenchmarkMatchHard_1K(b *testing.B)    { benchmark(b, hard, 1<<10) }
func BenchmarkMatchHard_32K(b *testing.B)   { benchmark(b, hard, 32<<10) }
func BenchmarkMatchHard_1M(b *testing.B)    { benchmark(b, hard, 1<<20) }
func BenchmarkMatchHard_32M(b *testing.B)   { benchmark(b, hard, 32<<20) }
