// Copyright 2011 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
 | 
      
         | 4 |  |  |  
 | 
      
         | 5 |  |  | package syntax_test
 | 
      
         | 6 |  |  |  
 | 
      
         | 7 |  |  | import (
 | 
      
         | 8 |  |  |         "bytes"
 | 
      
         | 9 |  |  |         "fmt"
 | 
      
         | 10 |  |  |         . "regexp/syntax"
 | 
      
         | 11 |  |  |         "testing"
 | 
      
         | 12 |  |  |         "unicode"
 | 
      
         | 13 |  |  | )
 | 
      
         | 14 |  |  |  
 | 
      
         | 15 |  |  | type parseTest struct {
 | 
      
         | 16 |  |  |         Regexp string
 | 
      
         | 17 |  |  |         Dump   string
 | 
      
         | 18 |  |  | }
 | 
      
         | 19 |  |  |  
 | 
      
         | 20 |  |  | var parseTests = []parseTest{
 | 
      
         | 21 |  |  |         // Base cases
 | 
      
         | 22 |  |  |         {`a`, `lit{a}`},
 | 
      
         | 23 |  |  |         {`a.`, `cat{lit{a}dot{}}`},
 | 
      
         | 24 |  |  |         {`a.b`, `cat{lit{a}dot{}lit{b}}`},
 | 
      
         | 25 |  |  |         {`ab`, `str{ab}`},
 | 
      
         | 26 |  |  |         {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
 | 
      
         | 27 |  |  |         {`abc`, `str{abc}`},
 | 
      
         | 28 |  |  |         {`a|^`, `alt{lit{a}bol{}}`},
 | 
      
         | 29 |  |  |         {`a|b`, `cc{0x61-0x62}`},
 | 
      
         | 30 |  |  |         {`(a)`, `cap{lit{a}}`},
 | 
      
         | 31 |  |  |         {`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
 | 
      
         | 32 |  |  |         {`a*`, `star{lit{a}}`},
 | 
      
         | 33 |  |  |         {`a+`, `plus{lit{a}}`},
 | 
      
         | 34 |  |  |         {`a?`, `que{lit{a}}`},
 | 
      
         | 35 |  |  |         {`a{2}`, `rep{2,2 lit{a}}`},
 | 
      
         | 36 |  |  |         {`a{2,3}`, `rep{2,3 lit{a}}`},
 | 
      
         | 37 |  |  |         {`a{2,}`, `rep{2,-1 lit{a}}`},
 | 
      
         | 38 |  |  |         {`a*?`, `nstar{lit{a}}`},
 | 
      
         | 39 |  |  |         {`a+?`, `nplus{lit{a}}`},
 | 
      
         | 40 |  |  |         {`a??`, `nque{lit{a}}`},
 | 
      
         | 41 |  |  |         {`a{2}?`, `nrep{2,2 lit{a}}`},
 | 
      
         | 42 |  |  |         {`a{2,3}?`, `nrep{2,3 lit{a}}`},
 | 
      
         | 43 |  |  |         {`a{2,}?`, `nrep{2,-1 lit{a}}`},
 | 
      
         | 44 |  |  |         // Malformed { } are treated as literals.
 | 
      
         | 45 |  |  |         {`x{1001`, `str{x{1001}`},
 | 
      
         | 46 |  |  |         {`x{9876543210`, `str{x{9876543210}`},
 | 
      
         | 47 |  |  |         {`x{9876543210,`, `str{x{9876543210,}`},
 | 
      
         | 48 |  |  |         {`x{2,1`, `str{x{2,1}`},
 | 
      
         | 49 |  |  |         {`x{1,9876543210`, `str{x{1,9876543210}`},
 | 
      
         | 50 |  |  |         {``, `emp{}`},
 | 
      
         | 51 |  |  |         {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
 | 
      
         | 52 |  |  |         {`|x|`, `alt{emp{}lit{x}emp{}}`},
 | 
      
         | 53 |  |  |         {`.`, `dot{}`},
 | 
      
         | 54 |  |  |         {`^`, `bol{}`},
 | 
      
         | 55 |  |  |         {`$`, `eol{}`},
 | 
      
         | 56 |  |  |         {`\|`, `lit{|}`},
 | 
      
         | 57 |  |  |         {`\(`, `lit{(}`},
 | 
      
         | 58 |  |  |         {`\)`, `lit{)}`},
 | 
      
         | 59 |  |  |         {`\*`, `lit{*}`},
 | 
      
         | 60 |  |  |         {`\+`, `lit{+}`},
 | 
      
         | 61 |  |  |         {`\?`, `lit{?}`},
 | 
      
         | 62 |  |  |         {`{`, `lit{{}`},
 | 
      
         | 63 |  |  |         {`}`, `lit{}}`},
 | 
      
         | 64 |  |  |         {`\.`, `lit{.}`},
 | 
      
         | 65 |  |  |         {`\^`, `lit{^}`},
 | 
      
         | 66 |  |  |         {`\$`, `lit{$}`},
 | 
      
         | 67 |  |  |         {`\\`, `lit{\}`},
 | 
      
         | 68 |  |  |         {`[ace]`, `cc{0x61 0x63 0x65}`},
 | 
      
         | 69 |  |  |         {`[abc]`, `cc{0x61-0x63}`},
 | 
      
         | 70 |  |  |         {`[a-z]`, `cc{0x61-0x7a}`},
 | 
      
         | 71 |  |  |         {`[a]`, `lit{a}`},
 | 
      
         | 72 |  |  |         {`\-`, `lit{-}`},
 | 
      
         | 73 |  |  |         {`-`, `lit{-}`},
 | 
      
         | 74 |  |  |         {`\_`, `lit{_}`},
 | 
      
         | 75 |  |  |         {`abc`, `str{abc}`},
 | 
      
         | 76 |  |  |         {`abc|def`, `alt{str{abc}str{def}}`},
 | 
      
         | 77 |  |  |         {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
 | 
      
         | 78 |  |  |  
 | 
      
         | 79 |  |  |         // Posix and Perl extensions
 | 
      
         | 80 |  |  |         {`[[:lower:]]`, `cc{0x61-0x7a}`},
 | 
      
         | 81 |  |  |         {`[a-z]`, `cc{0x61-0x7a}`},
 | 
      
         | 82 |  |  |         {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
 | 
      
         | 83 |  |  |         {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
 | 
      
         | 84 |  |  |         {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
 | 
      
         | 85 |  |  |         {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
 | 
      
         | 86 |  |  |         {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
 | 
      
         | 87 |  |  |         {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
 | 
      
         | 88 |  |  |         {`\d`, `cc{0x30-0x39}`},
 | 
      
         | 89 |  |  |         {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
 | 
      
         | 90 |  |  |         {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
 | 
      
         | 91 |  |  |         {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
 | 
      
         | 92 |  |  |         {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
 | 
      
         | 93 |  |  |         {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
 | 
      
         | 94 |  |  |         {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
 | 
      
         | 95 |  |  |         {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
 | 
      
         | 96 |  |  |         {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
 | 
      
         | 97 |  |  |         //      { `\C`, `byte{}` },  // probably never
 | 
      
         | 98 |  |  |  
 | 
      
         | 99 |  |  |         // Unicode, negatives, and a double negative.
 | 
      
         | 100 |  |  |         {`\p{Braille}`, `cc{0x2800-0x28ff}`},
 | 
      
         | 101 |  |  |         {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
 | 
      
         | 102 |  |  |         {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
 | 
      
         | 103 |  |  |         {`\P{^Braille}`, `cc{0x2800-0x28ff}`},
 | 
      
         | 104 |  |  |         {`\pZ`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
 | 
      
         | 105 |  |  |         {`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
 | 
      
         | 106 |  |  |         {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
 | 
      
         | 107 |  |  |         {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
 | 
      
         | 108 |  |  |         {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
 | 
      
         | 109 |  |  |         {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
 | 
      
         | 110 |  |  |         {`\p{Lu}`, mkCharClass(unicode.IsUpper)},
 | 
      
         | 111 |  |  |         {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
 | 
      
         | 112 |  |  |         {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
 | 
      
         | 113 |  |  |         {`\p{Any}`, `dot{}`},
 | 
      
         | 114 |  |  |         {`\p{^Any}`, `cc{}`},
 | 
      
         | 115 |  |  |  
 | 
      
         | 116 |  |  |         // Hex, octal.
 | 
      
         | 117 |  |  |         {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
 | 
      
         | 118 |  |  |         {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
 | 
      
         | 119 |  |  |  
 | 
      
         | 120 |  |  |         // More interesting regular expressions.
 | 
      
         | 121 |  |  |         {`a{,2}`, `str{a{,2}}`},
 | 
      
         | 122 |  |  |         {`\.\^\$\\`, `str{.^$\}`},
 | 
      
         | 123 |  |  |         {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
 | 
      
         | 124 |  |  |         {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
 | 
      
         | 125 |  |  |         {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
 | 
      
         | 126 |  |  |         {`a*{`, `cat{star{lit{a}}lit{{}}`},
 | 
      
         | 127 |  |  |  
 | 
      
         | 128 |  |  |         // Test precedences
 | 
      
         | 129 |  |  |         {`(?:ab)*`, `star{str{ab}}`},
 | 
      
         | 130 |  |  |         {`(ab)*`, `star{cap{str{ab}}}`},
 | 
      
         | 131 |  |  |         {`ab|cd`, `alt{str{ab}str{cd}}`},
 | 
      
         | 132 |  |  |         {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
 | 
      
         | 133 |  |  |  
 | 
      
         | 134 |  |  |         // Test flattening.
 | 
      
         | 135 |  |  |         {`(?:a)`, `lit{a}`},
 | 
      
         | 136 |  |  |         {`(?:ab)(?:cd)`, `str{abcd}`},
 | 
      
         | 137 |  |  |         {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
 | 
      
         | 138 |  |  |         {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
 | 
      
         | 139 |  |  |         {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
 | 
      
         | 140 |  |  |         {`a|.`, `dot{}`},
 | 
      
         | 141 |  |  |         {`.|a`, `dot{}`},
 | 
      
         | 142 |  |  |         {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
 | 
      
         | 143 |  |  |         {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
 | 
      
         | 144 |  |  |  
 | 
      
         | 145 |  |  |         // Test Perl quoted literals
 | 
      
         | 146 |  |  |         {`\Q+|*?{[\E`, `str{+|*?{[}`},
 | 
      
         | 147 |  |  |         {`\Q+\E+`, `plus{lit{+}}`},
 | 
      
         | 148 |  |  |         {`\Q\\E`, `lit{\}`},
 | 
      
         | 149 |  |  |         {`\Q\\\E`, `str{\\}`},
 | 
      
         | 150 |  |  |  
 | 
      
         | 151 |  |  |         // Test Perl \A and \z
 | 
      
         | 152 |  |  |         {`(?m)^`, `bol{}`},
 | 
      
         | 153 |  |  |         {`(?m)$`, `eol{}`},
 | 
      
         | 154 |  |  |         {`(?-m)^`, `bot{}`},
 | 
      
         | 155 |  |  |         {`(?-m)$`, `eot{}`},
 | 
      
         | 156 |  |  |         {`(?m)\A`, `bot{}`},
 | 
      
         | 157 |  |  |         {`(?m)\z`, `eot{\z}`},
 | 
      
         | 158 |  |  |         {`(?-m)\A`, `bot{}`},
 | 
      
         | 159 |  |  |         {`(?-m)\z`, `eot{\z}`},
 | 
      
         | 160 |  |  |  
 | 
      
         | 161 |  |  |         // Test named captures
 | 
      
         | 162 |  |  |         {`(?Pa)`, `cap{name:lit{a}}`},
 | 
      
         | 163 |  |  |  
 | 
      
         | 164 |  |  |         // Case-folded literals
 | 
      
         | 165 |  |  |         {`[Aa]`, `litfold{A}`},
 | 
      
         | 166 |  |  |         {`[\x{100}\x{101}]`, `litfold{Ä€}`},
 | 
      
         | 167 |  |  |         {`[Δδ]`, `litfold{Δ}`},
 | 
      
         | 168 |  |  |  
 | 
      
         | 169 |  |  |         // Strings
 | 
      
         | 170 |  |  |         {`abcde`, `str{abcde}`},
 | 
      
         | 171 |  |  |         {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
 | 
      
         | 172 |  |  |  
 | 
      
         | 173 |  |  |         // Factoring.
 | 
      
         | 174 |  |  |         {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
 | 
      
         | 175 |  |  |         {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
 | 
      
         | 176 |  |  |  
 | 
      
         | 177 |  |  |         // Bug fixes.
 | 
      
         | 178 |  |  |         {`(?:.)`, `dot{}`},
 | 
      
         | 179 |  |  |         {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
 | 
      
         | 180 |  |  |         {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
 | 
      
         | 181 |  |  |         {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
 | 
      
         | 182 |  |  |         {`(?:A|a)`, `litfold{A}`},
 | 
      
         | 183 |  |  |         {`A|(?:A|a)`, `litfold{A}`},
 | 
      
         | 184 |  |  |         {`(?s).`, `dot{}`},
 | 
      
         | 185 |  |  |         {`(?-s).`, `dnl{}`},
 | 
      
         | 186 |  |  |         {`(?:(?:^).)`, `cat{bol{}dot{}}`},
 | 
      
         | 187 |  |  |         {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
 | 
      
         | 188 |  |  |  
 | 
      
         | 189 |  |  |         // RE2 prefix_tests
 | 
      
         | 190 |  |  |         {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
 | 
      
         | 191 |  |  |         {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
 | 
      
         | 192 |  |  |         {`abc|abd|aef|bcx|bcy`,
 | 
      
         | 193 |  |  |                 `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
 | 
      
         | 194 |  |  |                         `cat{str{bc}cc{0x78-0x79}}}`},
 | 
      
         | 195 |  |  |         {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
 | 
      
         | 196 |  |  |         {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
 | 
      
         | 197 |  |  |         {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
 | 
      
         | 198 |  |  |         {`(?:xx|yy)c|(?:xx|yy)d`,
 | 
      
         | 199 |  |  |                 `cat{alt{str{xx}str{yy}}cc{0x63-0x64}}`},
 | 
      
         | 200 |  |  |         {`x{2}|x{2}[0-9]`,
 | 
      
         | 201 |  |  |                 `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
 | 
      
         | 202 |  |  |         {`x{2}y|x{2}[0-9]y`,
 | 
      
         | 203 |  |  |                 `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
 | 
      
         | 204 |  |  | }
 | 
      
         | 205 |  |  |  
 | 
      
         | 206 |  |  | const testFlags = MatchNL | PerlX | UnicodeGroups
 | 
      
         | 207 |  |  |  
 | 
      
         | 208 |  |  | func TestParseSimple(t *testing.T) {
 | 
      
         | 209 |  |  |         testParseDump(t, parseTests, testFlags)
 | 
      
         | 210 |  |  | }
 | 
      
         | 211 |  |  |  
 | 
      
         | 212 |  |  | var foldcaseTests = []parseTest{
 | 
      
         | 213 |  |  |         {`AbCdE`, `strfold{ABCDE}`},
 | 
      
         | 214 |  |  |         {`[Aa]`, `litfold{A}`},
 | 
      
         | 215 |  |  |         {`a`, `litfold{A}`},
 | 
      
         | 216 |  |  |  
 | 
      
         | 217 |  |  |         // 0x17F is an old English long s (looks like an f) and folds to s.
 | 
      
         | 218 |  |  |         // 0x212A is the Kelvin symbol and folds to k.
 | 
      
         | 219 |  |  |         {`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
 | 
      
         | 220 |  |  |         {`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
 | 
      
         | 221 |  |  |         {`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
 | 
      
         | 222 |  |  | }
 | 
      
         | 223 |  |  |  
 | 
      
         | 224 |  |  | func TestParseFoldCase(t *testing.T) {
 | 
      
         | 225 |  |  |         testParseDump(t, foldcaseTests, FoldCase)
 | 
      
         | 226 |  |  | }
 | 
      
         | 227 |  |  |  
 | 
      
         | 228 |  |  | var literalTests = []parseTest{
 | 
      
         | 229 |  |  |         {"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
 | 
      
         | 230 |  |  | }
 | 
      
         | 231 |  |  |  
 | 
      
         | 232 |  |  | func TestParseLiteral(t *testing.T) {
 | 
      
         | 233 |  |  |         testParseDump(t, literalTests, Literal)
 | 
      
         | 234 |  |  | }
 | 
      
         | 235 |  |  |  
 | 
      
         | 236 |  |  | var matchnlTests = []parseTest{
 | 
      
         | 237 |  |  |         {`.`, `dot{}`},
 | 
      
         | 238 |  |  |         {"\n", "lit{\n}"},
 | 
      
         | 239 |  |  |         {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
 | 
      
         | 240 |  |  |         {`[a\n]`, `cc{0xa 0x61}`},
 | 
      
         | 241 |  |  | }
 | 
      
         | 242 |  |  |  
 | 
      
         | 243 |  |  | func TestParseMatchNL(t *testing.T) {
 | 
      
         | 244 |  |  |         testParseDump(t, matchnlTests, MatchNL)
 | 
      
         | 245 |  |  | }
 | 
      
         | 246 |  |  |  
 | 
      
         | 247 |  |  | var nomatchnlTests = []parseTest{
 | 
      
         | 248 |  |  |         {`.`, `dnl{}`},
 | 
      
         | 249 |  |  |         {"\n", "lit{\n}"},
 | 
      
         | 250 |  |  |         {`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
 | 
      
         | 251 |  |  |         {`[a\n]`, `cc{0xa 0x61}`},
 | 
      
         | 252 |  |  | }
 | 
      
         | 253 |  |  |  
 | 
      
         | 254 |  |  | func TestParseNoMatchNL(t *testing.T) {
 | 
      
         | 255 |  |  |         testParseDump(t, nomatchnlTests, 0)
 | 
      
         | 256 |  |  | }
 | 
      
         | 257 |  |  |  
 | 
      
         | 258 |  |  | // Test Parse -> Dump.
 | 
      
         | 259 |  |  | func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
 | 
      
         | 260 |  |  |         for _, tt := range tests {
 | 
      
         | 261 |  |  |                 re, err := Parse(tt.Regexp, flags)
 | 
      
         | 262 |  |  |                 if err != nil {
 | 
      
         | 263 |  |  |                         t.Errorf("Parse(%#q): %v", tt.Regexp, err)
 | 
      
         | 264 |  |  |                         continue
 | 
      
         | 265 |  |  |                 }
 | 
      
         | 266 |  |  |                 d := dump(re)
 | 
      
         | 267 |  |  |                 if d != tt.Dump {
 | 
      
         | 268 |  |  |                         t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
 | 
      
         | 269 |  |  |                 }
 | 
      
         | 270 |  |  |         }
 | 
      
         | 271 |  |  | }
 | 
      
         | 272 |  |  |  
 | 
      
         | 273 |  |  | // dump prints a string representation of the regexp showing
 | 
      
         | 274 |  |  | // the structure explicitly.
 | 
      
         | 275 |  |  | func dump(re *Regexp) string {
 | 
      
         | 276 |  |  |         var b bytes.Buffer
 | 
      
         | 277 |  |  |         dumpRegexp(&b, re)
 | 
      
         | 278 |  |  |         return b.String()
 | 
      
         | 279 |  |  | }
 | 
      
         | 280 |  |  |  
 | 
      
         | 281 |  |  | var opNames = []string{
 | 
      
         | 282 |  |  |         OpNoMatch:        "no",
 | 
      
         | 283 |  |  |         OpEmptyMatch:     "emp",
 | 
      
         | 284 |  |  |         OpLiteral:        "lit",
 | 
      
         | 285 |  |  |         OpCharClass:      "cc",
 | 
      
         | 286 |  |  |         OpAnyCharNotNL:   "dnl",
 | 
      
         | 287 |  |  |         OpAnyChar:        "dot",
 | 
      
         | 288 |  |  |         OpBeginLine:      "bol",
 | 
      
         | 289 |  |  |         OpEndLine:        "eol",
 | 
      
         | 290 |  |  |         OpBeginText:      "bot",
 | 
      
         | 291 |  |  |         OpEndText:        "eot",
 | 
      
         | 292 |  |  |         OpWordBoundary:   "wb",
 | 
      
         | 293 |  |  |         OpNoWordBoundary: "nwb",
 | 
      
         | 294 |  |  |         OpCapture:        "cap",
 | 
      
         | 295 |  |  |         OpStar:           "star",
 | 
      
         | 296 |  |  |         OpPlus:           "plus",
 | 
      
         | 297 |  |  |         OpQuest:          "que",
 | 
      
         | 298 |  |  |         OpRepeat:         "rep",
 | 
      
         | 299 |  |  |         OpConcat:         "cat",
 | 
      
         | 300 |  |  |         OpAlternate:      "alt",
 | 
      
         | 301 |  |  | }
 | 
      
         | 302 |  |  |  
 | 
      
         | 303 |  |  | // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
 | 
      
         | 304 |  |  | // It is used during testing to distinguish between parses that might print
 | 
      
         | 305 |  |  | // the same using re's String method.
 | 
      
         | 306 |  |  | func dumpRegexp(b *bytes.Buffer, re *Regexp) {
 | 
      
         | 307 |  |  |         if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
 | 
      
         | 308 |  |  |                 fmt.Fprintf(b, "op%d", re.Op)
 | 
      
         | 309 |  |  |         } else {
 | 
      
         | 310 |  |  |                 switch re.Op {
 | 
      
         | 311 |  |  |                 default:
 | 
      
         | 312 |  |  |                         b.WriteString(opNames[re.Op])
 | 
      
         | 313 |  |  |                 case OpStar, OpPlus, OpQuest, OpRepeat:
 | 
      
         | 314 |  |  |                         if re.Flags&NonGreedy != 0 {
 | 
      
         | 315 |  |  |                                 b.WriteByte('n')
 | 
      
         | 316 |  |  |                         }
 | 
      
         | 317 |  |  |                         b.WriteString(opNames[re.Op])
 | 
      
         | 318 |  |  |                 case OpLiteral:
 | 
      
         | 319 |  |  |                         if len(re.Rune) > 1 {
 | 
      
         | 320 |  |  |                                 b.WriteString("str")
 | 
      
         | 321 |  |  |                         } else {
 | 
      
         | 322 |  |  |                                 b.WriteString("lit")
 | 
      
         | 323 |  |  |                         }
 | 
      
         | 324 |  |  |                         if re.Flags&FoldCase != 0 {
 | 
      
         | 325 |  |  |                                 for _, r := range re.Rune {
 | 
      
         | 326 |  |  |                                         if unicode.SimpleFold(r) != r {
 | 
      
         | 327 |  |  |                                                 b.WriteString("fold")
 | 
      
         | 328 |  |  |                                                 break
 | 
      
         | 329 |  |  |                                         }
 | 
      
         | 330 |  |  |                                 }
 | 
      
         | 331 |  |  |                         }
 | 
      
         | 332 |  |  |                 }
 | 
      
         | 333 |  |  |         }
 | 
      
         | 334 |  |  |         b.WriteByte('{')
 | 
      
         | 335 |  |  |         switch re.Op {
 | 
      
         | 336 |  |  |         case OpEndText:
 | 
      
         | 337 |  |  |                 if re.Flags&WasDollar == 0 {
 | 
      
         | 338 |  |  |                         b.WriteString(`\z`)
 | 
      
         | 339 |  |  |                 }
 | 
      
         | 340 |  |  |         case OpLiteral:
 | 
      
         | 341 |  |  |                 for _, r := range re.Rune {
 | 
      
         | 342 |  |  |                         b.WriteRune(r)
 | 
      
         | 343 |  |  |                 }
 | 
      
         | 344 |  |  |         case OpConcat, OpAlternate:
 | 
      
         | 345 |  |  |                 for _, sub := range re.Sub {
 | 
      
         | 346 |  |  |                         dumpRegexp(b, sub)
 | 
      
         | 347 |  |  |                 }
 | 
      
         | 348 |  |  |         case OpStar, OpPlus, OpQuest:
 | 
      
         | 349 |  |  |                 dumpRegexp(b, re.Sub[0])
 | 
      
         | 350 |  |  |         case OpRepeat:
 | 
      
         | 351 |  |  |                 fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
 | 
      
         | 352 |  |  |                 dumpRegexp(b, re.Sub[0])
 | 
      
         | 353 |  |  |         case OpCapture:
 | 
      
         | 354 |  |  |                 if re.Name != "" {
 | 
      
         | 355 |  |  |                         b.WriteString(re.Name)
 | 
      
         | 356 |  |  |                         b.WriteByte(':')
 | 
      
         | 357 |  |  |                 }
 | 
      
         | 358 |  |  |                 dumpRegexp(b, re.Sub[0])
 | 
      
         | 359 |  |  |         case OpCharClass:
 | 
      
         | 360 |  |  |                 sep := ""
 | 
      
         | 361 |  |  |                 for i := 0; i < len(re.Rune); i += 2 {
 | 
      
         | 362 |  |  |                         b.WriteString(sep)
 | 
      
         | 363 |  |  |                         sep = " "
 | 
      
         | 364 |  |  |                         lo, hi := re.Rune[i], re.Rune[i+1]
 | 
      
         | 365 |  |  |                         if lo == hi {
 | 
      
         | 366 |  |  |                                 fmt.Fprintf(b, "%#x", lo)
 | 
      
         | 367 |  |  |                         } else {
 | 
      
         | 368 |  |  |                                 fmt.Fprintf(b, "%#x-%#x", lo, hi)
 | 
      
         | 369 |  |  |                         }
 | 
      
         | 370 |  |  |                 }
 | 
      
         | 371 |  |  |         }
 | 
      
         | 372 |  |  |         b.WriteByte('}')
 | 
      
         | 373 |  |  | }
 | 
      
         | 374 |  |  |  
 | 
      
         | 375 |  |  | func mkCharClass(f func(rune) bool) string {
 | 
      
         | 376 |  |  |         re := &Regexp{Op: OpCharClass}
 | 
      
         | 377 |  |  |         lo := rune(-1)
 | 
      
         | 378 |  |  |         for i := rune(0); i <= unicode.MaxRune; i++ {
 | 
      
         | 379 |  |  |                 if f(i) {
 | 
      
         | 380 |  |  |                         if lo < 0 {
 | 
      
         | 381 |  |  |                                 lo = i
 | 
      
         | 382 |  |  |                         }
 | 
      
         | 383 |  |  |                 } else {
 | 
      
         | 384 |  |  |                         if lo >= 0 {
 | 
      
         | 385 |  |  |                                 re.Rune = append(re.Rune, lo, i-1)
 | 
      
         | 386 |  |  |                                 lo = -1
 | 
      
         | 387 |  |  |                         }
 | 
      
         | 388 |  |  |                 }
 | 
      
         | 389 |  |  |         }
 | 
      
         | 390 |  |  |         if lo >= 0 {
 | 
      
         | 391 |  |  |                 re.Rune = append(re.Rune, lo, unicode.MaxRune)
 | 
      
         | 392 |  |  |         }
 | 
      
         | 393 |  |  |         return dump(re)
 | 
      
         | 394 |  |  | }
 | 
      
         | 395 |  |  |  
 | 
      
         | 396 |  |  | func isUpperFold(r rune) bool {
 | 
      
         | 397 |  |  |         if unicode.IsUpper(r) {
 | 
      
         | 398 |  |  |                 return true
 | 
      
         | 399 |  |  |         }
 | 
      
         | 400 |  |  |         c := unicode.SimpleFold(r)
 | 
      
         | 401 |  |  |         for c != r {
 | 
      
         | 402 |  |  |                 if unicode.IsUpper(c) {
 | 
      
         | 403 |  |  |                         return true
 | 
      
         | 404 |  |  |                 }
 | 
      
         | 405 |  |  |                 c = unicode.SimpleFold(c)
 | 
      
         | 406 |  |  |         }
 | 
      
         | 407 |  |  |         return false
 | 
      
         | 408 |  |  | }
 | 
      
         | 409 |  |  |  
 | 
      
         | 410 |  |  | func TestFoldConstants(t *testing.T) {
 | 
      
         | 411 |  |  |         last := rune(-1)
 | 
      
         | 412 |  |  |         for i := rune(0); i <= unicode.MaxRune; i++ {
 | 
      
         | 413 |  |  |                 if unicode.SimpleFold(i) == i {
 | 
      
         | 414 |  |  |                         continue
 | 
      
         | 415 |  |  |                 }
 | 
      
         | 416 |  |  |                 if last == -1 && MinFold != i {
 | 
      
         | 417 |  |  |                         t.Errorf("MinFold=%#U should be %#U", MinFold, i)
 | 
      
         | 418 |  |  |                 }
 | 
      
         | 419 |  |  |                 last = i
 | 
      
         | 420 |  |  |         }
 | 
      
         | 421 |  |  |         if MaxFold != last {
 | 
      
         | 422 |  |  |                 t.Errorf("MaxFold=%#U should be %#U", MaxFold, last)
 | 
      
         | 423 |  |  |         }
 | 
      
         | 424 |  |  | }
 | 
      
         | 425 |  |  |  
 | 
      
         | 426 |  |  | func TestAppendRangeCollapse(t *testing.T) {
 | 
      
         | 427 |  |  |         // AppendRange should collapse each of the new ranges
 | 
      
         | 428 |  |  |         // into the earlier ones (it looks back two ranges), so that
 | 
      
         | 429 |  |  |         // the slice never grows very large.
 | 
      
         | 430 |  |  |         // Note that we are not calling cleanClass.
 | 
      
         | 431 |  |  |         var r []rune
 | 
      
         | 432 |  |  |         for i := rune('A'); i <= 'Z'; i++ {
 | 
      
         | 433 |  |  |                 r = AppendRange(r, i, i)
 | 
      
         | 434 |  |  |                 r = AppendRange(r, i+'a'-'A', i+'a'-'A')
 | 
      
         | 435 |  |  |         }
 | 
      
         | 436 |  |  |         if string(r) != "AZaz" {
 | 
      
         | 437 |  |  |                 t.Errorf("AppendRange interlaced A-Z a-z = %s, want AZaz", string(r))
 | 
      
         | 438 |  |  |         }
 | 
      
         | 439 |  |  | }
 | 
      
         | 440 |  |  |  
 | 
      
         | 441 |  |  | var invalidRegexps = []string{
 | 
      
         | 442 |  |  |         `(`,
 | 
      
         | 443 |  |  |         `)`,
 | 
      
         | 444 |  |  |         `(a`,
 | 
      
         | 445 |  |  |         `(a|b|`,
 | 
      
         | 446 |  |  |         `(a|b`,
 | 
      
         | 447 |  |  |         `[a-z`,
 | 
      
         | 448 |  |  |         `([a-z)`,
 | 
      
         | 449 |  |  |         `x{1001}`,
 | 
      
         | 450 |  |  |         `x{9876543210}`,
 | 
      
         | 451 |  |  |         `x{2,1}`,
 | 
      
         | 452 |  |  |         `x{1,9876543210}`,
 | 
      
         | 453 |  |  |         "\xff", // Invalid UTF-8
 | 
      
         | 454 |  |  |         "[\xff]",
 | 
      
         | 455 |  |  |         "[\\\xff]",
 | 
      
         | 456 |  |  |         "\\\xff",
 | 
      
         | 457 |  |  |         `(?Pa`,
 | 
      
         | 458 |  |  |         `(?P`,
 | 
      
         | 459 |  |  |         `(?P
 | 
      
         | 460 |  |  |         `(?Pa)`,
 | 
      
         | 461 |  |  |         `(?P<>a)`,
 | 
      
         | 462 |  |  |         `[a-Z]`,
 | 
      
         | 463 |  |  |         `(?i)[a-Z]`,
 | 
      
         | 464 |  |  |         `a{100000}`,
 | 
      
         | 465 |  |  |         `a{100000,}`,
 | 
      
         | 466 |  |  | }
 | 
      
         | 467 |  |  |  
 | 
      
         | 468 |  |  | var onlyPerl = []string{
 | 
      
         | 469 |  |  |         `[a-b-c]`,
 | 
      
         | 470 |  |  |         `\Qabc\E`,
 | 
      
         | 471 |  |  |         `\Q*+?{[\E`,
 | 
      
         | 472 |  |  |         `\Q\\E`,
 | 
      
         | 473 |  |  |         `\Q\\\E`,
 | 
      
         | 474 |  |  |         `\Q\\\\E`,
 | 
      
         | 475 |  |  |         `\Q\\\\\E`,
 | 
      
         | 476 |  |  |         `(?:a)`,
 | 
      
         | 477 |  |  |         `(?Pa)`,
 | 
      
         | 478 |  |  | }
 | 
      
         | 479 |  |  |  
 | 
      
         | 480 |  |  | var onlyPOSIX = []string{
 | 
      
         | 481 |  |  |         "a++",
 | 
      
         | 482 |  |  |         "a**",
 | 
      
         | 483 |  |  |         "a?*",
 | 
      
         | 484 |  |  |         "a+*",
 | 
      
         | 485 |  |  |         "a{1}*",
 | 
      
         | 486 |  |  |         ".{1}{2}.{3}",
 | 
      
         | 487 |  |  | }
 | 
      
         | 488 |  |  |  
 | 
      
         | 489 |  |  | func TestParseInvalidRegexps(t *testing.T) {
 | 
      
         | 490 |  |  |         for _, regexp := range invalidRegexps {
 | 
      
         | 491 |  |  |                 if re, err := Parse(regexp, Perl); err == nil {
 | 
      
         | 492 |  |  |                         t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
 | 
      
         | 493 |  |  |                 }
 | 
      
         | 494 |  |  |                 if re, err := Parse(regexp, POSIX); err == nil {
 | 
      
         | 495 |  |  |                         t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
 | 
      
         | 496 |  |  |                 }
 | 
      
         | 497 |  |  |         }
 | 
      
         | 498 |  |  |         for _, regexp := range onlyPerl {
 | 
      
         | 499 |  |  |                 if _, err := Parse(regexp, Perl); err != nil {
 | 
      
         | 500 |  |  |                         t.Errorf("Parse(%#q, Perl): %v", regexp, err)
 | 
      
         | 501 |  |  |                 }
 | 
      
         | 502 |  |  |                 if re, err := Parse(regexp, POSIX); err == nil {
 | 
      
         | 503 |  |  |                         t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
 | 
      
         | 504 |  |  |                 }
 | 
      
         | 505 |  |  |         }
 | 
      
         | 506 |  |  |         for _, regexp := range onlyPOSIX {
 | 
      
         | 507 |  |  |                 if re, err := Parse(regexp, Perl); err == nil {
 | 
      
         | 508 |  |  |                         t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
 | 
      
         | 509 |  |  |                 }
 | 
      
         | 510 |  |  |                 if _, err := Parse(regexp, POSIX); err != nil {
 | 
      
         | 511 |  |  |                         t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
 | 
      
         | 512 |  |  |                 }
 | 
      
         | 513 |  |  |         }
 | 
      
         | 514 |  |  | }
 | 
      
         | 515 |  |  |  
 | 
      
         | 516 |  |  | func TestToStringEquivalentParse(t *testing.T) {
 | 
      
         | 517 |  |  |         for _, tt := range parseTests {
 | 
      
         | 518 |  |  |                 re, err := Parse(tt.Regexp, testFlags)
 | 
      
         | 519 |  |  |                 if err != nil {
 | 
      
         | 520 |  |  |                         t.Errorf("Parse(%#q): %v", tt.Regexp, err)
 | 
      
         | 521 |  |  |                         continue
 | 
      
         | 522 |  |  |                 }
 | 
      
         | 523 |  |  |                 d := dump(re)
 | 
      
         | 524 |  |  |                 if d != tt.Dump {
 | 
      
         | 525 |  |  |                         t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
 | 
      
         | 526 |  |  |                         continue
 | 
      
         | 527 |  |  |                 }
 | 
      
         | 528 |  |  |  
 | 
      
         | 529 |  |  |                 s := re.String()
 | 
      
         | 530 |  |  |                 if s != tt.Regexp {
 | 
      
         | 531 |  |  |                         // If ToString didn't return the original regexp,
 | 
      
         | 532 |  |  |                         // it must have found one with fewer parens.
 | 
      
         | 533 |  |  |                         // Unfortunately we can't check the length here, because
 | 
      
         | 534 |  |  |                         // ToString produces "\\{" for a literal brace,
 | 
      
         | 535 |  |  |                         // but "{" is a shorter equivalent in some contexts.
 | 
      
         | 536 |  |  |                         nre, err := Parse(s, testFlags)
 | 
      
         | 537 |  |  |                         if err != nil {
 | 
      
         | 538 |  |  |                                 t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, t, err)
 | 
      
         | 539 |  |  |                                 continue
 | 
      
         | 540 |  |  |                         }
 | 
      
         | 541 |  |  |                         nd := dump(nre)
 | 
      
         | 542 |  |  |                         if d != nd {
 | 
      
         | 543 |  |  |                                 t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
 | 
      
         | 544 |  |  |                         }
 | 
      
         | 545 |  |  |  
 | 
      
         | 546 |  |  |                         ns := nre.String()
 | 
      
         | 547 |  |  |                         if s != ns {
 | 
      
         | 548 |  |  |                                 t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
 | 
      
         | 549 |  |  |                         }
 | 
      
         | 550 |  |  |                 }
 | 
      
         | 551 |  |  |         }
 | 
      
         | 552 |  |  | }
 |