1 |
747 |
jeremybenn |
// Copyright 2009 The Go Authors. All rights reserved.
|
2 |
|
|
// Use of this source code is governed by a BSD-style
|
3 |
|
|
// license that can be found in the LICENSE file.
|
4 |
|
|
|
5 |
|
|
// Package regexp implements regular expression search.
|
6 |
|
|
//
|
7 |
|
|
// The syntax of the regular expressions accepted is the same
|
8 |
|
|
// general syntax used by Perl, Python, and other languages.
|
9 |
|
|
// More precisely, it is the syntax accepted by RE2 and described at
|
10 |
|
|
// http://code.google.com/p/re2/wiki/Syntax, except for \C.
|
11 |
|
|
//
|
12 |
|
|
// All characters are UTF-8-encoded code points.
|
13 |
|
|
//
|
14 |
|
|
// There are 16 methods of Regexp that match a regular expression and identify
|
15 |
|
|
// the matched text. Their names are matched by this regular expression:
|
16 |
|
|
//
|
17 |
|
|
// Find(All)?(String)?(Submatch)?(Index)?
|
18 |
|
|
//
|
19 |
|
|
// If 'All' is present, the routine matches successive non-overlapping
|
20 |
|
|
// matches of the entire expression. Empty matches abutting a preceding
|
21 |
|
|
// match are ignored. The return value is a slice containing the successive
|
22 |
|
|
// return values of the corresponding non-'All' routine. These routines take
|
23 |
|
|
// an extra integer argument, n; if n >= 0, the function returns at most n
|
24 |
|
|
// matches/submatches.
|
25 |
|
|
//
|
26 |
|
|
// If 'String' is present, the argument is a string; otherwise it is a slice
|
27 |
|
|
// of bytes; return values are adjusted as appropriate.
|
28 |
|
|
//
|
29 |
|
|
// If 'Submatch' is present, the return value is a slice identifying the
|
30 |
|
|
// successive submatches of the expression. Submatches are matches of
|
31 |
|
|
// parenthesized subexpressions within the regular expression, numbered from
|
32 |
|
|
// left to right in order of opening parenthesis. Submatch 0 is the match of
|
33 |
|
|
// the entire expression, submatch 1 the match of the first parenthesized
|
34 |
|
|
// subexpression, and so on.
|
35 |
|
|
//
|
36 |
|
|
// If 'Index' is present, matches and submatches are identified by byte index
|
37 |
|
|
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
|
38 |
|
|
// the nth submatch. The pair for n==0 identifies the match of the entire
|
39 |
|
|
// expression. If 'Index' is not present, the match is identified by the
|
40 |
|
|
// text of the match/submatch. If an index is negative, it means that
|
41 |
|
|
// subexpression did not match any string in the input.
|
42 |
|
|
//
|
43 |
|
|
// There is also a subset of the methods that can be applied to text read
|
44 |
|
|
// from a RuneReader:
|
45 |
|
|
//
|
46 |
|
|
// MatchReader, FindReaderIndex, FindReaderSubmatchIndex
|
47 |
|
|
//
|
48 |
|
|
// This set may grow. Note that regular expression matches may need to
|
49 |
|
|
// examine text beyond the text returned by a match, so the methods that
|
50 |
|
|
// match text from a RuneReader may read arbitrarily far into the input
|
51 |
|
|
// before returning.
|
52 |
|
|
//
|
53 |
|
|
// (There are a few other methods that do not match this pattern.)
|
54 |
|
|
//
|
55 |
|
|
package regexp
|
56 |
|
|
|
57 |
|
|
import (
|
58 |
|
|
"bytes"
|
59 |
|
|
"io"
|
60 |
|
|
"regexp/syntax"
|
61 |
|
|
"strconv"
|
62 |
|
|
"strings"
|
63 |
|
|
"sync"
|
64 |
|
|
"unicode/utf8"
|
65 |
|
|
)
|
66 |
|
|
|
67 |
|
|
// debug is a package-level flag, off by default.
// NOTE(review): presumably gates diagnostic output elsewhere in the package;
// no use of it is visible in this part of the file — confirm.
var debug = false
|
68 |
|
|
|
69 |
|
|
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// read-only after Compile
	expr           string         // as passed to Compile
	prog           *syntax.Prog   // compiled program
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
	prefixComplete bool           // prefix is the entire regexp
	prefixRune     rune           // first rune in prefix
	cond           syntax.EmptyOp // empty-width conditions required at start of match
	numSubexp      int            // capture count, taken from MaxCap of the parsed expression
	subexpNames    []string       // capture names, taken from CapNames of the parsed expression
	longest        bool           // set by CompilePOSIX to request leftmost-longest matching

	// cache of machines for running regexp
	mu      sync.Mutex // held by get and put while they touch machine
	machine []*machine // idle machines; get pops from the end, put appends
}
|
89 |
|
|
|
90 |
|
|
// String returns the source text used to compile the regular expression.
|
91 |
|
|
func (re *Regexp) String() string {
|
92 |
|
|
return re.expr
|
93 |
|
|
}
|
94 |
|
|
|
95 |
|
|
// Compile parses a regular expression and returns, if successful,
|
96 |
|
|
// a Regexp object that can be used to match against text.
|
97 |
|
|
//
|
98 |
|
|
// When matching against text, the regexp returns a match that
|
99 |
|
|
// begins as early as possible in the input (leftmost), and among those
|
100 |
|
|
// it chooses the one that a backtracking search would have found first.
|
101 |
|
|
// This so-called leftmost-first matching is the same semantics
|
102 |
|
|
// that Perl, Python, and other implementations use, although this
|
103 |
|
|
// package implements it without the expense of backtracking.
|
104 |
|
|
// For POSIX leftmost-longest matching, see CompilePOSIX.
|
105 |
|
|
func Compile(expr string) (*Regexp, error) {
|
106 |
|
|
return compile(expr, syntax.Perl, false)
|
107 |
|
|
}
|
108 |
|
|
|
109 |
|
|
// CompilePOSIX is like Compile but restricts the regular expression
|
110 |
|
|
// to POSIX ERE (egrep) syntax and changes the match semantics to
|
111 |
|
|
// leftmost-longest.
|
112 |
|
|
//
|
113 |
|
|
// That is, when matching against text, the regexp returns a match that
|
114 |
|
|
// begins as early as possible in the input (leftmost), and among those
|
115 |
|
|
// it chooses a match that is as long as possible.
|
116 |
|
|
// This so-called leftmost-longest matching is the same semantics
|
117 |
|
|
// that early regular expression implementations used and that POSIX
|
118 |
|
|
// specifies.
|
119 |
|
|
//
|
120 |
|
|
// However, there can be multiple leftmost-longest matches, with different
|
121 |
|
|
// submatch choices, and here this package diverges from POSIX.
|
122 |
|
|
// Among the possible leftmost-longest matches, this package chooses
|
123 |
|
|
// the one that a backtracking search would have found first, while POSIX
|
124 |
|
|
// specifies that the match be chosen to maximize the length of the first
|
125 |
|
|
// subexpression, then the second, and so on from left to right.
|
126 |
|
|
// The POSIX rule is computationally prohibitive and not even well-defined.
|
127 |
|
|
// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
|
128 |
|
|
func CompilePOSIX(expr string) (*Regexp, error) {
|
129 |
|
|
return compile(expr, syntax.POSIX, true)
|
130 |
|
|
}
|
131 |
|
|
|
132 |
|
|
func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
|
133 |
|
|
re, err := syntax.Parse(expr, mode)
|
134 |
|
|
if err != nil {
|
135 |
|
|
return nil, err
|
136 |
|
|
}
|
137 |
|
|
maxCap := re.MaxCap()
|
138 |
|
|
capNames := re.CapNames()
|
139 |
|
|
|
140 |
|
|
re = re.Simplify()
|
141 |
|
|
prog, err := syntax.Compile(re)
|
142 |
|
|
if err != nil {
|
143 |
|
|
return nil, err
|
144 |
|
|
}
|
145 |
|
|
regexp := &Regexp{
|
146 |
|
|
expr: expr,
|
147 |
|
|
prog: prog,
|
148 |
|
|
numSubexp: maxCap,
|
149 |
|
|
subexpNames: capNames,
|
150 |
|
|
cond: prog.StartCond(),
|
151 |
|
|
longest: longest,
|
152 |
|
|
}
|
153 |
|
|
regexp.prefix, regexp.prefixComplete = prog.Prefix()
|
154 |
|
|
if regexp.prefix != "" {
|
155 |
|
|
// TODO(rsc): Remove this allocation by adding
|
156 |
|
|
// IndexString to package bytes.
|
157 |
|
|
regexp.prefixBytes = []byte(regexp.prefix)
|
158 |
|
|
regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
|
159 |
|
|
}
|
160 |
|
|
return regexp, nil
|
161 |
|
|
}
|
162 |
|
|
|
163 |
|
|
// get returns a machine to use for matching re.
|
164 |
|
|
// It uses the re's machine cache if possible, to avoid
|
165 |
|
|
// unnecessary allocation.
|
166 |
|
|
func (re *Regexp) get() *machine {
|
167 |
|
|
re.mu.Lock()
|
168 |
|
|
if n := len(re.machine); n > 0 {
|
169 |
|
|
z := re.machine[n-1]
|
170 |
|
|
re.machine = re.machine[:n-1]
|
171 |
|
|
re.mu.Unlock()
|
172 |
|
|
return z
|
173 |
|
|
}
|
174 |
|
|
re.mu.Unlock()
|
175 |
|
|
z := progMachine(re.prog)
|
176 |
|
|
z.re = re
|
177 |
|
|
return z
|
178 |
|
|
}
|
179 |
|
|
|
180 |
|
|
// put returns a machine to the re's machine cache.
|
181 |
|
|
// There is no attempt to limit the size of the cache, so it will
|
182 |
|
|
// grow to the maximum number of simultaneous matches
|
183 |
|
|
// run using re. (The cache empties when re gets garbage collected.)
|
184 |
|
|
func (re *Regexp) put(z *machine) {
|
185 |
|
|
re.mu.Lock()
|
186 |
|
|
re.machine = append(re.machine, z)
|
187 |
|
|
re.mu.Unlock()
|
188 |
|
|
}
|
189 |
|
|
|
190 |
|
|
// MustCompile is like Compile but panics if the expression cannot be parsed.
|
191 |
|
|
// It simplifies safe initialization of global variables holding compiled regular
|
192 |
|
|
// expressions.
|
193 |
|
|
func MustCompile(str string) *Regexp {
|
194 |
|
|
regexp, error := Compile(str)
|
195 |
|
|
if error != nil {
|
196 |
|
|
panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
|
197 |
|
|
}
|
198 |
|
|
return regexp
|
199 |
|
|
}
|
200 |
|
|
|
201 |
|
|
// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
|
202 |
|
|
// It simplifies safe initialization of global variables holding compiled regular
|
203 |
|
|
// expressions.
|
204 |
|
|
func MustCompilePOSIX(str string) *Regexp {
|
205 |
|
|
regexp, error := CompilePOSIX(str)
|
206 |
|
|
if error != nil {
|
207 |
|
|
panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
|
208 |
|
|
}
|
209 |
|
|
return regexp
|
210 |
|
|
}
|
211 |
|
|
|
212 |
|
|
// quote renders s as a Go string literal: a raw back-quoted literal when
// strconv.CanBackquote permits it, otherwise the double-quoted form
// produced by strconv.Quote.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
|
218 |
|
|
|
219 |
|
|
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
|
220 |
|
|
func (re *Regexp) NumSubexp() int {
|
221 |
|
|
return re.numSubexp
|
222 |
|
|
}
|
223 |
|
|
|
224 |
|
|
// SubexpNames returns the names of the parenthesized subexpressions
|
225 |
|
|
// in this Regexp. The name for the first sub-expression is names[1],
|
226 |
|
|
// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
|
227 |
|
|
// Since the Regexp as a whole cannot be named, names[0] is always
|
228 |
|
|
// the empty string. The slice should not be modified.
|
229 |
|
|
func (re *Regexp) SubexpNames() []string {
|
230 |
|
|
return re.subexpNames
|
231 |
|
|
}
|
232 |
|
|
|
233 |
|
|
// endOfText is the sentinel rune the step methods return (with width 0)
// when there is no rune to read. The context methods also use it as the
// initial value for a neighboring rune that does not exist.
const endOfText rune = -1
|
234 |
|
|
|
235 |
|
|
// input abstracts different representations of the input text. It provides
// one-character lookahead.
//
// In this file it is implemented by inputString, inputBytes and inputReader.
type input interface {
	// step returns the rune at pos and its width in bytes, or
	// (endOfText, 0) when there is nothing to read.
	step(pos int) (r rune, width int) // advance one rune
	// canCheckPrefix reports whether hasPrefix and index are usable on
	// this input; the reader-backed input returns false.
	canCheckPrefix() bool // can we look ahead without losing info?
	// hasPrefix reports whether the input begins with re's literal prefix.
	hasPrefix(re *Regexp) bool
	// index returns the offset, relative to pos, of the first occurrence
	// of re's literal prefix at or after pos, or -1 if there is none.
	index(re *Regexp, pos int) int
	// context returns the empty-width conditions that hold at pos.
	context(pos int) syntax.EmptyOp
}
|
244 |
|
|
|
245 |
|
|
// inputString scans a string.
type inputString struct {
	str string // the text being scanned; positions are byte offsets into it
}
|
249 |
|
|
|
250 |
|
|
func (i *inputString) step(pos int) (rune, int) {
|
251 |
|
|
if pos < len(i.str) {
|
252 |
|
|
c := i.str[pos]
|
253 |
|
|
if c < utf8.RuneSelf {
|
254 |
|
|
return rune(c), 1
|
255 |
|
|
}
|
256 |
|
|
return utf8.DecodeRuneInString(i.str[pos:])
|
257 |
|
|
}
|
258 |
|
|
return endOfText, 0
|
259 |
|
|
}
|
260 |
|
|
|
261 |
|
|
func (i *inputString) canCheckPrefix() bool {
|
262 |
|
|
return true
|
263 |
|
|
}
|
264 |
|
|
|
265 |
|
|
func (i *inputString) hasPrefix(re *Regexp) bool {
|
266 |
|
|
return strings.HasPrefix(i.str, re.prefix)
|
267 |
|
|
}
|
268 |
|
|
|
269 |
|
|
func (i *inputString) index(re *Regexp, pos int) int {
|
270 |
|
|
return strings.Index(i.str[pos:], re.prefix)
|
271 |
|
|
}
|
272 |
|
|
|
273 |
|
|
func (i *inputString) context(pos int) syntax.EmptyOp {
|
274 |
|
|
r1, r2 := endOfText, endOfText
|
275 |
|
|
if pos > 0 && pos <= len(i.str) {
|
276 |
|
|
r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
|
277 |
|
|
}
|
278 |
|
|
if pos < len(i.str) {
|
279 |
|
|
r2, _ = utf8.DecodeRuneInString(i.str[pos:])
|
280 |
|
|
}
|
281 |
|
|
return syntax.EmptyOpContext(r1, r2)
|
282 |
|
|
}
|
283 |
|
|
|
284 |
|
|
// inputBytes scans a byte slice.
type inputBytes struct {
	str []byte // the text being scanned; positions are byte offsets into it
}
|
288 |
|
|
|
289 |
|
|
func (i *inputBytes) step(pos int) (rune, int) {
|
290 |
|
|
if pos < len(i.str) {
|
291 |
|
|
c := i.str[pos]
|
292 |
|
|
if c < utf8.RuneSelf {
|
293 |
|
|
return rune(c), 1
|
294 |
|
|
}
|
295 |
|
|
return utf8.DecodeRune(i.str[pos:])
|
296 |
|
|
}
|
297 |
|
|
return endOfText, 0
|
298 |
|
|
}
|
299 |
|
|
|
300 |
|
|
func (i *inputBytes) canCheckPrefix() bool {
|
301 |
|
|
return true
|
302 |
|
|
}
|
303 |
|
|
|
304 |
|
|
func (i *inputBytes) hasPrefix(re *Regexp) bool {
|
305 |
|
|
return bytes.HasPrefix(i.str, re.prefixBytes)
|
306 |
|
|
}
|
307 |
|
|
|
308 |
|
|
func (i *inputBytes) index(re *Regexp, pos int) int {
|
309 |
|
|
return bytes.Index(i.str[pos:], re.prefixBytes)
|
310 |
|
|
}
|
311 |
|
|
|
312 |
|
|
func (i *inputBytes) context(pos int) syntax.EmptyOp {
|
313 |
|
|
r1, r2 := endOfText, endOfText
|
314 |
|
|
if pos > 0 && pos <= len(i.str) {
|
315 |
|
|
r1, _ = utf8.DecodeLastRune(i.str[:pos])
|
316 |
|
|
}
|
317 |
|
|
if pos < len(i.str) {
|
318 |
|
|
r2, _ = utf8.DecodeRune(i.str[pos:])
|
319 |
|
|
}
|
320 |
|
|
return syntax.EmptyOpContext(r1, r2)
|
321 |
|
|
}
|
322 |
|
|
|
323 |
|
|
// inputReader scans a RuneReader.
type inputReader struct {
	r     io.RuneReader // source of runes; consumed by step
	atEOT bool          // set once ReadRune has returned an error
	pos   int           // advanced by the width of every rune step has read
}
|
329 |
|
|
|
330 |
|
|
func (i *inputReader) step(pos int) (rune, int) {
|
331 |
|
|
if !i.atEOT && pos != i.pos {
|
332 |
|
|
return endOfText, 0
|
333 |
|
|
|
334 |
|
|
}
|
335 |
|
|
r, w, err := i.r.ReadRune()
|
336 |
|
|
if err != nil {
|
337 |
|
|
i.atEOT = true
|
338 |
|
|
return endOfText, 0
|
339 |
|
|
}
|
340 |
|
|
i.pos += w
|
341 |
|
|
return r, w
|
342 |
|
|
}
|
343 |
|
|
|
344 |
|
|
func (i *inputReader) canCheckPrefix() bool {
|
345 |
|
|
return false
|
346 |
|
|
}
|
347 |
|
|
|
348 |
|
|
func (i *inputReader) hasPrefix(re *Regexp) bool {
|
349 |
|
|
return false
|
350 |
|
|
}
|
351 |
|
|
|
352 |
|
|
func (i *inputReader) index(re *Regexp, pos int) int {
|
353 |
|
|
return -1
|
354 |
|
|
}
|
355 |
|
|
|
356 |
|
|
func (i *inputReader) context(pos int) syntax.EmptyOp {
|
357 |
|
|
return 0
|
358 |
|
|
}
|
359 |
|
|
|
360 |
|
|
// LiteralPrefix returns a literal string that must begin any match
|
361 |
|
|
// of the regular expression re. It returns the boolean true if the
|
362 |
|
|
// literal string comprises the entire regular expression.
|
363 |
|
|
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
|
364 |
|
|
return re.prefix, re.prefixComplete
|
365 |
|
|
}
|
366 |
|
|
|
367 |
|
|
// MatchReader returns whether the Regexp matches the text read by the
|
368 |
|
|
// RuneReader. The return value is a boolean: true for match, false for no
|
369 |
|
|
// match.
|
370 |
|
|
func (re *Regexp) MatchReader(r io.RuneReader) bool {
|
371 |
|
|
return re.doExecute(r, nil, "", 0, 0) != nil
|
372 |
|
|
}
|
373 |
|
|
|
374 |
|
|
// MatchString returns whether the Regexp matches the string s.
|
375 |
|
|
// The return value is a boolean: true for match, false for no match.
|
376 |
|
|
func (re *Regexp) MatchString(s string) bool {
|
377 |
|
|
return re.doExecute(nil, nil, s, 0, 0) != nil
|
378 |
|
|
}
|
379 |
|
|
|
380 |
|
|
// Match returns whether the Regexp matches the byte slice b.
|
381 |
|
|
// The return value is a boolean: true for match, false for no match.
|
382 |
|
|
func (re *Regexp) Match(b []byte) bool {
|
383 |
|
|
return re.doExecute(nil, b, "", 0, 0) != nil
|
384 |
|
|
}
|
385 |
|
|
|
386 |
|
|
// MatchReader checks whether a textual regular expression matches the text
|
387 |
|
|
// read by the RuneReader. More complicated queries need to use Compile and
|
388 |
|
|
// the full Regexp interface.
|
389 |
|
|
func MatchReader(pattern string, r io.RuneReader) (matched bool, error error) {
|
390 |
|
|
re, err := Compile(pattern)
|
391 |
|
|
if err != nil {
|
392 |
|
|
return false, err
|
393 |
|
|
}
|
394 |
|
|
return re.MatchReader(r), nil
|
395 |
|
|
}
|
396 |
|
|
|
397 |
|
|
// MatchString checks whether a textual regular expression
|
398 |
|
|
// matches a string. More complicated queries need
|
399 |
|
|
// to use Compile and the full Regexp interface.
|
400 |
|
|
func MatchString(pattern string, s string) (matched bool, error error) {
|
401 |
|
|
re, err := Compile(pattern)
|
402 |
|
|
if err != nil {
|
403 |
|
|
return false, err
|
404 |
|
|
}
|
405 |
|
|
return re.MatchString(s), nil
|
406 |
|
|
}
|
407 |
|
|
|
408 |
|
|
// Match checks whether a textual regular expression
|
409 |
|
|
// matches a byte slice. More complicated queries need
|
410 |
|
|
// to use Compile and the full Regexp interface.
|
411 |
|
|
func Match(pattern string, b []byte) (matched bool, error error) {
|
412 |
|
|
re, err := Compile(pattern)
|
413 |
|
|
if err != nil {
|
414 |
|
|
return false, err
|
415 |
|
|
}
|
416 |
|
|
return re.Match(b), nil
|
417 |
|
|
}
|
418 |
|
|
|
419 |
|
|
// ReplaceAllString returns a copy of src in which all matches for the Regexp
|
420 |
|
|
// have been replaced by repl. No support is provided for expressions
|
421 |
|
|
// (e.g. \1 or $1) in the replacement string.
|
422 |
|
|
func (re *Regexp) ReplaceAllString(src, repl string) string {
|
423 |
|
|
return re.ReplaceAllStringFunc(src, func(string) string { return repl })
|
424 |
|
|
}
|
425 |
|
|
|
426 |
|
|
// ReplaceAllStringFunc returns a copy of src in which all matches for the
|
427 |
|
|
// Regexp have been replaced by the return value of of function repl (whose
|
428 |
|
|
// first argument is the matched string). No support is provided for
|
429 |
|
|
// expressions (e.g. \1 or $1) in the replacement string.
|
430 |
|
|
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
|
431 |
|
|
lastMatchEnd := 0 // end position of the most recent match
|
432 |
|
|
searchPos := 0 // position where we next look for a match
|
433 |
|
|
buf := new(bytes.Buffer)
|
434 |
|
|
for searchPos <= len(src) {
|
435 |
|
|
a := re.doExecute(nil, nil, src, searchPos, 2)
|
436 |
|
|
if len(a) == 0 {
|
437 |
|
|
break // no more matches
|
438 |
|
|
}
|
439 |
|
|
|
440 |
|
|
// Copy the unmatched characters before this match.
|
441 |
|
|
io.WriteString(buf, src[lastMatchEnd:a[0]])
|
442 |
|
|
|
443 |
|
|
// Now insert a copy of the replacement string, but not for a
|
444 |
|
|
// match of the empty string immediately after another match.
|
445 |
|
|
// (Otherwise, we get double replacement for patterns that
|
446 |
|
|
// match both empty and nonempty strings.)
|
447 |
|
|
if a[1] > lastMatchEnd || a[0] == 0 {
|
448 |
|
|
io.WriteString(buf, repl(src[a[0]:a[1]]))
|
449 |
|
|
}
|
450 |
|
|
lastMatchEnd = a[1]
|
451 |
|
|
|
452 |
|
|
// Advance past this match; always advance at least one character.
|
453 |
|
|
_, width := utf8.DecodeRuneInString(src[searchPos:])
|
454 |
|
|
if searchPos+width > a[1] {
|
455 |
|
|
searchPos += width
|
456 |
|
|
} else if searchPos+1 > a[1] {
|
457 |
|
|
// This clause is only needed at the end of the input
|
458 |
|
|
// string. In that case, DecodeRuneInString returns width=0.
|
459 |
|
|
searchPos++
|
460 |
|
|
} else {
|
461 |
|
|
searchPos = a[1]
|
462 |
|
|
}
|
463 |
|
|
}
|
464 |
|
|
|
465 |
|
|
// Copy the unmatched characters after the last match.
|
466 |
|
|
io.WriteString(buf, src[lastMatchEnd:])
|
467 |
|
|
|
468 |
|
|
return buf.String()
|
469 |
|
|
}
|
470 |
|
|
|
471 |
|
|
// ReplaceAll returns a copy of src in which all matches for the Regexp
|
472 |
|
|
// have been replaced by repl. No support is provided for expressions
|
473 |
|
|
// (e.g. \1 or $1) in the replacement text.
|
474 |
|
|
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
|
475 |
|
|
return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
|
476 |
|
|
}
|
477 |
|
|
|
478 |
|
|
// ReplaceAllFunc returns a copy of src in which all matches for the
|
479 |
|
|
// Regexp have been replaced by the return value of of function repl (whose
|
480 |
|
|
// first argument is the matched []byte). No support is provided for
|
481 |
|
|
// expressions (e.g. \1 or $1) in the replacement string.
|
482 |
|
|
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
|
483 |
|
|
lastMatchEnd := 0 // end position of the most recent match
|
484 |
|
|
searchPos := 0 // position where we next look for a match
|
485 |
|
|
buf := new(bytes.Buffer)
|
486 |
|
|
for searchPos <= len(src) {
|
487 |
|
|
a := re.doExecute(nil, src, "", searchPos, 2)
|
488 |
|
|
if len(a) == 0 {
|
489 |
|
|
break // no more matches
|
490 |
|
|
}
|
491 |
|
|
|
492 |
|
|
// Copy the unmatched characters before this match.
|
493 |
|
|
buf.Write(src[lastMatchEnd:a[0]])
|
494 |
|
|
|
495 |
|
|
// Now insert a copy of the replacement string, but not for a
|
496 |
|
|
// match of the empty string immediately after another match.
|
497 |
|
|
// (Otherwise, we get double replacement for patterns that
|
498 |
|
|
// match both empty and nonempty strings.)
|
499 |
|
|
if a[1] > lastMatchEnd || a[0] == 0 {
|
500 |
|
|
buf.Write(repl(src[a[0]:a[1]]))
|
501 |
|
|
}
|
502 |
|
|
lastMatchEnd = a[1]
|
503 |
|
|
|
504 |
|
|
// Advance past this match; always advance at least one character.
|
505 |
|
|
_, width := utf8.DecodeRune(src[searchPos:])
|
506 |
|
|
if searchPos+width > a[1] {
|
507 |
|
|
searchPos += width
|
508 |
|
|
} else if searchPos+1 > a[1] {
|
509 |
|
|
// This clause is only needed at the end of the input
|
510 |
|
|
// string. In that case, DecodeRuneInString returns width=0.
|
511 |
|
|
searchPos++
|
512 |
|
|
} else {
|
513 |
|
|
searchPos = a[1]
|
514 |
|
|
}
|
515 |
|
|
}
|
516 |
|
|
|
517 |
|
|
// Copy the unmatched characters after the last match.
|
518 |
|
|
buf.Write(src[lastMatchEnd:])
|
519 |
|
|
|
520 |
|
|
return buf.Bytes()
|
521 |
|
|
}
|
522 |
|
|
|
523 |
|
|
// specialBytes holds every byte that QuoteMeta treats as a regular
// expression metacharacter.
var specialBytes = []byte(`\.+*?()|[]{}^$`)

// special reports whether b is one of the bytes in specialBytes.
func special(b byte) bool {
	return bytes.IndexByte(specialBytes, b) != -1
}

// QuoteMeta returns a string that quotes all regular expression
// metacharacters inside the argument text; the returned string is a regular
// expression matching the literal text. For example, QuoteMeta(`[foo]`)
// returns `\[foo\]`.
func QuoteMeta(s string) string {
	// In the worst case every byte is escaped, doubling the length.
	quoted := make([]byte, 0, 2*len(s))

	// Working byte by byte is correct because all metacharacters are ASCII.
	for i := 0; i < len(s); i++ {
		c := s[i]
		if special(c) {
			quoted = append(quoted, '\\')
		}
		quoted = append(quoted, c)
	}
	return string(quoted)
}
|
547 |
|
|
|
548 |
|
|
// The number of capture values in the program may correspond
|
549 |
|
|
// to fewer capturing expressions than are in the regexp.
|
550 |
|
|
// For example, "(a){0}" turns into an empty program, so the
|
551 |
|
|
// maximum capture in the program is 0 but we need to return
|
552 |
|
|
// an expression for \1. Pad appends -1s to the slice a as needed.
|
553 |
|
|
func (re *Regexp) pad(a []int) []int {
|
554 |
|
|
if a == nil {
|
555 |
|
|
// No match.
|
556 |
|
|
return nil
|
557 |
|
|
}
|
558 |
|
|
n := (1 + re.numSubexp) * 2
|
559 |
|
|
for len(a) < n {
|
560 |
|
|
a = append(a, -1)
|
561 |
|
|
}
|
562 |
|
|
return a
|
563 |
|
|
}
|
564 |
|
|
|
565 |
|
|
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
|
566 |
|
|
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
|
567 |
|
|
var end int
|
568 |
|
|
if b == nil {
|
569 |
|
|
end = len(s)
|
570 |
|
|
} else {
|
571 |
|
|
end = len(b)
|
572 |
|
|
}
|
573 |
|
|
|
574 |
|
|
for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
|
575 |
|
|
matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
|
576 |
|
|
if len(matches) == 0 {
|
577 |
|
|
break
|
578 |
|
|
}
|
579 |
|
|
|
580 |
|
|
accept := true
|
581 |
|
|
if matches[1] == pos {
|
582 |
|
|
// We've found an empty match.
|
583 |
|
|
if matches[0] == prevMatchEnd {
|
584 |
|
|
// We don't allow an empty match right
|
585 |
|
|
// after a previous match, so ignore it.
|
586 |
|
|
accept = false
|
587 |
|
|
}
|
588 |
|
|
var width int
|
589 |
|
|
// TODO: use step()
|
590 |
|
|
if b == nil {
|
591 |
|
|
_, width = utf8.DecodeRuneInString(s[pos:end])
|
592 |
|
|
} else {
|
593 |
|
|
_, width = utf8.DecodeRune(b[pos:end])
|
594 |
|
|
}
|
595 |
|
|
if width > 0 {
|
596 |
|
|
pos += width
|
597 |
|
|
} else {
|
598 |
|
|
pos = end + 1
|
599 |
|
|
}
|
600 |
|
|
} else {
|
601 |
|
|
pos = matches[1]
|
602 |
|
|
}
|
603 |
|
|
prevMatchEnd = matches[1]
|
604 |
|
|
|
605 |
|
|
if accept {
|
606 |
|
|
deliver(re.pad(matches))
|
607 |
|
|
i++
|
608 |
|
|
}
|
609 |
|
|
}
|
610 |
|
|
}
|
611 |
|
|
|
612 |
|
|
// Find returns a slice holding the text of the leftmost match in b of the regular expression.
|
613 |
|
|
// A return value of nil indicates no match.
|
614 |
|
|
func (re *Regexp) Find(b []byte) []byte {
|
615 |
|
|
a := re.doExecute(nil, b, "", 0, 2)
|
616 |
|
|
if a == nil {
|
617 |
|
|
return nil
|
618 |
|
|
}
|
619 |
|
|
return b[a[0]:a[1]]
|
620 |
|
|
}
|
621 |
|
|
|
622 |
|
|
// FindIndex returns a two-element slice of integers defining the location of
|
623 |
|
|
// the leftmost match in b of the regular expression. The match itself is at
|
624 |
|
|
// b[loc[0]:loc[1]].
|
625 |
|
|
// A return value of nil indicates no match.
|
626 |
|
|
func (re *Regexp) FindIndex(b []byte) (loc []int) {
|
627 |
|
|
a := re.doExecute(nil, b, "", 0, 2)
|
628 |
|
|
if a == nil {
|
629 |
|
|
return nil
|
630 |
|
|
}
|
631 |
|
|
return a[0:2]
|
632 |
|
|
}
|
633 |
|
|
|
634 |
|
|
// FindString returns a string holding the text of the leftmost match in s of the regular
|
635 |
|
|
// expression. If there is no match, the return value is an empty string,
|
636 |
|
|
// but it will also be empty if the regular expression successfully matches
|
637 |
|
|
// an empty string. Use FindStringIndex or FindStringSubmatch if it is
|
638 |
|
|
// necessary to distinguish these cases.
|
639 |
|
|
func (re *Regexp) FindString(s string) string {
|
640 |
|
|
a := re.doExecute(nil, nil, s, 0, 2)
|
641 |
|
|
if a == nil {
|
642 |
|
|
return ""
|
643 |
|
|
}
|
644 |
|
|
return s[a[0]:a[1]]
|
645 |
|
|
}
|
646 |
|
|
|
647 |
|
|
// FindStringIndex returns a two-element slice of integers defining the
|
648 |
|
|
// location of the leftmost match in s of the regular expression. The match
|
649 |
|
|
// itself is at s[loc[0]:loc[1]].
|
650 |
|
|
// A return value of nil indicates no match.
|
651 |
|
|
func (re *Regexp) FindStringIndex(s string) []int {
|
652 |
|
|
a := re.doExecute(nil, nil, s, 0, 2)
|
653 |
|
|
if a == nil {
|
654 |
|
|
return nil
|
655 |
|
|
}
|
656 |
|
|
return a[0:2]
|
657 |
|
|
}
|
658 |
|
|
|
659 |
|
|
// FindReaderIndex returns a two-element slice of integers defining the
|
660 |
|
|
// location of the leftmost match of the regular expression in text read from
|
661 |
|
|
// the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return
|
662 |
|
|
// value of nil indicates no match.
|
663 |
|
|
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
|
664 |
|
|
a := re.doExecute(r, nil, "", 0, 2)
|
665 |
|
|
if a == nil {
|
666 |
|
|
return nil
|
667 |
|
|
}
|
668 |
|
|
return a[0:2]
|
669 |
|
|
}
|
670 |
|
|
|
671 |
|
|
// FindSubmatch returns a slice of slices holding the text of the leftmost
|
672 |
|
|
// match of the regular expression in b and the matches, if any, of its
|
673 |
|
|
// subexpressions, as defined by the 'Submatch' descriptions in the package
|
674 |
|
|
// comment.
|
675 |
|
|
// A return value of nil indicates no match.
|
676 |
|
|
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
|
677 |
|
|
a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
|
678 |
|
|
if a == nil {
|
679 |
|
|
return nil
|
680 |
|
|
}
|
681 |
|
|
ret := make([][]byte, 1+re.numSubexp)
|
682 |
|
|
for i := range ret {
|
683 |
|
|
if 2*i < len(a) && a[2*i] >= 0 {
|
684 |
|
|
ret[i] = b[a[2*i]:a[2*i+1]]
|
685 |
|
|
}
|
686 |
|
|
}
|
687 |
|
|
return ret
|
688 |
|
|
}
|
689 |
|
|
|
690 |
|
|
// FindSubmatchIndex returns a slice holding the index pairs identifying the
|
691 |
|
|
// leftmost match of the regular expression in b and the matches, if any, of
|
692 |
|
|
// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
|
693 |
|
|
// in the package comment.
|
694 |
|
|
// A return value of nil indicates no match.
|
695 |
|
|
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
|
696 |
|
|
return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
|
697 |
|
|
}
|
698 |
|
|
|
699 |
|
|
// FindStringSubmatch returns a slice of strings holding the text of the
|
700 |
|
|
// leftmost match of the regular expression in s and the matches, if any, of
|
701 |
|
|
// its subexpressions, as defined by the 'Submatch' description in the
|
702 |
|
|
// package comment.
|
703 |
|
|
// A return value of nil indicates no match.
|
704 |
|
|
func (re *Regexp) FindStringSubmatch(s string) []string {
|
705 |
|
|
a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
|
706 |
|
|
if a == nil {
|
707 |
|
|
return nil
|
708 |
|
|
}
|
709 |
|
|
ret := make([]string, 1+re.numSubexp)
|
710 |
|
|
for i := range ret {
|
711 |
|
|
if 2*i < len(a) && a[2*i] >= 0 {
|
712 |
|
|
ret[i] = s[a[2*i]:a[2*i+1]]
|
713 |
|
|
}
|
714 |
|
|
}
|
715 |
|
|
return ret
|
716 |
|
|
}
|
717 |
|
|
|
718 |
|
|
// FindStringSubmatchIndex returns a slice holding the index pairs
|
719 |
|
|
// identifying the leftmost match of the regular expression in s and the
|
720 |
|
|
// matches, if any, of its subexpressions, as defined by the 'Submatch' and
|
721 |
|
|
// 'Index' descriptions in the package comment.
|
722 |
|
|
// A return value of nil indicates no match.
|
723 |
|
|
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
|
724 |
|
|
return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
|
725 |
|
|
}
|
726 |
|
|
|
727 |
|
|
// FindReaderSubmatchIndex returns a slice holding the index pairs
|
728 |
|
|
// identifying the leftmost match of the regular expression of text read by
|
729 |
|
|
// the RuneReader, and the matches, if any, of its subexpressions, as defined
|
730 |
|
|
// by the 'Submatch' and 'Index' descriptions in the package comment. A
|
731 |
|
|
// return value of nil indicates no match.
|
732 |
|
|
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
|
733 |
|
|
return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
|
734 |
|
|
}
|
735 |
|
|
|
736 |
|
|
// startSize is the initial capacity the 'All' routines give their result
// slice; append grows the slice when more matches are found.
const startSize = 10 // The size at which to start a slice in the 'All' routines.
|
737 |
|
|
|
738 |
|
|
// FindAll is the 'All' version of Find; it returns a slice of all successive
|
739 |
|
|
// matches of the expression, as defined by the 'All' description in the
|
740 |
|
|
// package comment.
|
741 |
|
|
// A return value of nil indicates no match.
|
742 |
|
|
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
|
743 |
|
|
if n < 0 {
|
744 |
|
|
n = len(b) + 1
|
745 |
|
|
}
|
746 |
|
|
result := make([][]byte, 0, startSize)
|
747 |
|
|
re.allMatches("", b, n, func(match []int) {
|
748 |
|
|
result = append(result, b[match[0]:match[1]])
|
749 |
|
|
})
|
750 |
|
|
if len(result) == 0 {
|
751 |
|
|
return nil
|
752 |
|
|
}
|
753 |
|
|
return result
|
754 |
|
|
}
|
755 |
|
|
|
756 |
|
|
// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
|
757 |
|
|
// successive matches of the expression, as defined by the 'All' description
|
758 |
|
|
// in the package comment.
|
759 |
|
|
// A return value of nil indicates no match.
|
760 |
|
|
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
|
761 |
|
|
if n < 0 {
|
762 |
|
|
n = len(b) + 1
|
763 |
|
|
}
|
764 |
|
|
result := make([][]int, 0, startSize)
|
765 |
|
|
re.allMatches("", b, n, func(match []int) {
|
766 |
|
|
result = append(result, match[0:2])
|
767 |
|
|
})
|
768 |
|
|
if len(result) == 0 {
|
769 |
|
|
return nil
|
770 |
|
|
}
|
771 |
|
|
return result
|
772 |
|
|
}
|
773 |
|
|
|
774 |
|
|
// FindAllString is the 'All' version of FindString; it returns a slice of all
|
775 |
|
|
// successive matches of the expression, as defined by the 'All' description
|
776 |
|
|
// in the package comment.
|
777 |
|
|
// A return value of nil indicates no match.
|
778 |
|
|
func (re *Regexp) FindAllString(s string, n int) []string {
|
779 |
|
|
if n < 0 {
|
780 |
|
|
n = len(s) + 1
|
781 |
|
|
}
|
782 |
|
|
result := make([]string, 0, startSize)
|
783 |
|
|
re.allMatches(s, nil, n, func(match []int) {
|
784 |
|
|
result = append(result, s[match[0]:match[1]])
|
785 |
|
|
})
|
786 |
|
|
if len(result) == 0 {
|
787 |
|
|
return nil
|
788 |
|
|
}
|
789 |
|
|
return result
|
790 |
|
|
}
|
791 |
|
|
|
792 |
|
|
// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
|
793 |
|
|
// slice of all successive matches of the expression, as defined by the 'All'
|
794 |
|
|
// description in the package comment.
|
795 |
|
|
// A return value of nil indicates no match.
|
796 |
|
|
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
|
797 |
|
|
if n < 0 {
|
798 |
|
|
n = len(s) + 1
|
799 |
|
|
}
|
800 |
|
|
result := make([][]int, 0, startSize)
|
801 |
|
|
re.allMatches(s, nil, n, func(match []int) {
|
802 |
|
|
result = append(result, match[0:2])
|
803 |
|
|
})
|
804 |
|
|
if len(result) == 0 {
|
805 |
|
|
return nil
|
806 |
|
|
}
|
807 |
|
|
return result
|
808 |
|
|
}
|
809 |
|
|
|
810 |
|
|
// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
|
811 |
|
|
// of all successive matches of the expression, as defined by the 'All'
|
812 |
|
|
// description in the package comment.
|
813 |
|
|
// A return value of nil indicates no match.
|
814 |
|
|
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
|
815 |
|
|
if n < 0 {
|
816 |
|
|
n = len(b) + 1
|
817 |
|
|
}
|
818 |
|
|
result := make([][][]byte, 0, startSize)
|
819 |
|
|
re.allMatches("", b, n, func(match []int) {
|
820 |
|
|
slice := make([][]byte, len(match)/2)
|
821 |
|
|
for j := range slice {
|
822 |
|
|
if match[2*j] >= 0 {
|
823 |
|
|
slice[j] = b[match[2*j]:match[2*j+1]]
|
824 |
|
|
}
|
825 |
|
|
}
|
826 |
|
|
result = append(result, slice)
|
827 |
|
|
})
|
828 |
|
|
if len(result) == 0 {
|
829 |
|
|
return nil
|
830 |
|
|
}
|
831 |
|
|
return result
|
832 |
|
|
}
|
833 |
|
|
|
834 |
|
|
// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
|
835 |
|
|
// a slice of all successive matches of the expression, as defined by the
|
836 |
|
|
// 'All' description in the package comment.
|
837 |
|
|
// A return value of nil indicates no match.
|
838 |
|
|
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
|
839 |
|
|
if n < 0 {
|
840 |
|
|
n = len(b) + 1
|
841 |
|
|
}
|
842 |
|
|
result := make([][]int, 0, startSize)
|
843 |
|
|
re.allMatches("", b, n, func(match []int) {
|
844 |
|
|
result = append(result, match)
|
845 |
|
|
})
|
846 |
|
|
if len(result) == 0 {
|
847 |
|
|
return nil
|
848 |
|
|
}
|
849 |
|
|
return result
|
850 |
|
|
}
|
851 |
|
|
|
852 |
|
|
// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
|
853 |
|
|
// returns a slice of all successive matches of the expression, as defined by
|
854 |
|
|
// the 'All' description in the package comment.
|
855 |
|
|
// A return value of nil indicates no match.
|
856 |
|
|
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
|
857 |
|
|
if n < 0 {
|
858 |
|
|
n = len(s) + 1
|
859 |
|
|
}
|
860 |
|
|
result := make([][]string, 0, startSize)
|
861 |
|
|
re.allMatches(s, nil, n, func(match []int) {
|
862 |
|
|
slice := make([]string, len(match)/2)
|
863 |
|
|
for j := range slice {
|
864 |
|
|
if match[2*j] >= 0 {
|
865 |
|
|
slice[j] = s[match[2*j]:match[2*j+1]]
|
866 |
|
|
}
|
867 |
|
|
}
|
868 |
|
|
result = append(result, slice)
|
869 |
|
|
})
|
870 |
|
|
if len(result) == 0 {
|
871 |
|
|
return nil
|
872 |
|
|
}
|
873 |
|
|
return result
|
874 |
|
|
}
|
875 |
|
|
|
876 |
|
|
// FindAllStringSubmatchIndex is the 'All' version of
|
877 |
|
|
// FindStringSubmatchIndex; it returns a slice of all successive matches of
|
878 |
|
|
// the expression, as defined by the 'All' description in the package
|
879 |
|
|
// comment.
|
880 |
|
|
// A return value of nil indicates no match.
|
881 |
|
|
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
|
882 |
|
|
if n < 0 {
|
883 |
|
|
n = len(s) + 1
|
884 |
|
|
}
|
885 |
|
|
result := make([][]int, 0, startSize)
|
886 |
|
|
re.allMatches(s, nil, n, func(match []int) {
|
887 |
|
|
result = append(result, match)
|
888 |
|
|
})
|
889 |
|
|
if len(result) == 0 {
|
890 |
|
|
return nil
|
891 |
|
|
}
|
892 |
|
|
return result
|
893 |
|
|
}
|