OpenCores
URL https://opencores.org/ocsvn/scarts/scarts/trunk

Subversion Repositories scarts

[/] [scarts/] [trunk/] [toolchain/] [scarts-gcc/] [gcc-4.1.1/] [libjava/] [classpath/] [gnu/] [regexp/] [RE.java] - Blame information for rev 14

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 14 jlechner
/* gnu/regexp/RE.java
2
   Copyright (C) 2006 Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version. */
37
 
38
package gnu.regexp;
39
import java.io.InputStream;
40
import java.io.Serializable;
41
import java.util.Locale;
42
import java.util.PropertyResourceBundle;
43
import java.util.ResourceBundle;
44
import java.util.Vector;
45
 
46
/**
47
 * RE provides the user interface for compiling and matching regular
48
 * expressions.
49
 * <P>
50
 * A regular expression object (class RE) is compiled by constructing it
51
 * from a String, StringBuffer or character array, with optional
52
 * compilation flags (below)
53
 * and an optional syntax specification (see RESyntax; if not specified,
54
 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
55
 * <P>
56
 * Once compiled, a regular expression object is reusable as well as
57
 * threadsafe: multiple threads can use the RE instance simultaneously
58
 * to match against different input text.
59
 * <P>
60
 * Various methods attempt to match input text against a compiled
61
 * regular expression.  These methods are:
62
 * <LI><code>isMatch</code>: returns true if the input text in its
63
 * entirety matches the regular expression pattern.
64
 * <LI><code>getMatch</code>: returns the first match found in the
65
 * input text, or null if no match is found.
66
 * <LI><code>getAllMatches</code>: returns an array of all
67
 * non-overlapping matches found in the input text.  If no matches are
68
 * found, the array is zero-length.
69
 * <LI><code>substitute</code>: substitute the first occurrence of the
70
 * pattern in the input text with a replacement string (which may
71
 * include metacharacters $0-$9, see REMatch.substituteInto).
72
 * <LI><code>substituteAll</code>: same as above, but repeat for each
73
 * match before returning.
74
 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
75
 * object that allows iteration over the matches (see
76
 * REMatchEnumeration for some reasons why you may want to do this
77
 * instead of using <code>getAllMatches</code>).
78
 * <P>
79
 *
80
 * These methods all have similar argument lists.  The input can be a
81
 * String, a character array, a StringBuffer, or an
82
 * InputStream of some sort.  Note that when using an
83
 * InputStream, the stream read position cannot be guaranteed after
84
 * attempting a match (this is not a bug, but a consequence of the way
85
 * regular expressions work).  Using an REMatchEnumeration can
86
 * eliminate most positioning problems.
87
 *
88
 * <P>
89
 *
90
 * The optional index argument specifies the offset from the beginning
91
 * of the text at which the search should start (see the descriptions
92
 * of some of the execution flags for how this can affect positional
93
 * pattern operators).  For an InputStream, this means an
94
 * offset from the current read position, so subsequent calls with the
95
 * same index argument on an InputStream will not
96
 * necessarily access the same position on the stream, whereas
97
 * repeated searches at a given index in a fixed string will return
98
 * consistent results.
99
 *
100
 * <P>
101
 * You can optionally affect the execution environment by using a
102
 * combination of execution flags (constants listed below).
103
 *
104
 * <P>
105
 * All operations on a regular expression are performed in a
106
 * thread-safe manner.
107
 *
108
 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
109
 * @version 1.1.5-dev, to be released
110
 */
111
 
112
public class RE extends REToken {
113
 
114
  private static final class IntPair implements Serializable {
115
    public int first, second;
116
  }
117
 
118
  private static final class CharUnit implements Serializable {
119
    public char ch;
120
    public boolean bk;
121
  }
122
 
123
  // This String will be returned by version()
124
  private static final String VERSION = "1.1.5-dev";
125
 
126
  // The localized strings are kept in a separate file
127
  private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
128
 
129
  // These are, respectively, the first and last tokens in our linked list
130
  // If there is only one token, firstToken == lastToken
131
  private REToken firstToken, lastToken;
132
 
133
  // This is the number of subexpressions in this regular expression,
134
  // with a minimum value of zero.  Returned by getNumSubs()
135
  private int numSubs;
136
 
137
    /** Minimum length, in characters, of any possible match. */
138
    private int minimumLength;
139
    private int maximumLength;
140
 
141
  /**
142
   * Compilation flag. Do  not  differentiate  case.   Subsequent
143
   * searches  using  this  RE will be case insensitive.
144
   */
145
  public static final int REG_ICASE = 0x02;
146
 
147
  /**
148
   * Compilation flag. The match-any-character operator (dot)
149
   * will match a newline character.  When set this overrides the syntax
150
   * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
151
   * the "/s" operator in Perl.
152
   */
153
  public static final int REG_DOT_NEWLINE = 0x04;
154
 
155
  /**
156
   * Compilation flag. Use multiline mode.  In this mode, the ^ and $
157
   * anchors will match based on newlines within the input. This is
158
   * equivalent to the "/m" operator in Perl.
159
   */
160
  public static final int REG_MULTILINE = 0x08;
161
 
162
  /**
163
   * Execution flag.
164
   * The match-beginning operator (^) will not match at the beginning
165
   * of the input string. Useful for matching on a substring when you
166
   * know the context of the input is such that position zero of the
167
   * input to the match test is not actually position zero of the text.
168
   * <P>
169
   * This example demonstrates the results of various ways of matching on
170
   * a substring.
171
   * <P>
172
   * <CODE>
173
   * String s = "food bar fool";<BR>
174
   * RE exp = new RE("^foo.");<BR>
175
   * REMatch m0 = exp.getMatch(s);<BR>
176
   * REMatch m1 = exp.getMatch(s.substring(8));<BR>
177
   * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
178
   * REMatch m3 = exp.getMatch(s,8);                            <BR>
179
   * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
180
   * <P>
181
   * // Results:<BR>
182
   * //  m0.toString(): "food"<BR>
183
   * //  m1.toString(): "fool"<BR>
184
   * //  m2.toString(): null<BR>
185
   * //  m3.toString(): null<BR>
186
   * //  m4.toString(): "fool"<BR>
187
   * </CODE>
188
   */
189
  public static final int REG_NOTBOL = 0x10;
190
 
191
  /**
192
   * Execution flag.
193
   * The match-end operator ($) does not match at the end
194
   * of the input string. Useful for matching on substrings.
195
   */
196
  public static final int REG_NOTEOL = 0x20;
197
 
198
  /**
199
   * Execution flag.
200
   * When a match method is invoked that starts matching at a non-zero
201
   * index into the input, treat the input as if it begins at the index
202
   * given.  The effect of this flag is that the engine does not "see"
203
   * any text in the input before the given index.  This is useful so
204
   * that the match-beginning operator (^) matches not at position 0
205
   * in the input string, but at the position the search started at
206
   * (based on the index input given to the getMatch function).  See
207
   * the example under REG_NOTBOL.  It also affects the use of the \&lt;
208
   * and \b operators.
209
   */
210
  public static final int REG_ANCHORINDEX = 0x40;
211
 
212
  /**
213
   * Execution flag.
214
   * The substitute and substituteAll methods will not attempt to
215
   * interpolate occurrences of $1-$9 in the replacement text with
216
   * the corresponding subexpressions.  For example, you may want to
217
   * replace all matches of "one dollar" with "$1".
218
   */
219
  public static final int REG_NO_INTERPOLATE = 0x80;
220
 
221
  /**
222
   * Execution flag.
223
   * Try to match the whole input string. An implicit match-end operator
224
   * is added to this regexp.
225
   */
226
  public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
227
 
228
  /**
229
   * Execution flag.
230
   * The substitute and substituteAll methods will treat the
231
   * character '\' in the replacement as an escape to a literal
232
   * character. In this case "\n", "\$", "\\", "\x40" and "\012"
233
   * will become "n", "$", "\", "x40" and "012" respectively.
234
   * This flag has no effect if REG_NO_INTERPOLATE is set on.
235
   */
236
  public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
237
 
238
  /** Returns a string representing the version of the gnu.regexp package. */
239
  public static final String version() {
240
    return VERSION;
241
  }
242
 
243
  // Retrieves a message from the ResourceBundle
244
  static final String getLocalizedMessage(String key) {
245
    return messages.getString(key);
246
  }
247
 
248
  /**
249
   * Constructs a regular expression pattern buffer without any compilation
250
   * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
251
   *
252
   * @param pattern A regular expression pattern, in the form of a String,
253
   *   StringBuffer or char[].  Other input types will be converted to
254
   *   strings using the toString() method.
255
   * @exception REException The input pattern could not be parsed.
256
   * @exception NullPointerException The pattern was null.
257
   */
258
  public RE(Object pattern) throws REException {
    // Delegate to the private five-argument constructor with no compilation
    // flags, the PERL5 syntax, subexpression index 0 and nextSub 0.
    this(pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
  }
261
 
262
  /**
263
   * Constructs a regular expression pattern buffer using the specified
264
   * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
265
   *
266
   * @param pattern A regular expression pattern, in the form of a String,
267
   *   StringBuffer, or char[].  Other input types will be converted to
268
   *   strings using the toString() method.
269
   * @param cflags The logical OR of any combination of the compilation flags listed above.
270
   * @exception REException The input pattern could not be parsed.
271
   * @exception NullPointerException The pattern was null.
272
   */
273
  public RE(Object pattern, int cflags) throws REException {
    // Delegate to the private five-argument constructor with the caller's
    // flags, the PERL5 syntax, subexpression index 0 and nextSub 0.
    this(pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
  }
276
 
277
  /**
278
   * Constructs a regular expression pattern buffer using the specified
279
   * compilation flags and regular expression syntax.
280
   *
281
   * @param pattern A regular expression pattern, in the form of a String,
282
   *   StringBuffer, or char[].  Other input types will be converted to
283
   *   strings using the toString() method.
284
   * @param cflags The logical OR of any combination of the compilation flags listed above.
285
   * @param syntax The type of regular expression syntax to use.
286
   * @exception REException The input pattern could not be parsed.
287
   * @exception NullPointerException The pattern was null.
288
   */
289
  public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
    // Delegate to the private five-argument constructor with the caller's
    // flags and syntax, subexpression index 0 and nextSub 0.
    this(pattern, cflags, syntax, 0, 0);
  }
292
 
293
  // internal constructor used for alternation
294
  private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
    super(subIndex);
    // Adopt the token chain and bookkeeping values supplied by the caller
    // unchanged; initialize() uses this to wrap what it has parsed so far
    // into a branch when it meets an alternation operator.
    firstToken = first;
    lastToken = last;
    numSubs = subs;
    minimumLength = minLength;
    maximumLength = maxLength;
    // Add an RETokenEndSub carrying the same subexpression index.
    addToken(new RETokenEndSub(subIndex));
  }
303
 
304
  // Full-argument constructor that all public constructors delegate to; it is
  // also called by initialize() to build a nested RE for a parenthesised
  // subexpression.
  private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
    super(myIndex); // Subexpression index of this token.
    // All pattern parsing happens in initialize().
    initialize(patternObj, cflags, syntax, myIndex, nextSub);
  }
308
 
309
    // For use by subclasses
310
    protected RE() {
      // Subexpression index 0; no pattern is parsed here.
      // NOTE(review): presumably subclasses call initialize() themselves -- confirm.
      super(0);
    }
311
 
312
    // The meat of construction
313
  protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
314
      char[] pattern;
315
    if (patternObj instanceof String) {
316
      pattern = ((String) patternObj).toCharArray();
317
    } else if (patternObj instanceof char[]) {
318
      pattern = (char[]) patternObj;
319
    } else if (patternObj instanceof StringBuffer) {
320
      pattern = new char [((StringBuffer) patternObj).length()];
321
      ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
322
    } else {
323
        pattern = patternObj.toString().toCharArray();
324
    }
325
 
326
    int pLength = pattern.length;
327
 
328
    numSubs = 0; // Number of subexpressions in this token.
329
    Vector branches = null;
330
 
331
    // linked list of tokens (sort of -- some closed loops can exist)
332
    firstToken = lastToken = null;
333
 
334
    // Precalculate these so we don't pay for the math every time we
335
    // need to access them.
336
    boolean insens = ((cflags & REG_ICASE) > 0);
337
 
338
    // Parse pattern into tokens.  Does anyone know if it's more efficient
339
    // to use char[] than a String.charAt()?  I'm assuming so.
340
 
341
    // index tracks the position in the char array
342
    int index = 0;
343
 
344
    // this will be the current parse character (pattern[index])
345
    CharUnit unit = new CharUnit();
346
 
347
    // This is used for {x,y} calculations
348
    IntPair minMax = new IntPair();
349
 
350
    // Buffer a token so we can create a TokenRepeated, etc.
351
    REToken currentToken = null;
352
    char ch;
353
    boolean quot = false;
354
 
355
    // Saved syntax and flags.
356
    RESyntax savedSyntax = null;
357
    int savedCflags = 0;
358
    boolean flagsSaved = false;
359
 
360
    while (index < pLength) {
361
      // read the next character unit (including backslash escapes)
362
      index = getCharUnit(pattern,index,unit,quot);
363
 
364
      if (unit.bk)
365
        if (unit.ch == 'Q') {
366
          quot = true;
367
          continue;
368
        } else if (unit.ch == 'E') {
369
          quot = false;
370
          continue;
371
        }
372
      if (quot)
373
        unit.bk = false;
374
 
375
      // ALTERNATION OPERATOR
376
      //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
377
      //  not available if RE_LIMITED_OPS is set
378
 
379
      // TODO: the '\n' literal here should be a test against REToken.newline,
380
      // which unfortunately may be more than a single character.
381
      if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
382
             || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) )
383
           && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
384
        // make everything up to here be a branch. create vector if nec.
385
        addToken(currentToken);
386
        RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
387
        minimumLength = 0;
388
        maximumLength = 0;
389
        if (branches == null) {
390
            branches = new Vector();
391
        }
392
        branches.addElement(theBranch);
393
        firstToken = lastToken = currentToken = null;
394
      }
395
 
396
      // INTERVAL OPERATOR:
397
      //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
398
      //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
399
      //
400
      // OPEN QUESTION: 
401
      //  what is proper interpretation of '{' at start of string?
402
      //
403
      // This method used to check "repeat.empty.token" to avoid such regexp
404
      // as "(a*){2,}", but now "repeat.empty.token" is allowed.
405
 
406
      else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) {
407
        int newIndex = getMinMax(pattern,index,minMax,syntax);
408
        if (newIndex > index) {
409
          if (minMax.first > minMax.second)
410
            throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
411
          if (currentToken == null)
412
            throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
413
          if (currentToken instanceof RETokenRepeated)
414
            throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
415
          if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
416
            throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
417
          index = newIndex;
418
          currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
419
        }
420
        else {
421
          addToken(currentToken);
422
          currentToken = new RETokenChar(subIndex,unit.ch,insens);
423
        }
424
      }
425
 
426
      // LIST OPERATOR:
427
      //  [...] | [^...]
428
 
429
      else if ((unit.ch == '[') && !(unit.bk || quot)) {
430
        // Create a new RETokenOneOf
431
        ParseCharClassResult result = parseCharClass(
432
                subIndex, pattern, index, pLength, cflags, syntax, 0);
433
        addToken(currentToken);
434
        currentToken = result.token;
435
        index = result.index;
436
      }
437
 
438
      // SUBEXPRESSIONS
439
      //  (...) | \(...\) depending on RE_NO_BK_PARENS
440
 
441
      else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) {
442
        boolean pure = false;
443
        boolean comment = false;
444
        boolean lookAhead = false;
445
        boolean lookBehind = false;
446
        boolean independent = false;
447
        boolean negativelh = false;
448
        boolean negativelb = false;
449
        if ((index+1 < pLength) && (pattern[index] == '?')) {
450
          switch (pattern[index+1]) {
451
          case '!':
452
            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
453
              pure = true;
454
              negativelh = true;
455
              lookAhead = true;
456
              index += 2;
457
            }
458
            break;
459
          case '=':
460
            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
461
              pure = true;
462
              lookAhead = true;
463
              index += 2;
464
            }
465
            break;
466
          case '<':
467
            // We assume that if the syntax supports look-ahead,
468
            // it also supports look-behind.
469
            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
470
                index++;
471
                switch (pattern[index +1]) {
472
                case '!':
473
                  pure = true;
474
                  negativelb = true;
475
                  lookBehind = true;
476
                  index += 2;
477
                  break;
478
                case '=':
479
                  pure = true;
480
                  lookBehind = true;
481
                  index += 2;
482
                }
483
            }
484
            break;
485
          case '>':
486
            // We assume that if the syntax supports look-ahead,
487
            // it also supports independent group.
488
            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
489
              pure = true;
490
              independent = true;
491
              index += 2;
492
            }
493
            break;
494
          case 'i':
495
          case 'd':
496
          case 'm':
497
          case 's':
498
          // case 'u':  not supported
499
          // case 'x':  not supported
500
          case '-':
501
            if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
502
            // Set or reset syntax flags.
503
            int flagIndex = index + 1;
504
            int endFlag = -1;
505
            RESyntax newSyntax = new RESyntax(syntax);
506
            int newCflags = cflags;
507
            boolean negate = false;
508
            while (flagIndex < pLength && endFlag < 0) {
509
                switch(pattern[flagIndex]) {
510
                case 'i':
511
                  if (negate)
512
                    newCflags &= ~REG_ICASE;
513
                  else
514
                    newCflags |= REG_ICASE;
515
                  flagIndex++;
516
                  break;
517
                case 'd':
518
                  if (negate)
519
                    newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
520
                  else
521
                    newSyntax.setLineSeparator("\n");
522
                  flagIndex++;
523
                  break;
524
                case 'm':
525
                  if (negate)
526
                    newCflags &= ~REG_MULTILINE;
527
                  else
528
                    newCflags |= REG_MULTILINE;
529
                  flagIndex++;
530
                  break;
531
                case 's':
532
                  if (negate)
533
                    newCflags &= ~REG_DOT_NEWLINE;
534
                  else
535
                    newCflags |= REG_DOT_NEWLINE;
536
                  flagIndex++;
537
                  break;
538
                // case 'u': not supported
539
                // case 'x': not supported
540
                case '-':
541
                  negate = true;
542
                  flagIndex++;
543
                  break;
544
                case ':':
545
                case ')':
546
                  endFlag = pattern[flagIndex];
547
                  break;
548
                default:
549
                  throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
550
                }
551
            }
552
            if (endFlag == ')') {
553
                syntax = newSyntax;
554
                cflags = newCflags;
555
                insens = ((cflags & REG_ICASE) > 0);
556
                // This can be treated as though it were a comment.
557
                comment = true;
558
                index = flagIndex - 1;
559
                break;
560
            }
561
            if (endFlag == ':') {
562
                savedSyntax = syntax;
563
                savedCflags = cflags;
564
                flagsSaved = true;
565
                syntax = newSyntax;
566
                cflags = newCflags;
567
                insens = ((cflags & REG_ICASE) > 0);
568
                index = flagIndex -1;
569
                // Fall through to the next case.
570
            }
571
            else {
572
                throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
573
            }
574
          case ':':
575
            if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
576
              pure = true;
577
              index += 2;
578
            }
579
            break;
580
          case '#':
581
            if (syntax.get(RESyntax.RE_COMMENTS)) {
582
              comment = true;
583
            }
584
            break;
585
          default:
586
            throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
587
          }
588
        }
589
 
590
        if (index >= pLength) {
591
            throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
592
        }
593
 
594
        // find end of subexpression
595
        int endIndex = index;
596
        int nextIndex = index;
597
        int nested = 0;
598
 
599
        while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0)
600
                && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) {
601
          if ((endIndex = nextIndex) >= pLength)
602
            throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
603
          else if ((unit.ch == '[') && !(unit.bk || quot)) {
604
            // I hate to do something similar to the LIST OPERATOR matters
605
            // above, but ...
606
            int listIndex = nextIndex;
607
            if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
608
            if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
609
            int listEndIndex = -1;
610
            int listNest = 0;
611
            while (listIndex < pLength && listEndIndex < 0) {
612
              switch(pattern[listIndex++]) {
613
                case '\\':
614
                  listIndex++;
615
                  break;
616
                case '[':
617
                  // Sun's API document says that regexp like "[a-d[m-p]]"
618
                  // is legal. Even something like "[[[^]]]]" is accepted.
619
                  listNest++;
620
                  if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
621
                  if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
622
                  break;
623
                case ']':
624
                  if (listNest == 0)
625
                    listEndIndex = listIndex;
626
                  listNest--;
627
                  break;
628
              }
629
            }
630
            if (listEndIndex >= 0) {
631
              nextIndex = listEndIndex;
632
              if ((endIndex = nextIndex) >= pLength)
633
                throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
634
              else
635
                continue;
636
            }
637
            throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
638
          }
639
          else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
640
            nested++;
641
          else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
642
            nested--;
643
        }
644
 
645
        // endIndex is now position at a ')','\)' 
646
        // nextIndex is end of string or position after ')' or '\)'
647
 
648
        if (comment) index = nextIndex;
649
        else { // not a comment
650
          // create RE subexpression as token.
651
          addToken(currentToken);
652
          if (!pure) {
653
            numSubs++;
654
          }
655
 
656
          int useIndex = (pure || lookAhead || lookBehind || independent) ?
657
 
658
          currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
659
          numSubs += ((RE) currentToken).getNumSubs();
660
 
661
          if (lookAhead) {
662
              currentToken = new RETokenLookAhead(currentToken,negativelh);
663
          }
664
          else if (lookBehind) {
665
              currentToken = new RETokenLookBehind(currentToken,negativelb);
666
          }
667
          else if (independent) {
668
              currentToken = new RETokenIndependent(currentToken);
669
          }
670
 
671
          index = nextIndex;
672
          if (flagsSaved) {
673
              syntax = savedSyntax;
674
              cflags = savedCflags;
675
              insens = ((cflags & REG_ICASE) > 0);
676
              flagsSaved = false;
677
          }
678
        } // not a comment
679
      } // subexpression
680
 
681
      // UNMATCHED RIGHT PAREN
682
      // ) or \) throw exception if
683
      // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
684
      else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) {
685
        throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
686
      }
687
 
688
      // START OF LINE OPERATOR
689
      //  ^
690
 
691
      else if ((unit.ch == '^') && !(unit.bk || quot)) {
692
        addToken(currentToken);
693
        currentToken = null;
694
        addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
695
      }
696
 
697
      // END OF LINE OPERATOR
698
      //  $
699
 
700
      else if ((unit.ch == '$') && !(unit.bk || quot)) {
701
        addToken(currentToken);
702
        currentToken = null;
703
        addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
704
      }
705
 
706
      // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
707
      //  .
708
 
709
      else if ((unit.ch == '.') && !(unit.bk || quot)) {
710
        addToken(currentToken);
711
        currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
712
      }
713
 
714
      // ZERO-OR-MORE REPEAT OPERATOR
715
      //  *
716
      //
717
      // This method used to check "repeat.empty.token" to avoid such regexp
718
      // as "(a*)*", but now "repeat.empty.token" is allowed.
719
 
720
      else if ((unit.ch == '*') && !(unit.bk || quot)) {
721
        if (currentToken == null)
722
          throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
723
        if (currentToken instanceof RETokenRepeated)
724
          throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
725
        if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
726
          throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
727
        currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
728
      }
729
 
730
      // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
731
      //  + | \+ depending on RE_BK_PLUS_QM
732
      //  not available if RE_LIMITED_OPS is set
733
      //
734
      // This method used to check "repeat.empty.token" to avoid such regexp
735
      // as "(a*)+", but now "repeat.empty.token" is allowed.
736
 
737
      else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
738
        if (currentToken == null)
739
          throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
740
 
741
        // Check for possessive matching on RETokenRepeated
742
        if (currentToken instanceof RETokenRepeated) {
743
          RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
744
          if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
745
            tokenRep.makePossessive();
746
          else
747
            throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
748
 
749
        }
750
        else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
751
          throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
752
        else
753
          currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
754
      }
755
 
756
      // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
757
      //  ? | \? depending on RE_BK_PLUS_QM
758
      //  not available if RE_LIMITED_OPS is set
759
      //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
760
 
761
      else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
762
        if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
763
 
764
        // Check for stingy matching on RETokenRepeated
765
        if (currentToken instanceof RETokenRepeated) {
766
          RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
767
          if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
768
            tokenRep.makeStingy();
769
          else
770
            throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
771
        }
772
        else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
773
          throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
774
        else
775
          currentToken = setRepeated(currentToken,0,1,index);
776
      }
777
 
778
      // OCTAL CHARACTER
779
      //  \0377
780
 
781
      else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
782
        CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
783
        if (ce == null)
784
          throw new REException("invalid octal character", REException.REG_ESCAPE, index);
785
        index = index - 2 + ce.len;
786
        addToken(currentToken);
787
        currentToken = new RETokenChar(subIndex,ce.ch,insens);
788
      }
789
 
790
      // BACKREFERENCE OPERATOR
791
      //  \1 \2 ... \9 and \10 \11 \12 ...
792
      // not available if RE_NO_BK_REFS is set
793
      // Perl recognizes \10, \11, and so on only if enough number of
794
      // parentheses have opened before it, otherwise they are treated
795
      // as aliases of \010, \011, ... (octal characters).  In case of
796
      // Sun's JDK, octal character expression must always begin with \0.
797
      // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
798
      // JDK treats \2 as a back reference to the 2nd group because
799
      // there are only two groups. But in our poor implementation,
800
      // we cannot help but treat \29 as a back reference to the 29th group.
801
 
802
      else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
803
        addToken(currentToken);
804
        int numBegin = index - 1;
805
        int numEnd = pLength;
806
        for (int i = index; i < pLength; i++) {
807
            if (! Character.isDigit(pattern[i])) {
808
                numEnd = i;
809
                break;
810
            }
811
        }
812
        int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
813
 
814
        currentToken = new RETokenBackRef(subIndex,num,insens);
815
        index = numEnd;
816
      }
817
 
818
      // START OF STRING OPERATOR
819
      //  \A if RE_STRING_ANCHORS is set
820
 
821
      else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
822
        addToken(currentToken);
823
        currentToken = new RETokenStart(subIndex,null);
824
      }
825
 
826
      // WORD BREAK OPERATOR
827
      //  \b if ????
828
 
829
      else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
830
          addToken(currentToken);
831
          currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
832
      }
833
 
834
      // WORD BEGIN OPERATOR 
835
      //  \< if ????
836
      else if (unit.bk && (unit.ch == '<')) {
837
          addToken(currentToken);
838
          currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
839
      }
840
 
841
      // WORD END OPERATOR 
842
      //  \> if ????
843
      else if (unit.bk && (unit.ch == '>')) {
844
          addToken(currentToken);
845
          currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
846
      }
847
 
848
      // NON-WORD BREAK OPERATOR
849
      // \B if ????
850
 
851
      else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
852
          addToken(currentToken);
853
          currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
854
      }
855
 
856
 
857
      // DIGIT OPERATOR
858
      //  \d if RE_CHAR_CLASS_ESCAPES is set
859
 
860
      else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
861
        addToken(currentToken);
862
        currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
863
      }
864
 
865
      // NON-DIGIT OPERATOR
866
      //  \D
867
 
868
        else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
869
          addToken(currentToken);
870
          currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
871
        }
872
 
873
        // NEWLINE ESCAPE
874
        //  \n
875
 
876
        else if (unit.bk && (unit.ch == 'n')) {
877
          addToken(currentToken);
878
          currentToken = new RETokenChar(subIndex,'\n',false);
879
        }
880
 
881
        // RETURN ESCAPE
882
        //  \r
883
 
884
        else if (unit.bk && (unit.ch == 'r')) {
885
          addToken(currentToken);
886
          currentToken = new RETokenChar(subIndex,'\r',false);
887
        }
888
 
889
        // WHITESPACE OPERATOR
890
        //  \s if RE_CHAR_CLASS_ESCAPES is set
891
 
892
        else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
893
          addToken(currentToken);
894
          currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
895
        }
896
 
897
        // NON-WHITESPACE OPERATOR
898
        //  \S
899
 
900
        else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
901
          addToken(currentToken);
902
          currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
903
        }
904
 
905
        // TAB ESCAPE
906
        //  \t
907
 
908
        else if (unit.bk && (unit.ch == 't')) {
909
          addToken(currentToken);
910
          currentToken = new RETokenChar(subIndex,'\t',false);
911
        }
912
 
913
        // ALPHANUMERIC OPERATOR
914
        //  \w
915
 
916
        else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
917
          addToken(currentToken);
918
          currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
919
        }
920
 
921
        // NON-ALPHANUMERIC OPERATOR
922
        //  \W
923
 
924
        else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
925
          addToken(currentToken);
926
          currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
927
        }
928
 
929
        // END OF STRING OPERATOR
930
        //  \Z
931
 
932
        else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
933
          addToken(currentToken);
934
          currentToken = new RETokenEnd(subIndex,null);
935
        }
936
 
937
        // HEX CHARACTER, UNICODE CHARACTER
938
        //  \x1B, \u1234
939
 
940
        else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
941
                 (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
942
          CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
943
          if (ce == null)
944
            throw new REException("invalid hex character", REException.REG_ESCAPE, index);
945
          index = index - 2 + ce.len;
946
          addToken(currentToken);
947
          currentToken = new RETokenChar(subIndex,ce.ch,insens);
948
        }
949
 
950
        // NAMED PROPERTY
951
        // \p{prop}, \P{prop}
952
 
953
        else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
954
                 (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
955
          NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
956
          if (np == null)
957
              throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
958
          index = index - 2 + np.len;
959
          addToken(currentToken);
960
          currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
961
        }
962
 
963
        // NON-SPECIAL CHARACTER (or escape to make literal)
964
        //  c | \* for example
965
 
966
        else {  // not a special character
967
          addToken(currentToken);
968
          currentToken = new RETokenChar(subIndex,unit.ch,insens);
969
        }
970
      } // end while
971
 
972
    // Add final buffered token and an EndSub marker
973
    addToken(currentToken);
974
 
975
    if (branches != null) {
976
        branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
977
        branches.trimToSize(); // compact the Vector
978
        minimumLength = 0;
979
        maximumLength = 0;
980
        firstToken = lastToken = null;
981
        addToken(new RETokenOneOf(subIndex,branches,false));
982
    }
983
    else addToken(new RETokenEndSub(subIndex));
984
 
985
  }
986
 
987
  private static class ParseCharClassResult {
988
      RETokenOneOf token;
989
      int index;
990
      boolean returnAtAndOperator = false;
991
  }
992
 
993
  /**
994
   * Parse [...] or [^...] and make an RETokenOneOf instance.
995
   * @param subIndex subIndex to be given to the created RETokenOneOf instance.
996
   * @param pattern Input array of characters to be parsed.
997
   * @param index Index pointing to the character next to the beginning '['.
998
   * @param pLength Limit of the input array.
999
   * @param cflags Compilation flags used to parse the pattern.
1000
   * @param pflags Flags that affect the behavior of this method.
1001
   * @param syntax Syntax used to parse the pattern.
1002
   */
1003
  private static ParseCharClassResult parseCharClass(int subIndex,
1004
                char[] pattern, int index,
1005
                int pLength, int cflags, RESyntax syntax, int pflags)
1006
                throws REException {
1007
 
1008
        boolean insens = ((cflags & REG_ICASE) > 0);
1009
        Vector options = new Vector();
1010
        Vector addition = new Vector();
1011
        boolean additionAndAppeared = false;
1012
        final int RETURN_AT_AND = 0x01;
1013
        boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1014
        boolean negative = false;
1015
        char ch;
1016
 
1017
        char lastChar = 0;
1018
        boolean lastCharIsSet = false;
1019
        if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
1020
 
1021
        // Check for initial caret, negation
1022
        if ((ch = pattern[index]) == '^') {
1023
          negative = true;
1024
          if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1025
          ch = pattern[index];
1026
        }
1027
 
1028
        // Check for leading right bracket literal
1029
        if (ch == ']') {
1030
          lastChar = ch; lastCharIsSet = true;
1031
          if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1032
        }
1033
 
1034
        while ((ch = pattern[index++]) != ']') {
1035
          if ((ch == '-') && (lastCharIsSet)) {
1036
            if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1037
            if ((ch = pattern[index]) == ']') {
1038
              options.addElement(new RETokenChar(subIndex,lastChar,insens));
1039
              lastChar = '-';
1040
            } else {
1041
              if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1042
                CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
1043
                if (ce == null)
1044
                  throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1045
                ch = ce.ch;
1046
                index = index + ce.len - 1;
1047
              }
1048
              options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
1049
              lastChar = 0; lastCharIsSet = false;
1050
              index++;
1051
            }
1052
          } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1053
            if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1054
            int posixID = -1;
1055
            boolean negate = false;
1056
            char asciiEsc = 0;
1057
            boolean asciiEscIsSet = false;
1058
            NamedProperty np = null;
1059
            if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
1060
              switch (pattern[index]) {
1061
              case 'D':
1062
                negate = true;
1063
              case 'd':
1064
                posixID = RETokenPOSIX.DIGIT;
1065
                break;
1066
              case 'S':
1067
                negate = true;
1068
              case 's':
1069
                posixID = RETokenPOSIX.SPACE;
1070
                break;
1071
              case 'W':
1072
                negate = true;
1073
              case 'w':
1074
                posixID = RETokenPOSIX.ALNUM;
1075
                break;
1076
              }
1077
            }
1078
            if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
1079
              np = getNamedProperty(pattern, index - 1, pLength);
1080
              if (np == null)
1081
                throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1082
              index = index - 1 + np.len - 1;
1083
            }
1084
            else {
1085
              CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
1086
              if (ce == null)
1087
                throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1088
              asciiEsc = ce.ch; asciiEscIsSet = true;
1089
              index = index - 1 + ce.len - 1;
1090
            }
1091
            if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1092
 
1093
            if (posixID != -1) {
1094
              options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
1095
            } else if (np != null) {
1096
              options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
1097
            } else if (asciiEscIsSet) {
1098
              lastChar = asciiEsc; lastCharIsSet = true;
1099
            } else {
1100
              lastChar = pattern[index]; lastCharIsSet = true;
1101
            }
1102
            ++index;
1103
          } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
1104
            StringBuffer posixSet = new StringBuffer();
1105
            index = getPosixSet(pattern,index+1,posixSet);
1106
            int posixId = RETokenPOSIX.intValue(posixSet.toString());
1107
            if (posixId != -1)
1108
              options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
1109
          } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
1110
                ParseCharClassResult result = parseCharClass(
1111
                    subIndex, pattern, index, pLength, cflags, syntax, 0);
1112
                addition.addElement(result.token);
1113
                addition.addElement("|");
1114
                index = result.index;
1115
          } else if ((ch == '&') &&
1116
                     (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
1117
                     (index < pLength) && (pattern[index] == '&')) {
1118
                if (returnAtAndOperator) {
1119
                    ParseCharClassResult result = new ParseCharClassResult();
1120
                    options.trimToSize();
1121
                    if (additionAndAppeared) addition.addElement("&");
1122
                    if (addition.size() == 0) addition = null;
1123
                    result.token = new RETokenOneOf(subIndex,
1124
                        options, addition, negative);
1125
                    result.index = index - 1;
1126
                    result.returnAtAndOperator = true;
1127
                    return result;
1128
                }
1129
                // The precedence of the operator "&&" is the lowest.
1130
                // So we postpone adding "&" until other elements
1131
                // are added. And we insert Boolean.FALSE at the
1132
                // beginning of the list of tokens following "&&".
1133
                // So, "&&[a-b][k-m]" will be stored in the Vecter
1134
                // addition in this order:
1135
                //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1136
                if (additionAndAppeared) addition.addElement("&");
1137
                addition.addElement(Boolean.FALSE);
1138
                additionAndAppeared = true;
1139
 
1140
                // The part on which "&&" operates may be either
1141
                //   (1) explicitly enclosed by []
1142
                //   or
1143
                //   (2) not enclosed by [] and terminated by the
1144
                //       next "&&" or the end of the character list.
1145
                //  Let the preceding else if block do the case (1).
1146
                //  We must do something in case of (2).
1147
                if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
1148
                    ParseCharClassResult result = parseCharClass(
1149
                        subIndex, pattern, index+1, pLength, cflags, syntax,
1150
                        RETURN_AT_AND);
1151
                    addition.addElement(result.token);
1152
                    addition.addElement("|");
1153
                    // If the method returned at the next "&&", it is OK.
1154
                    // Otherwise we have eaten the mark of the end of this
1155
                    // character list "]".  In this case we must give back
1156
                    // the end mark.
1157
                    index = (result.returnAtAndOperator ?
1158
                        result.index: result.index - 1);
1159
                }
1160
          } else {
1161
            if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1162
            lastChar = ch; lastCharIsSet = true;
1163
          }
1164
          if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1165
        } // while in list
1166
        // Out of list, index is one past ']'
1167
 
1168
        if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1169
 
1170
        ParseCharClassResult result = new ParseCharClassResult();
1171
        // Create a new RETokenOneOf
1172
        options.trimToSize();
1173
        if (additionAndAppeared) addition.addElement("&");
1174
        if (addition.size() == 0) addition = null;
1175
        result.token = new RETokenOneOf(subIndex,options, addition, negative);
1176
        result.index = index;
1177
        return result;
1178
  }
1179
 
1180
  private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
1181
    unit.ch = input[index++];
1182
    unit.bk = (unit.ch == '\\'
1183
               && (!quot || index >= input.length || input[index] == 'E'));
1184
    if (unit.bk)
1185
      if (index < input.length)
1186
        unit.ch = input[index++];
1187
      else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
1188
    return index;
1189
  }
1190
 
1191
  private static int parseInt(char[] input, int pos, int len, int radix) {
1192
    int ret = 0;
1193
    for (int i = pos; i < pos + len; i++) {
1194
        ret = ret * radix + Character.digit(input[i], radix);
1195
    }
1196
    return ret;
1197
  }
1198
 
1199
  /**
1200
   * This class represents various expressions for a character.
1201
   * "a"      : 'a' itself.
1202
   * "\0123"  : Octal char 0123
1203
   * "\x1b"   : Hex char 0x1b
1204
   * "\u1234" : Unicode char \u1234
1205
   */
1206
  private static class CharExpression {
1207
    /** character represented by this expression */
1208
    char ch;
1209
    /** String expression */
1210
    String expr;
1211
    /** length of this expression */
1212
    int len;
1213
    public String toString() { return expr; }
1214
  }
1215
 
1216
  private static CharExpression getCharExpression(char[] input, int pos, int lim,
1217
        RESyntax syntax) {
1218
    CharExpression ce = new CharExpression();
1219
    char c = input[pos];
1220
    if (c == '\\') {
1221
      if (pos + 1 >= lim) return null;
1222
      c = input[pos + 1];
1223
      switch(c) {
1224
      case 't':
1225
        ce.ch = '\t';
1226
        ce.len = 2;
1227
        break;
1228
      case 'n':
1229
        ce.ch = '\n';
1230
        ce.len = 2;
1231
        break;
1232
      case 'r':
1233
        ce.ch = '\r';
1234
        ce.len = 2;
1235
        break;
1236
      case 'x':
1237
      case 'u':
1238
        if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1239
            (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1240
          int l = 0;
1241
          int expectedLength = (c == 'x' ? 2 : 4);
1242
          for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
1243
            if (i >= lim) break;
1244
            if (!((input[i] >= '0' && input[i] <= '9') ||
1245
                  (input[i] >= 'A' && input[i] <= 'F') ||
1246
                  (input[i] >= 'a' && input[i] <= 'f')))
1247
                break;
1248
            l++;
1249
          }
1250
          if (l != expectedLength) return null;
1251
          ce.ch = (char)(parseInt(input, pos + 2, l, 16));
1252
          ce.len = l + 2;
1253
        }
1254
        else {
1255
          ce.ch = c;
1256
          ce.len = 2;
1257
        }
1258
        break;
1259
      case '0':
1260
        if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
1261
          int l = 0;
1262
          for (int i = pos + 2; i < pos + 2 + 3; i++) {
1263
            if (i >= lim) break;
1264
            if (input[i] < '0' || input[i] > '7') break;
1265
            l++;
1266
          }
1267
          if (l == 3 && input[pos + 2] > '3') l--;
1268
          if (l <= 0) return null;
1269
          ce.ch = (char)(parseInt(input, pos + 2, l, 8));
1270
          ce.len = l + 2;
1271
        }
1272
        else {
1273
          ce.ch = c;
1274
          ce.len = 2;
1275
        }
1276
        break;
1277
      default:
1278
        ce.ch = c;
1279
        ce.len = 2;
1280
        break;
1281
      }
1282
    }
1283
    else {
1284
      ce.ch = input[pos];
1285
      ce.len = 1;
1286
    }
1287
    ce.expr = new String(input, pos, ce.len);
1288
    return ce;
1289
  }
1290
 
1291
  /**
1292
   * This class represents a substring in a pattern string expressing
1293
   * a named property.
1294
   * "\pA"      : Property named "A"
1295
   * "\p{prop}" : Property named "prop"
1296
   * "\PA"      : Property named "A" (Negated)
1297
   * "\P{prop}" : Property named "prop" (Negated)
1298
   */
1299
  private static class NamedProperty {
1300
    /** Property name */
1301
    String name;
1302
    /** Negated or not */
1303
    boolean negate;
1304
    /** length of this expression */
1305
    int len;
1306
  }
1307
 
1308
  private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
1309
    NamedProperty np = new NamedProperty();
1310
    char c = input[pos];
1311
    if (c == '\\') {
1312
      if (++pos >= lim) return null;
1313
      c = input[pos++];
1314
      switch(c) {
1315
      case 'p':
1316
        np.negate = false;
1317
        break;
1318
      case 'P':
1319
        np.negate = true;
1320
        break;
1321
      default:
1322
        return null;
1323
      }
1324
      c = input[pos++];
1325
      if (c == '{') {
1326
          int p = -1;
1327
          for (int i = pos; i < lim; i++) {
1328
              if (input[i] == '}') {
1329
                  p = i;
1330
                  break;
1331
              }
1332
          }
1333
          if (p < 0) return null;
1334
          int len = p - pos;
1335
          np.name = new String(input, pos, len);
1336
          np.len = len + 4;
1337
      }
1338
      else {
1339
          np.name = new String(input, pos - 1, 1);
1340
          np.len = 3;
1341
      }
1342
      return np;
1343
    }
1344
    else return null;
1345
  }
1346
 
1347
  private static RETokenNamedProperty getRETokenNamedProperty(
1348
      int subIndex, NamedProperty np, boolean insens, int index)
1349
      throws REException {
1350
    try {
1351
        return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
1352
    }
1353
    catch (REException e) {
1354
        REException ree;
1355
        ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
1356
        ree.initCause(e);
1357
        throw ree;
1358
    }
1359
  }
1360
 
1361
  /**
1362
   * Checks if the regular expression matches the input in its entirety.
1363
   *
1364
   * @param input The input text.
1365
   */
1366
  public boolean isMatch(Object input) {
1367
    return isMatch(input,0,0);
1368
  }
1369
 
1370
  /**
1371
   * Checks if the input string, starting from index, is an exact match of
1372
   * this regular expression.
1373
   *
1374
   * @param input The input text.
1375
   * @param index The offset index at which the search should be begin.
1376
   */
1377
  public boolean isMatch(Object input,int index) {
1378
    return isMatch(input,index,0);
1379
  }
1380
 
1381
 
1382
  /**
1383
   * Checks if the input, starting from index and using the specified
1384
   * execution flags, is an exact match of this regular expression.
1385
   *
1386
   * @param input The input text.
1387
   * @param index The offset index at which the search should be begin.
1388
   * @param eflags The logical OR of any execution flags above.
1389
   */
1390
  public boolean isMatch(Object input,int index,int eflags) {
1391
    return isMatchImpl(makeCharIndexed(input,index),index,eflags);
1392
  }
1393
 
1394
  private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
1395
    if (firstToken == null)  // Trivial case
1396
      return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
1397
    REMatch m = new REMatch(numSubs, index, eflags);
1398
    if (firstToken.match(input, m)) {
1399
        while (m != null) {
1400
            if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
1401
                return true;
1402
            }
1403
            m = m.next;
1404
        }
1405
    }
1406
    return false;
1407
  }
1408
 
1409
  /**
1410
   * Returns the maximum number of subexpressions in this regular expression.
1411
   * If the expression contains branches, the value returned will be the
1412
   * maximum subexpressions in any of the branches.
1413
   */
1414
  public int getNumSubs() {
1415
    return numSubs;
1416
  }
1417
 
1418
  // Overrides REToken.setUncle
1419
  void setUncle(REToken uncle) {
1420
      if (lastToken != null) {
1421
          lastToken.setUncle(uncle);
1422
      } else super.setUncle(uncle); // to deal with empty subexpressions
1423
  }
1424
 
1425
  // Overrides REToken.chain
1426
 
1427
  boolean chain(REToken next) {
1428
    super.chain(next);
1429
    setUncle(next);
1430
    return true;
1431
  }
1432
 
1433
  /**
1434
   * Returns the minimum number of characters that could possibly
1435
   * constitute a match of this regular expression.
1436
   */
1437
  public int getMinimumLength() {
1438
      return minimumLength;
1439
  }
1440
 
1441
  public int getMaximumLength() {
1442
      return maximumLength;
1443
  }
1444
 
1445
  /**
1446
   * Returns an array of all matches found in the input.
1447
   *
1448
   * If the regular expression allows the empty string to match, it will
1449
   * substitute matches at all positions except the end of the input.
1450
   *
1451
   * @param input The input text.
1452
   * @return a non-null (but possibly zero-length) array of matches
1453
   */
1454
  public REMatch[] getAllMatches(Object input) {
1455
    return getAllMatches(input,0,0);
1456
  }
1457
 
1458
  /**
1459
   * Returns an array of all matches found in the input,
1460
   * beginning at the specified index position.
1461
   *
1462
   * If the regular expression allows the empty string to match, it will
1463
   * substitute matches at all positions except the end of the input.
1464
   *
1465
   * @param input The input text.
1466
   * @param index The offset index at which the search should be begin.
1467
   * @return a non-null (but possibly zero-length) array of matches
1468
   */
1469
  public REMatch[] getAllMatches(Object input, int index) {
1470
    return getAllMatches(input,index,0);
1471
  }
1472
 
1473
  /**
1474
   * Returns an array of all matches found in the input string,
1475
   * beginning at the specified index position and using the specified
1476
   * execution flags.
1477
   *
1478
   * If the regular expression allows the empty string to match, it will
1479
   * substitute matches at all positions except the end of the input.
1480
   *
1481
   * @param input The input text.
1482
   * @param index The offset index at which the search should be begin.
1483
   * @param eflags The logical OR of any execution flags above.
1484
   * @return a non-null (but possibly zero-length) array of matches
1485
   */
1486
  public REMatch[] getAllMatches(Object input, int index, int eflags) {
1487
    return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
1488
  }
1489
 
1490
  // this has been changed since 1.03 to be non-overlapping matches
1491
  private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
1492
    Vector all = new Vector();
1493
    REMatch m = null;
1494
    while ((m = getMatchImpl(input,index,eflags,null)) != null) {
1495
      all.addElement(m);
1496
      index = m.getEndIndex();
1497
      if (m.end[0] == 0) {   // handle pathological case of zero-length match
1498
        index++;
1499
        input.move(1);
1500
      } else {
1501
        input.move(m.end[0]);
1502
      }
1503
      if (!input.isValid()) break;
1504
    }
1505
    REMatch[] mset = new REMatch[all.size()];
1506
    all.copyInto(mset);
1507
    return mset;
1508
  }
1509
 
1510
    /* Implements abstract method REToken.match() */
1511
    boolean match(CharIndexed input, REMatch mymatch) {
1512
        if (firstToken == null) {
1513
            return next(input, mymatch);
1514
        }
1515
 
1516
        // Note the start of this subexpression
1517
        mymatch.start[subIndex] = mymatch.index;
1518
 
1519
        return firstToken.match(input, mymatch);
1520
    }
1521
 
1522
  /**
1523
   * Returns the first match found in the input.  If no match is found,
1524
   * null is returned.
1525
   *
1526
   * @param input The input text.
1527
   * @return An REMatch instance referencing the match, or null if none.
1528
   */
1529
  public REMatch getMatch(Object input) {
1530
    return getMatch(input,0,0);
1531
  }
1532
 
1533
  /**
1534
   * Returns the first match found in the input, beginning
1535
   * the search at the specified index.  If no match is found,
1536
   * returns null.
1537
   *
1538
   * @param input The input text.
1539
   * @param index The offset within the text to begin looking for a match.
1540
   * @return An REMatch instance referencing the match, or null if none.
1541
   */
1542
  public REMatch getMatch(Object input, int index) {
1543
    return getMatch(input,index,0);
1544
  }
1545
 
1546
  /**
1547
   * Returns the first match found in the input, beginning
1548
   * the search at the specified index, and using the specified
1549
   * execution flags.  If no match is found, returns null.
1550
   *
1551
   * @param input The input text.
1552
   * @param index The offset index at which the search should be begin.
1553
   * @param eflags The logical OR of any execution flags above.
1554
   * @return An REMatch instance referencing the match, or null if none.
1555
   */
1556
  public REMatch getMatch(Object input, int index, int eflags) {
1557
    return getMatch(input,index,eflags,null);
1558
  }
1559
 
1560
  /**
1561
   * Returns the first match found in the input, beginning the search
1562
   * at the specified index, and using the specified execution flags.
1563
   * If no match is found, returns null.  If a StringBuffer is
1564
   * provided and is non-null, the contents of the input text from the
1565
   * index to the beginning of the match (or to the end of the input,
1566
   * if there is no match) are appended to the StringBuffer.
1567
   *
1568
   * @param input The input text.
1569
   * @param index The offset index at which the search should be begin.
1570
   * @param eflags The logical OR of any execution flags above.
1571
   * @param buffer The StringBuffer to save pre-match text in.
1572
   * @return An REMatch instance referencing the match, or null if none.  */
1573
  public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1574
    return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1575
  }
1576
 
1577
  REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1578
      boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
1579
      RE re = (tryEntireMatch ? (RE) this.clone() : this);
1580
      if (tryEntireMatch) {
1581
          re.chain(new RETokenEnd(0, null));
1582
      }
1583
      // Create a new REMatch to hold results
1584
      REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1585
      do {
1586
          // Optimization: check if anchor + minimumLength > length
1587
          if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1588
              if (re.match(input, mymatch)) {
1589
                  REMatch best = mymatch;
1590
                  // We assume that the match that coms first is the best.
1591
                  // And the following "The longer, the better" rule has
1592
                  // been commented out. The longest is not neccesarily
1593
                  // the best. For example, "a" out of "aaa" is the best
1594
                  // match for /a+?/.
1595
                  /*
1596
                  // Find best match of them all to observe leftmost longest
1597
                  while ((mymatch = mymatch.next) != null) {
1598
                      if (mymatch.index > best.index) {
1599
                        best = mymatch;
1600
                      }
1601
                  }
1602
                  */
1603
                  best.end[0] = best.index;
1604
                  best.finish(input);
1605
                  return best;
1606
              }
1607
          }
1608
          mymatch.clear(++anchor);
1609
          // Append character to buffer if needed
1610
          if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1611
              buffer.append(input.charAt(0));
1612
          }
1613
      } while (input.move(1));
1614
 
1615
      // Special handling at end of input for e.g. "$"
1616
      if (minimumLength == 0) {
1617
          if (match(input, mymatch)) {
1618
              mymatch.finish(input);
1619
              return mymatch;
1620
          }
1621
      }
1622
 
1623
      return null;
1624
  }
1625
 
1626
  /**
1627
   * Returns an REMatchEnumeration that can be used to iterate over the
1628
   * matches found in the input text.
1629
   *
1630
   * @param input The input text.
1631
   * @return A non-null REMatchEnumeration instance.
1632
   */
1633
  public REMatchEnumeration getMatchEnumeration(Object input) {
1634
    return getMatchEnumeration(input,0,0);
1635
  }
1636
 
1637
 
1638
  /**
1639
   * Returns an REMatchEnumeration that can be used to iterate over the
1640
   * matches found in the input text.
1641
   *
1642
   * @param input The input text.
1643
   * @param index The offset index at which the search should be begin.
1644
   * @return A non-null REMatchEnumeration instance, with its input cursor
1645
   *  set to the index position specified.
1646
   */
1647
  public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1648
    return getMatchEnumeration(input,index,0);
1649
  }
1650
 
1651
  /**
1652
   * Returns an REMatchEnumeration that can be used to iterate over the
1653
   * matches found in the input text.
1654
   *
1655
   * @param input The input text.
1656
   * @param index The offset index at which the search should be begin.
1657
   * @param eflags The logical OR of any execution flags above.
1658
   * @return A non-null REMatchEnumeration instance, with its input cursor
1659
   *  set to the index position specified.
1660
   */
1661
  public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1662
    return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1663
  }
1664
 
1665
 
1666
  /**
1667
   * Substitutes the replacement text for the first match found in the input.
1668
   *
1669
   * @param input The input text.
1670
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1671
   * @return A String interpolating the substituted text.
1672
   * @see REMatch#substituteInto
1673
   */
1674
  public String substitute(Object input,String replace) {
1675
    return substitute(input,replace,0,0);
1676
  }
1677
 
1678
  /**
1679
   * Substitutes the replacement text for the first match found in the input
1680
   * beginning at the specified index position.  Specifying an index
1681
   * effectively causes the regular expression engine to throw away the
1682
   * specified number of characters.
1683
   *
1684
   * @param input The input text.
1685
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1686
   * @param index The offset index at which the search should be begin.
1687
   * @return A String containing the substring of the input, starting
1688
   *   at the index position, and interpolating the substituted text.
1689
   * @see REMatch#substituteInto
1690
   */
1691
  public String substitute(Object input,String replace,int index) {
1692
    return substitute(input,replace,index,0);
1693
  }
1694
 
1695
  /**
1696
   * Substitutes the replacement text for the first match found in the input
1697
   * string, beginning at the specified index position and using the
1698
   * specified execution flags.
1699
   *
1700
   * @param input The input text.
1701
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1702
   * @param index The offset index at which the search should be begin.
1703
   * @param eflags The logical OR of any execution flags above.
1704
   * @return A String containing the substring of the input, starting
1705
   *   at the index position, and interpolating the substituted text.
1706
   * @see REMatch#substituteInto
1707
   */
1708
  public String substitute(Object input,String replace,int index,int eflags) {
1709
    return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1710
  }
1711
 
1712
  private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1713
    StringBuffer buffer = new StringBuffer();
1714
    REMatch m = getMatchImpl(input,index,eflags,buffer);
1715
    if (m==null) return buffer.toString();
1716
    buffer.append(getReplacement(replace, m, eflags));
1717
    if (input.move(m.end[0])) {
1718
      do {
1719
        buffer.append(input.charAt(0));
1720
      } while (input.move(1));
1721
    }
1722
    return buffer.toString();
1723
  }
1724
 
1725
  /**
1726
   * Substitutes the replacement text for each non-overlapping match found
1727
   * in the input text.
1728
   *
1729
   * @param input The input text.
1730
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1731
   * @return A String interpolating the substituted text.
1732
   * @see REMatch#substituteInto
1733
   */
1734
  public String substituteAll(Object input,String replace) {
1735
    return substituteAll(input,replace,0,0);
1736
  }
1737
 
1738
  /**
1739
   * Substitutes the replacement text for each non-overlapping match found
1740
   * in the input text, starting at the specified index.
1741
   *
1742
   * If the regular expression allows the empty string to match, it will
1743
   * substitute matches at all positions except the end of the input.
1744
   *
1745
   * @param input The input text.
1746
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1747
   * @param index The offset index at which the search should be begin.
1748
   * @return A String containing the substring of the input, starting
1749
   *   at the index position, and interpolating the substituted text.
1750
   * @see REMatch#substituteInto
1751
   */
1752
  public String substituteAll(Object input,String replace,int index) {
1753
    return substituteAll(input,replace,index,0);
1754
  }
1755
 
1756
  /**
1757
   * Substitutes the replacement text for each non-overlapping match found
1758
   * in the input text, starting at the specified index and using the
1759
   * specified execution flags.
1760
   *
1761
   * @param input The input text.
1762
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1763
   * @param index The offset index at which the search should be begin.
1764
   * @param eflags The logical OR of any execution flags above.
1765
   * @return A String containing the substring of the input, starting
1766
   *   at the index position, and interpolating the substituted text.
1767
   * @see REMatch#substituteInto
1768
   */
1769
  public String substituteAll(Object input,String replace,int index,int eflags) {
1770
    return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1771
  }
1772
 
1773
  private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1774
    StringBuffer buffer = new StringBuffer();
1775
    REMatch m;
1776
    while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1777
      buffer.append(getReplacement(replace, m, eflags));
1778
      index = m.getEndIndex();
1779
      if (m.end[0] == 0) {
1780
        char ch = input.charAt(0);
1781
        if (ch != CharIndexed.OUT_OF_BOUNDS)
1782
            buffer.append(ch);
1783
        input.move(1);
1784
      } else {
1785
          input.move(m.end[0]);
1786
      }
1787
 
1788
      if (!input.isValid()) break;
1789
    }
1790
    return buffer.toString();
1791
  }
1792
 
1793
  public static String getReplacement(String replace, REMatch m, int eflags) {
1794
    if ((eflags & REG_NO_INTERPOLATE) > 0)
1795
      return replace;
1796
    else {
1797
      if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
1798
        StringBuffer sb = new StringBuffer();
1799
        int l = replace.length();
1800
        for (int i = 0; i < l; i++) {
1801
            char c = replace.charAt(i);
1802
            switch(c) {
1803
            case '\\':
1804
              i++;
1805
              // Let StringIndexOutOfBoundsException be thrown.
1806
              sb.append(replace.charAt(i));
1807
              break;
1808
            case '$':
1809
              int i1 = i + 1;
1810
              while (i1 < replace.length() &&
1811
                Character.isDigit(replace.charAt(i1))) i1++;
1812
              sb.append(m.substituteInto(replace.substring(i, i1)));
1813
              i = i1 - 1;
1814
              break;
1815
            default:
1816
              sb.append(c);
1817
            }
1818
        }
1819
        return sb.toString();
1820
      }
1821
      else
1822
        return m.substituteInto(replace);
1823
    }
1824
  }
1825
 
1826
  /* Helper function for constructor */
1827
  private void addToken(REToken next) {
1828
    if (next == null) return;
1829
    minimumLength += next.getMinimumLength();
1830
    int nmax = next.getMaximumLength();
1831
    if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
1832
        maximumLength += nmax;
1833
    else
1834
        maximumLength = Integer.MAX_VALUE;
1835
 
1836
    if (firstToken == null) {
1837
        lastToken = firstToken = next;
1838
    } else {
1839
      // if chain returns false, it "rejected" the token due to
1840
      // an optimization, and next was combined with lastToken
1841
      if (lastToken.chain(next)) {
1842
          lastToken = next;
1843
      }
1844
    }
1845
  }
1846
 
1847
  private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1848
    if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1849
    return new RETokenRepeated(current.subIndex,current,min,max);
1850
  }
1851
 
1852
  private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1853
    // Precondition: pattern[index-1] == ':'
1854
    // we will return pos of closing ']'.
1855
    int i;
1856
    for (i=index; i<(pattern.length-1); i++) {
1857
      if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1858
        return i+2;
1859
      buf.append(pattern[i]);
1860
    }
1861
    return index; // didn't match up
1862
  }
1863
 
1864
  private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1865
    // Precondition: input[index-1] == '{', minMax != null
1866
 
1867
    boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1868
    int startIndex = index;
1869
    if (index == input.length) {
1870
      if (mustMatch)
1871
        throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
1872
      else
1873
        return startIndex;
1874
    }
1875
 
1876
    int min,max=0;
1877
    CharUnit unit = new CharUnit();
1878
    StringBuffer buf = new StringBuffer();
1879
 
1880
    // Read string of digits
1881
    do {
1882
      index = getCharUnit(input,index,unit,false);
1883
      if (Character.isDigit(unit.ch))
1884
        buf.append(unit.ch);
1885
    } while ((index != input.length) && Character.isDigit(unit.ch));
1886
 
1887
    // Check for {} tomfoolery
1888
    if (buf.length() == 0) {
1889
      if (mustMatch)
1890
        throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1891
      else
1892
        return startIndex;
1893
    }
1894
 
1895
    min = Integer.parseInt(buf.toString());
1896
 
1897
    if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1898
      max = min;
1899
    else if (index == input.length)
1900
      if (mustMatch)
1901
        throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
1902
      else
1903
        return startIndex;
1904
    else if ((unit.ch == ',') && !unit.bk) {
1905
      buf = new StringBuffer();
1906
      // Read string of digits
1907
      while (((index = getCharUnit(input,index,unit,false)) != input.length) && Character.isDigit(unit.ch))
1908
        buf.append(unit.ch);
1909
 
1910
      if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1911
        if (mustMatch)
1912
          throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1913
        else
1914
          return startIndex;
1915
 
1916
      // This is the case of {x,}
1917
      if (buf.length() == 0) max = Integer.MAX_VALUE;
1918
      else max = Integer.parseInt(buf.toString());
1919
    } else
1920
      if (mustMatch)
1921
        throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1922
      else
1923
        return startIndex;
1924
 
1925
    // We know min and max now, and they are valid.
1926
 
1927
    minMax.first = min;
1928
    minMax.second = max;
1929
 
1930
    // return the index following the '}'
1931
    return index;
1932
  }
1933
 
1934
   /**
1935
    * Return a human readable form of the compiled regular expression,
1936
    * useful for debugging.
1937
    */
1938
   public String toString() {
1939
     StringBuffer sb = new StringBuffer();
1940
     dump(sb);
1941
     return sb.toString();
1942
   }
1943
 
1944
  void dump(StringBuffer os) {
1945
    os.append('(');
1946
    if (subIndex == 0)
1947
      os.append("?:");
1948
    if (firstToken != null)
1949
      firstToken.dumpAll(os);
1950
    os.append(')');
1951
  }
1952
 
1953
  // Cast input appropriately or throw exception
1954
  private static CharIndexed makeCharIndexed(Object input, int index) {
1955
      // We could let a String fall through to final input, but since
1956
      // it's the most likely input type, we check it first.
1957
    if (input instanceof String)
1958
      return new CharIndexedString((String) input,index);
1959
    else if (input instanceof char[])
1960
      return new CharIndexedCharArray((char[]) input,index);
1961
    else if (input instanceof StringBuffer)
1962
      return new CharIndexedStringBuffer((StringBuffer) input,index);
1963
    else if (input instanceof InputStream)
1964
      return new CharIndexedInputStream((InputStream) input,index);
1965
    else if (input instanceof CharIndexed)
1966
        return (CharIndexed) input; // do we lose index info?
1967
    else
1968
        return new CharIndexedString(input.toString(), index);
1969
  }
1970
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.