OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libjava/] [classpath/] [gnu/] [java/] [util/] [regex/] [RE.java] - Blame information for rev 791

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 769 jeremybenn
/* gnu/regexp/RE.java
2
   Copyright (C) 2006 Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version. */
37
 
38
package gnu.java.util.regex;
39
 
40
import gnu.java.lang.CPStringBuilder;
41
 
42
import java.io.InputStream;
43
import java.io.Serializable;
44
 
45
import java.util.ArrayList;
46
import java.util.List;
47
import java.util.Locale;
48
import java.util.PropertyResourceBundle;
49
import java.util.ResourceBundle;
50
 
51
/**
52
 * RE provides the user interface for compiling and matching regular
53
 * expressions.
54
 * <P>
55
 * A regular expression object (class RE) is compiled by constructing it
56
 * from a String, StringBuffer or character array, with optional
57
 * compilation flags (below)
58
 * and an optional syntax specification (see RESyntax; if not specified,
59
 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
60
 * <P>
61
 * Once compiled, a regular expression object is reusable as well as
62
 * threadsafe: multiple threads can use the RE instance simultaneously
63
 * to match against different input text.
64
 * <P>
65
 * Various methods attempt to match input text against a compiled
66
 * regular expression.  These methods are:
67
 * <LI><code>isMatch</code>: returns true if the input text in its
68
 * entirety matches the regular expression pattern.
69
 * <LI><code>getMatch</code>: returns the first match found in the
70
 * input text, or null if no match is found.
71
 * <LI><code>getAllMatches</code>: returns an array of all
72
 * non-overlapping matches found in the input text.  If no matches are
73
 * found, the array is zero-length.
74
 * <LI><code>substitute</code>: substitute the first occurrence of the
75
 * pattern in the input text with a replacement string (which may
76
 * include metacharacters $0-$9, see REMatch.substituteInto).
77
 * <LI><code>substituteAll</code>: same as above, but repeat for each
78
 * match before returning.
79
 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
80
 * object that allows iteration over the matches (see
81
 * REMatchEnumeration for some reasons why you may want to do this
82
 * instead of using <code>getAllMatches</code>).
83
 * <P>
84
 *
85
 * These methods all have similar argument lists.  The input can be a
86
 * CharIndexed, String, a character array, a StringBuffer, or an
87
 * InputStream of some sort.  Note that when using an
88
 * InputStream, the stream read position cannot be guaranteed after
89
 * attempting a match (this is not a bug, but a consequence of the way
90
 * regular expressions work).  Using an REMatchEnumeration can
91
 * eliminate most positioning problems.
92
 *
93
 * Although the input object can be of various types, it is recommended
94
 * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
95
 * can show the last match found on this input, which helps the expression
96
 * \G work as the end of the previous match.
97
 *
98
 * <P>
99
 *
100
 * The optional index argument specifies the offset from the beginning
101
 * of the text at which the search should start (see the descriptions
102
 * of some of the execution flags for how this can affect positional
103
 * pattern operators).  For an InputStream, this means an
104
 * offset from the current read position, so subsequent calls with the
105
 * same index argument on an InputStream will not
106
 * necessarily access the same position on the stream, whereas
107
 * repeated searches at a given index in a fixed string will return
108
 * consistent results.
109
 *
110
 * <P>
111
 * You can optionally affect the execution environment by using a
112
 * combination of execution flags (constants listed below).
113
 *
114
 * <P>
115
 * All operations on a regular expression are performed in a
116
 * thread-safe manner.
117
 *
118
 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
119
 * @version 1.1.5-dev, to be released
120
 */
121
 
122
public class RE extends REToken
123
{
124
 
125
  private static final class IntPair implements Serializable
126
  {
127
    public int first, second;
128
  }
129
 
130
  private static final class CharUnit implements Serializable
131
  {
132
    public char ch;
133
    public boolean bk;
134
  }
135
 
136
  // This String will be returned by getVersion()
137
  private static final String VERSION = "1.1.5-dev";
138
 
139
  // The localized strings are kept in a separate file
140
  // Used by getLocalizedMessage().
141
  private static ResourceBundle messages;
142
 
143
  // Name of the bundle that contains the localized messages.
144
  private static final String bundle = "gnu/java/util/regex/MessagesBundle";
145
 
146
  // These are, respectively, the first and last tokens in our linked list
147
  // If there is only one token, firstToken == lastToken
148
  private REToken firstToken, lastToken;
149
 
150
  // This is the number of subexpressions in this regular expression,
151
  // with a minimum value of zero.  Returned by getNumSubs()
152
  private int numSubs;
153
 
154
    /** Minimum length, in characters, of any possible match. */
155
  private int minimumLength;
156
  private int maximumLength;
157
 
158
  /**
159
   * Compilation flag. Do  not  differentiate  case.   Subsequent
160
   * searches  using  this  RE will be case insensitive.
161
   */
162
  public static final int REG_ICASE = 0x02;
163
 
164
  /**
165
   * Compilation flag. The match-any-character operator (dot)
166
   * will match a newline character.  When set this overrides the syntax
167
   * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
168
   * the "/s" operator in Perl.
169
   */
170
  public static final int REG_DOT_NEWLINE = 0x04;
171
 
172
  /**
173
   * Compilation flag. Use multiline mode.  In this mode, the ^ and $
174
   * anchors will match based on newlines within the input. This is
175
   * equivalent to the "/m" operator in Perl.
176
   */
177
  public static final int REG_MULTILINE = 0x08;
178
 
179
  /**
180
   * Execution flag.
181
   * The match-beginning operator (^) will not match at the beginning
182
   * of the input string. Useful for matching on a substring when you
183
   * know the context of the input is such that position zero of the
184
   * input to the match test is not actually position zero of the text.
185
   * <P>
186
   * This example demonstrates the results of various ways of matching on
187
   * a substring.
188
   * <P>
189
   * <CODE>
190
   * String s = "food bar fool";<BR>
191
   * RE exp = new RE("^foo.");<BR>
192
   * REMatch m0 = exp.getMatch(s);<BR>
193
   * REMatch m1 = exp.getMatch(s.substring(8));<BR>
194
   * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
195
   * REMatch m3 = exp.getMatch(s,8);                            <BR>
196
   * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
197
   * <P>
198
   * // Results:<BR>
199
   * //  m0.toString(): "food"<BR>
200
   * //  m1.toString(): "fool"<BR>
201
   * //  m2.toString(): null<BR>
202
   * //  m3.toString(): null<BR>
203
   * //  m4.toString(): "fool"<BR>
204
   * </CODE>
205
   */
206
  public static final int REG_NOTBOL = 0x10;
207
 
208
  /**
209
   * Execution flag.
210
   * The match-end operator ($) does not match at the end
211
   * of the input string. Useful for matching on substrings.
212
   */
213
  public static final int REG_NOTEOL = 0x20;
214
 
215
  /**
216
   * Execution flag.
217
   * When a match method is invoked that starts matching at a non-zero
218
   * index into the input, treat the input as if it begins at the index
219
   * given.  The effect of this flag is that the engine does not "see"
220
   * any text in the input before the given index.  This is useful so
221
   * that the match-beginning operator (^) matches not at position 0
222
   * in the input string, but at the position the search started at
223
   * (based on the index input given to the getMatch function).  See
224
   * the example under REG_NOTBOL.  It also affects the use of the \&lt;
225
   * and \b operators.
226
   */
227
  public static final int REG_ANCHORINDEX = 0x40;
228
 
229
  /**
230
   * Execution flag.
231
   * The substitute and substituteAll methods will not attempt to
232
   * interpolate occurrences of $1-$9 in the replacement text with
233
   * the corresponding subexpressions.  For example, you may want to
234
   * replace all matches of "one dollar" with "$1".
235
   */
236
  public static final int REG_NO_INTERPOLATE = 0x80;
237
 
238
  /**
239
   * Execution flag.
240
   * Try to match the whole input string. An implicit match-end operator
241
   * is added to this regexp.
242
   */
243
  public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
244
 
245
  /**
246
   * Execution flag.
247
   * The substitute and substituteAll methods will treat the
248
   * character '\' in the replacement as an escape to a literal
249
   * character. In this case "\n", "\$", "\\", "\x40" and "\012"
250
   * will become "n", "$", "\", "x40" and "012" respectively.
251
   * This flag has no effect if REG_NO_INTERPOLATE is set on.
252
   */
253
  public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
254
 
255
  /**
256
   * Compilation flag. Allow whitespace and comments in pattern.
257
   * This is equivalent to the "/x" operator in Perl.
258
   */
259
  public static final int REG_X_COMMENTS = 0x0400;
260
 
261
  /**
262
   * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
263
   */
264
  public static final int REG_ICASE_USASCII = 0x0800;
265
 
266
  /**
267
   * Execution flag.
268
   * Do not move the position at which the search begins.  If not set,
269
   * the starting position will be moved until a match is found.
270
   */
271
  public static final int REG_FIX_STARTING_POSITION = 0x1000;
272
 
273
  /** Returns a string representing the version of the gnu.regexp package. */
274
  public static final String version ()
275
  {
276
    return VERSION;
277
  }
278
 
279
  // Retrieves a message from the ResourceBundle
280
  static final String getLocalizedMessage (String key)
281
  {
282
    if (messages == null)
283
      messages =
284
        PropertyResourceBundle.getBundle (bundle, Locale.getDefault ());
285
    return messages.getString (key);
286
  }
287
 
288
  /**
289
   * Constructs a regular expression pattern buffer without any compilation
290
   * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
291
   *
292
   * @param pattern A regular expression pattern, in the form of a String,
293
   *   StringBuffer or char[].  Other input types will be converted to
294
   *   strings using the toString() method.
295
   * @exception REException The input pattern could not be parsed.
296
   * @exception NullPointerException The pattern was null.
297
   */
298
  public RE (Object pattern)
    throws REException
  {
    // No compilation flags, default Perl 5 syntax.  The three-argument
    // constructor forwards these to the internal one with 0, 0 appended.
    this (pattern, 0, RESyntax.RE_SYNTAX_PERL5);
  }
302
 
303
  /**
304
   * Constructs a regular expression pattern buffer using the specified
305
   * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
306
   *
307
   * @param pattern A regular expression pattern, in the form of a String,
308
   *   StringBuffer, or char[].  Other input types will be converted to
309
   *   strings using the toString() method.
310
   * @param cflags The logical OR of any combination of the compilation flags listed above.
311
   * @exception REException The input pattern could not be parsed.
312
   * @exception NullPointerException The pattern was null.
313
   */
314
  public RE (Object pattern, int cflags)
    throws REException
  {
    // Caller-supplied flags, default Perl 5 syntax.  The three-argument
    // constructor forwards these to the internal one with 0, 0 appended.
    this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5);
  }
318
 
319
  /**
320
   * Constructs a regular expression pattern buffer using the specified
321
   * compilation flags and regular expression syntax.
322
   *
323
   * @param pattern A regular expression pattern, in the form of a String,
324
   *   StringBuffer, or char[].  Other input types will be converted to
325
   *   strings using the toString() method.
326
   * @param cflags The logical OR of any combination of the compilation flags listed above.
327
   * @param syntax The type of regular expression syntax to use.
328
   * @exception REException The input pattern could not be parsed.
329
   * @exception NullPointerException The pattern was null.
330
   */
331
  public RE (Object pattern, int cflags, RESyntax syntax)
    throws REException
  {
    // The two trailing zeros are the myIndex and nextSub arguments of the
    // internal five-argument constructor.
    this (pattern, cflags, syntax, 0, 0);
  }
335
 
336
  // internal constructor used for alternation
337
  private RE (REToken first, REToken last, int subs, int subIndex,
338
              int minLength, int maxLength)
339
  {
340
    super (subIndex);
341
    firstToken = first;
342
    lastToken = last;
343
    numSubs = subs;
344
    minimumLength = minLength;
345
    maximumLength = maxLength;
346
    addToken (new RETokenEndSub (subIndex));
347
  }
348
 
349
  private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex,
              int nextSub)
    throws REException
  {
    // myIndex is the subexpression index passed up to REToken.
    super (myIndex);

    // All of the actual construction work is done by initialize().
    this.initialize (patternObj, cflags, syntax, myIndex, nextSub);
  }
355
 
356
  // For use by subclasses
357
  protected RE ()
  {
    // Passes 0 to the REToken constructor; no pattern is parsed here.
    super (0);
  }
361
 
362
  // The meat of construction
363
  protected void initialize (Object patternObj, int cflags, RESyntax syntax,
364
                             int myIndex, int nextSub) throws REException
365
  {
366
    char[] pattern;
367
    if (patternObj instanceof String)
368
      {
369
        pattern = ((String) patternObj).toCharArray ();
370
      }
371
    else if (patternObj instanceof char[])
372
      {
373
        pattern = (char[]) patternObj;
374
      }
375
    else if (patternObj instanceof StringBuffer)
376
      {
377
        pattern = new char[((StringBuffer) patternObj).length ()];
378
        ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0);
379
      }
380
    else if (patternObj instanceof StringBuilder)
381
      {
382
        pattern = new char[((StringBuilder) patternObj).length ()];
383
        ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0);
384
      }
385
    else if (patternObj instanceof CPStringBuilder)
386
      {
387
        pattern = new char[((CPStringBuilder) patternObj).length ()];
388
        ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern,
389
                                                 0);
390
      }
391
    else
392
      {
393
        pattern = patternObj.toString ().toCharArray ();
394
      }
395
 
396
    int pLength = pattern.length;
397
 
398
    numSubs = 0;                // Number of subexpressions in this token.
399
    ArrayList < REToken > branches = null;
400
 
401
    // linked list of tokens (sort of -- some closed loops can exist)
402
    firstToken = lastToken = null;
403
 
404
    // Precalculate these so we don't pay for the math every time we
405
    // need to access them.
406
    boolean insens = ((cflags & REG_ICASE) > 0);
407
    boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
408
 
409
    // Parse pattern into tokens.  Does anyone know if it's more efficient
410
    // to use char[] than a String.charAt()?  I'm assuming so.
411
 
412
    // index tracks the position in the char array
413
    int index = 0;
414
 
415
    // this will be the current parse character (pattern[index])
416
    CharUnit unit = new CharUnit ();
417
 
418
    // This is used for {x,y} calculations
419
    IntPair minMax = new IntPair ();
420
 
421
    // Buffer a token so we can create a TokenRepeated, etc.
422
    REToken currentToken = null;
423
    boolean quot = false;
424
 
425
    // Saved syntax and flags.
426
    RESyntax savedSyntax = null;
427
    int savedCflags = 0;
428
    boolean flagsSaved = false;
429
 
430
    while (index < pLength)
431
      {
432
        // read the next character unit (including backslash escapes)
433
        index = getCharUnit (pattern, index, unit, quot);
434
 
435
        if (unit.bk)
436
          if (unit.ch == 'Q')
437
            {
438
              quot = true;
439
              continue;
440
            }
441
          else if (unit.ch == 'E')
442
            {
443
              quot = false;
444
              continue;
445
            }
446
        if (quot)
447
          unit.bk = false;
448
 
449
        if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot))
450
          {
451
            if (Character.isWhitespace (unit.ch))
452
              {
453
                continue;
454
              }
455
            if (unit.ch == '#')
456
              {
457
                for (int i = index; i < pLength; i++)
458
                  {
459
                    if (pattern[i] == '\n')
460
                      {
461
                        index = i + 1;
462
                        continue;
463
                      }
464
                    else if (pattern[i] == '\r')
465
                      {
466
                        if (i + 1 < pLength && pattern[i + 1] == '\n')
467
                          {
468
                            index = i + 2;
469
                          }
470
                        else
471
                          {
472
                            index = i + 1;
473
                          }
474
                        continue;
475
                      }
476
                  }
477
                index = pLength;
478
                continue;
479
              }
480
          }
481
 
482
        // ALTERNATION OPERATOR
483
        //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
484
        //  not available if RE_LIMITED_OPS is set
485
 
486
        // TODO: the '\n' literal here should be a test against REToken.newline,
487
        // which unfortunately may be more than a single character.
488
        if (((unit.ch == '|'
489
              && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
490
             || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n')
491
                 && !(unit.bk || quot)))
492
            && !syntax.get (RESyntax.RE_LIMITED_OPS))
493
          {
494
            // make everything up to here be a branch. create vector if nec.
495
            addToken (currentToken);
496
            RE theBranch =
497
              new RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
498
                      maximumLength);
499
            minimumLength = 0;
500
            maximumLength = 0;
501
            if (branches == null)
502
              {
503
                branches = new ArrayList < REToken > ();
504
              }
505
            branches.add (theBranch);
506
            firstToken = lastToken = currentToken = null;
507
          }
508
 
509
        // INTERVAL OPERATOR:
510
        //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
511
        //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
512
        //
513
        // OPEN QUESTION:
514
        //  what is proper interpretation of '{' at start of string?
515
        //
516
        // This method used to check "repeat.empty.token" to avoid such regexp
517
        // as "(a*){2,}", but now "repeat.empty.token" is allowed.
518
 
519
        else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS)
520
                 && (syntax.
521
                     get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot)))
522
          {
523
            int newIndex = getMinMax (pattern, index, minMax, syntax);
524
            if (newIndex > index)
525
              {
526
                if (minMax.first > minMax.second)
527
                  throw new
528
                    REException (getLocalizedMessage ("interval.order"),
529
                                 REException.REG_BADRPT, newIndex);
530
                if (currentToken == null)
531
                  throw new
532
                    REException (getLocalizedMessage ("repeat.no.token"),
533
                                 REException.REG_BADRPT, newIndex);
534
                if (currentToken instanceof RETokenRepeated)
535
                  throw new
536
                    REException (getLocalizedMessage ("repeat.chained"),
537
                                 REException.REG_BADRPT, newIndex);
538
                if (currentToken instanceof RETokenWordBoundary
539
                    || currentToken instanceof RETokenWordBoundary)
540
                  throw new
541
                    REException (getLocalizedMessage ("repeat.assertion"),
542
                                 REException.REG_BADRPT, newIndex);
543
                index = newIndex;
544
                currentToken =
545
                  setRepeated (currentToken, minMax.first, minMax.second,
546
                               index);
547
              }
548
            else
549
              {
550
                addToken (currentToken);
551
                currentToken = new RETokenChar (subIndex, unit.ch, insens);
552
                if (insensUSASCII)
553
                  currentToken.unicodeAware = false;
554
              }
555
          }
556
 
557
        // LIST OPERATOR:
558
        //  [...] | [^...]
559
 
560
        else if ((unit.ch == '[') && !(unit.bk || quot))
561
          {
562
            // Create a new RETokenOneOf
563
            ParseCharClassResult result =
564
              parseCharClass (subIndex, pattern, index, pLength, cflags,
565
                              syntax, 0);
566
            addToken (currentToken);
567
            currentToken = result.token;
568
            index = result.index;
569
          }
570
 
571
        // SUBEXPRESSIONS
572
        //  (...) | \(...\) depending on RE_NO_BK_PARENS
573
 
574
        else if ((unit.ch == '(')
575
                 && (syntax.
576
                     get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
577
          {
578
            boolean pure = false;
579
            boolean comment = false;
580
            boolean lookAhead = false;
581
            boolean lookBehind = false;
582
            boolean independent = false;
583
            boolean negativelh = false;
584
            boolean negativelb = false;
585
            if ((index + 1 < pLength) && (pattern[index] == '?'))
586
              {
587
                switch (pattern[index + 1])
588
                  {
589
                  case '!':
590
                    if (syntax.get (RESyntax.RE_LOOKAHEAD))
591
                      {
592
                        pure = true;
593
                        negativelh = true;
594
                        lookAhead = true;
595
                        index += 2;
596
                      }
597
                    break;
598
                  case '=':
599
                    if (syntax.get (RESyntax.RE_LOOKAHEAD))
600
                      {
601
                        pure = true;
602
                        lookAhead = true;
603
                        index += 2;
604
                      }
605
                    break;
606
                  case '<':
607
                    // We assume that if the syntax supports look-ahead,
608
                    // it also supports look-behind.
609
                    if (syntax.get (RESyntax.RE_LOOKAHEAD))
610
                      {
611
                        index++;
612
                        switch (pattern[index + 1])
613
                          {
614
                          case '!':
615
                            pure = true;
616
                            negativelb = true;
617
                            lookBehind = true;
618
                            index += 2;
619
                            break;
620
                          case '=':
621
                            pure = true;
622
                            lookBehind = true;
623
                            index += 2;
624
                          }
625
                      }
626
                    break;
627
                  case '>':
628
                    // We assume that if the syntax supports look-ahead,
629
                    // it also supports independent group.
630
                    if (syntax.get (RESyntax.RE_LOOKAHEAD))
631
                      {
632
                        pure = true;
633
                        independent = true;
634
                        index += 2;
635
                      }
636
                    break;
637
                  case 'i':
638
                  case 'd':
639
                  case 'm':
640
                  case 's':
641
                  case 'u':
642
                  case 'x':
643
                  case '-':
644
                    if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS))
645
                      break;
646
                    // Set or reset syntax flags.
647
                    int flagIndex = index + 1;
648
                    int endFlag = -1;
649
                    RESyntax newSyntax = new RESyntax (syntax);
650
                    int newCflags = cflags;
651
                    boolean negate = false;
652
                    while (flagIndex < pLength && endFlag < 0)
653
                      {
654
                        switch (pattern[flagIndex])
655
                          {
656
                          case 'i':
657
                            if (negate)
658
                              newCflags &= ~REG_ICASE;
659
                            else
660
                              newCflags |= REG_ICASE;
661
                            flagIndex++;
662
                            break;
663
                          case 'd':
664
                            if (negate)
665
                              newSyntax.setLineSeparator (RESyntax.
666
                                                          DEFAULT_LINE_SEPARATOR);
667
                            else
668
                              newSyntax.setLineSeparator ("\n");
669
                            flagIndex++;
670
                            break;
671
                          case 'm':
672
                            if (negate)
673
                              newCflags &= ~REG_MULTILINE;
674
                            else
675
                              newCflags |= REG_MULTILINE;
676
                            flagIndex++;
677
                            break;
678
                          case 's':
679
                            if (negate)
680
                              newCflags &= ~REG_DOT_NEWLINE;
681
                            else
682
                              newCflags |= REG_DOT_NEWLINE;
683
                            flagIndex++;
684
                            break;
685
                          case 'u':
686
                            if (negate)
687
                              newCflags |= REG_ICASE_USASCII;
688
                            else
689
                              newCflags &= ~REG_ICASE_USASCII;
690
                            flagIndex++;
691
                            break;
692
                          case 'x':
693
                            if (negate)
694
                              newCflags &= ~REG_X_COMMENTS;
695
                            else
696
                              newCflags |= REG_X_COMMENTS;
697
                            flagIndex++;
698
                            break;
699
                          case '-':
700
                            negate = true;
701
                            flagIndex++;
702
                            break;
703
                          case ':':
704
                          case ')':
705
                            endFlag = pattern[flagIndex];
706
                            break;
707
                          default:
708
                            throw new
709
                              REException (getLocalizedMessage
710
                                           ("repeat.no.token"),
711
                                           REException.REG_BADRPT, index);
712
                          }
713
                      }
714
                    if (endFlag == ')')
715
                      {
716
                        syntax = newSyntax;
717
                        cflags = newCflags;
718
                        insens = ((cflags & REG_ICASE) > 0);
719
                        insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
720
                        // This can be treated as though it were a comment.
721
                        comment = true;
722
                        index = flagIndex - 1;
723
                        break;
724
                      }
725
                    if (endFlag == ':')
726
                      {
727
                        savedSyntax = syntax;
728
                        savedCflags = cflags;
729
                        flagsSaved = true;
730
                        syntax = newSyntax;
731
                        cflags = newCflags;
732
                        insens = ((cflags & REG_ICASE) > 0);
733
                        insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
734
                        index = flagIndex - 1;
735
                        // Fall through to the next case.
736
                      }
737
                    else
738
                      {
739
                        throw new
740
                          REException (getLocalizedMessage
741
                                       ("unmatched.paren"),
742
                                       REException.REG_ESUBREG, index);
743
                      }
744
                  case ':':
745
                    if (syntax.get (RESyntax.RE_PURE_GROUPING))
746
                      {
747
                        pure = true;
748
                        index += 2;
749
                      }
750
                    break;
751
                  case '#':
752
                    if (syntax.get (RESyntax.RE_COMMENTS))
753
                      {
754
                        comment = true;
755
                      }
756
                    break;
757
                  default:
758
                    throw new
759
                      REException (getLocalizedMessage ("repeat.no.token"),
760
                                   REException.REG_BADRPT, index);
761
                  }
762
              }
763
 
764
            if (index >= pLength)
765
              {
766
                throw new
767
                  REException (getLocalizedMessage ("unmatched.paren"),
768
                               REException.REG_ESUBREG, index);
769
              }
770
 
771
            // find end of subexpression
772
            int endIndex = index;
773
            int nextIndex = index;
774
            int nested = 0;
775
 
776
            while (((nextIndex =
777
                     getCharUnit (pattern, endIndex, unit, false)) > 0)
778
                   && !(nested == 0 && (unit.ch == ')')
779
                        && (syntax.
780
                            get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
781
                                                              || quot))))
782
              {
783
                if ((endIndex = nextIndex) >= pLength)
784
                  throw new
785
                    REException (getLocalizedMessage ("subexpr.no.end"),
786
                                 REException.REG_ESUBREG, nextIndex);
787
                else
788
              if ((unit.ch == '[') && !(unit.bk || quot))
789
                {
790
                  // I hate to do something similar to the LIST OPERATOR matters
791
                  // above, but ...
792
                  int listIndex = nextIndex;
793
                  if (listIndex < pLength && pattern[listIndex] == '^')
794
                    listIndex++;
795
                  if (listIndex < pLength && pattern[listIndex] == ']')
796
                    listIndex++;
797
                  int listEndIndex = -1;
798
                  int listNest = 0;
799
                  while (listIndex < pLength && listEndIndex < 0)
800
                    {
801
                      switch (pattern[listIndex++])
802
                        {
803
                        case '\\':
804
                          listIndex++;
805
                          break;
806
                        case '[':
807
                          // Sun's API document says that regexp like "[a-d[m-p]]"
808
                          // is legal. Even something like "[[[^]]]]" is accepted.
809
                          listNest++;
810
                          if (listIndex < pLength
811
                              && pattern[listIndex] == '^')
812
                            listIndex++;
813
                          if (listIndex < pLength
814
                              && pattern[listIndex] == ']')
815
                            listIndex++;
816
                          break;
817
                        case ']':
818
                          if (listNest == 0)
819
                            listEndIndex = listIndex;
820
                          listNest--;
821
                          break;
822
                        }
823
                    }
824
                  if (listEndIndex >= 0)
825
                    {
826
                      nextIndex = listEndIndex;
827
                      if ((endIndex = nextIndex) >= pLength)
828
                        throw new
829
                          REException (getLocalizedMessage ("subexpr.no.end"),
830
                                       REException.REG_ESUBREG, nextIndex);
831
                      else
832
                      continue;
833
                    }
834
                  throw new
835
                    REException (getLocalizedMessage ("subexpr.no.end"),
836
                                 REException.REG_ESUBREG, nextIndex);
837
                }
838
              else if (unit.ch == '('
839
                       && (syntax.
840
                           get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
841
                                                             || quot)))
842
                nested++;
843
              else if (unit.ch == ')'
844
                       && (syntax.
845
                           get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
846
                                                             || quot)))
847
                nested--;
848
              }
849
 
850
            // endIndex is now position at a ')','\)'
851
            // nextIndex is end of string or position after ')' or '\)'
852
 
853
            if (comment)
854
              index = nextIndex;
855
            else
856
              {                 // not a comment
857
                // create RE subexpression as token.
858
                addToken (currentToken);
859
                if (!pure)
860
                  {
861
                    numSubs++;
862
                  }
863
 
864
                int useIndex = (pure || lookAhead || lookBehind
865
                                || independent) ? 0 : nextSub + numSubs;
866
                currentToken =
867
                  new RE (String.valueOf (pattern, index, endIndex - index).
868
                          toCharArray (), cflags, syntax, useIndex,
869
                          nextSub + numSubs);
870
                numSubs += ((RE) currentToken).getNumSubs ();
871
 
872
                if (lookAhead)
873
                  {
874
                    currentToken =
875
                      new RETokenLookAhead (currentToken, negativelh);
876
                  }
877
                else if (lookBehind)
878
                  {
879
                    currentToken =
880
                      new RETokenLookBehind (currentToken, negativelb);
881
                  }
882
                else if (independent)
883
                  {
884
                    currentToken = new RETokenIndependent (currentToken);
885
                  }
886
 
887
                index = nextIndex;
888
                if (flagsSaved)
889
                  {
890
                    syntax = savedSyntax;
891
                    cflags = savedCflags;
892
                    insens = ((cflags & REG_ICASE) > 0);
893
                    insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
894
                    flagsSaved = false;
895
                  }
896
              }                 // not a comment
897
          }                     // subexpression
898
 
899
        // UNMATCHED RIGHT PAREN
900
        // ) or \) throw exception if
901
        // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
902
        else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
903
                 && ((unit.ch == ')')
904
                     && (syntax.
905
                         get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))))
906
          {
907
            throw new REException (getLocalizedMessage ("unmatched.paren"),
908
                                   REException.REG_EPAREN, index);
909
          }
910
 
911
        // START OF LINE OPERATOR
912
        //  ^
913
 
914
        else if ((unit.ch == '^') && !(unit.bk || quot))
915
          {
916
            addToken (currentToken);
917
            currentToken = null;
918
            RETokenStart token = null;
919
            if ((cflags & REG_MULTILINE) > 0)
920
              {
921
                String sep = syntax.getLineSeparator ();
922
                if (sep == null)
923
                  {
924
                    token = new RETokenStart (subIndex, null, true);
925
                  }
926
                else
927
                  {
928
                    token = new RETokenStart (subIndex, sep);
929
                  }
930
              }
931
            else
932
              {
933
                token = new RETokenStart (subIndex, null);
934
              }
935
            addToken (token);
936
          }
937
 
938
        // END OF LINE OPERATOR
939
        //  $
940
 
941
        else if ((unit.ch == '$') && !(unit.bk || quot))
942
          {
943
            addToken (currentToken);
944
            currentToken = null;
945
            RETokenEnd token = null;
946
            if ((cflags & REG_MULTILINE) > 0)
947
              {
948
                String sep = syntax.getLineSeparator ();
949
                if (sep == null)
950
                  {
951
                    token = new RETokenEnd (subIndex, null, true);
952
                  }
953
                else
954
                  {
955
                    token = new RETokenEnd (subIndex, sep);
956
                  }
957
              }
958
            else
959
              {
960
                token = new RETokenEnd (subIndex, null);
961
              }
962
            addToken (token);
963
          }
964
 
965
        // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
966
        //  .
967
 
968
        else if ((unit.ch == '.') && !(unit.bk || quot))
969
          {
970
            addToken (currentToken);
971
            currentToken =
972
              new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE)
973
                              || ((cflags & REG_DOT_NEWLINE) > 0),
974
                              syntax.get (RESyntax.RE_DOT_NOT_NULL));
975
          }
976
 
977
        // ZERO-OR-MORE REPEAT OPERATOR
978
        //  *
979
        //
980
        // This method used to check "repeat.empty.token" to avoid such regexp
981
        // as "(a*)*", but now "repeat.empty.token" is allowed.
982
 
983
        else if ((unit.ch == '*') && !(unit.bk || quot))
984
          {
985
            if (currentToken == null)
986
              throw new REException (getLocalizedMessage ("repeat.no.token"),
987
                                     REException.REG_BADRPT, index);
988
            if (currentToken instanceof RETokenRepeated)
989
              throw new REException (getLocalizedMessage ("repeat.chained"),
990
                                     REException.REG_BADRPT, index);
991
            if (currentToken instanceof RETokenWordBoundary
992
                || currentToken instanceof RETokenWordBoundary)
993
              throw new REException (getLocalizedMessage ("repeat.assertion"),
994
                                     REException.REG_BADRPT, index);
995
            currentToken =
996
              setRepeated (currentToken, 0, Integer.MAX_VALUE, index);
997
          }
998
 
999
        // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
1000
        //  + | \+ depending on RE_BK_PLUS_QM
1001
        //  not available if RE_LIMITED_OPS is set
1002
        //
1003
        // This method used to check "repeat.empty.token" to avoid such regexp
1004
        // as "(a*)+", but now "repeat.empty.token" is allowed.
1005
 
1006
        else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1007
                 && (!syntax.
1008
                     get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1009
          {
1010
            if (currentToken == null)
1011
              throw new REException (getLocalizedMessage ("repeat.no.token"),
1012
                                     REException.REG_BADRPT, index);
1013
 
1014
            // Check for possessive matching on RETokenRepeated
1015
            if (currentToken instanceof RETokenRepeated)
1016
              {
1017
                RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1018
                if (syntax.get (RESyntax.RE_POSSESSIVE_OPS)
1019
                    && !tokenRep.isPossessive () && !tokenRep.isStingy ())
1020
                  tokenRep.makePossessive ();
1021
                else
1022
                  throw new
1023
                    REException (getLocalizedMessage ("repeat.chained"),
1024
                                 REException.REG_BADRPT, index);
1025
 
1026
              }
1027
            else if (currentToken instanceof RETokenWordBoundary
1028
                     || currentToken instanceof RETokenWordBoundary)
1029
              throw new REException (getLocalizedMessage ("repeat.assertion"),
1030
                                     REException.REG_BADRPT, index);
1031
            else
1032
            currentToken =
1033
              setRepeated (currentToken, 1, Integer.MAX_VALUE, index);
1034
          }
1035
 
1036
        // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
1037
        //  ? | \? depending on RE_BK_PLUS_QM
1038
        //  not available if RE_LIMITED_OPS is set
1039
        //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
1040
 
1041
        else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1042
                 && (!syntax.
1043
                     get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1044
          {
1045
            if (currentToken == null)
1046
              throw new REException (getLocalizedMessage ("repeat.no.token"),
1047
                                     REException.REG_BADRPT, index);
1048
 
1049
            // Check for stingy matching on RETokenRepeated
1050
            if (currentToken instanceof RETokenRepeated)
1051
              {
1052
                RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1053
                if (syntax.get (RESyntax.RE_STINGY_OPS)
1054
                    && !tokenRep.isStingy () && !tokenRep.isPossessive ())
1055
                  tokenRep.makeStingy ();
1056
                else
1057
                  throw new
1058
                    REException (getLocalizedMessage ("repeat.chained"),
1059
                                 REException.REG_BADRPT, index);
1060
              }
1061
            else if (currentToken instanceof RETokenWordBoundary
1062
                     || currentToken instanceof RETokenWordBoundary)
1063
              throw new REException (getLocalizedMessage ("repeat.assertion"),
1064
                                     REException.REG_BADRPT, index);
1065
            else
1066
            currentToken = setRepeated (currentToken, 0, 1, index);
1067
          }
1068
 
1069
        // OCTAL CHARACTER
1070
        //  \0377
1071
 
1072
        else if (unit.bk && (unit.ch == '0')
1073
                 && syntax.get (RESyntax.RE_OCTAL_CHAR))
1074
          {
1075
            CharExpression ce =
1076
              getCharExpression (pattern, index - 2, pLength, syntax);
1077
            if (ce == null)
1078
              throw new REException ("invalid octal character",
1079
                                     REException.REG_ESCAPE, index);
1080
            index = index - 2 + ce.len;
1081
            addToken (currentToken);
1082
            currentToken = new RETokenChar (subIndex, ce.ch, insens);
1083
            if (insensUSASCII)
1084
              currentToken.unicodeAware = false;
1085
          }
1086
 
1087
        // BACKREFERENCE OPERATOR
1088
        //  \1 \2 ... \9 and \10 \11 \12 ...
1089
        // not available if RE_NO_BK_REFS is set
1090
        // Perl recognizes \10, \11, and so on only if enough number of
1091
        // parentheses have opened before it, otherwise they are treated
1092
        // as aliases of \010, \011, ... (octal characters).  In case of
1093
        // Sun's JDK, octal character expression must always begin with \0.
1094
        // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
1095
        // JDK treats \2 as a back reference to the 2nd group because
1096
        // there are only two groups. But in our poor implementation,
1097
        // we cannot help but treat \29 as a back reference to the 29th group.
1098
 
1099
        else if (unit.bk && Character.isDigit (unit.ch)
1100
                 && !syntax.get (RESyntax.RE_NO_BK_REFS))
1101
          {
1102
            addToken (currentToken);
1103
            int numBegin = index - 1;
1104
            int numEnd = pLength;
1105
            for (int i = index; i < pLength; i++)
1106
              {
1107
                if (!Character.isDigit (pattern[i]))
1108
                  {
1109
                    numEnd = i;
1110
                    break;
1111
                  }
1112
              }
1113
            int num = parseInt (pattern, numBegin, numEnd - numBegin, 10);
1114
 
1115
            currentToken = new RETokenBackRef (subIndex, num, insens);
1116
            if (insensUSASCII)
1117
              currentToken.unicodeAware = false;
1118
            index = numEnd;
1119
          }
1120
 
1121
        // START OF STRING OPERATOR
1122
        //  \A if RE_STRING_ANCHORS is set
1123
 
1124
        else if (unit.bk && (unit.ch == 'A')
1125
                 && syntax.get (RESyntax.RE_STRING_ANCHORS))
1126
          {
1127
            addToken (currentToken);
1128
            currentToken = new RETokenStart (subIndex, null);
1129
          }
1130
 
1131
        // WORD BREAK OPERATOR
1132
        //  \b if ????
1133
 
1134
        else if (unit.bk && (unit.ch == 'b')
1135
                 && syntax.get (RESyntax.RE_STRING_ANCHORS))
1136
          {
1137
            addToken (currentToken);
1138
            currentToken =
1139
              new RETokenWordBoundary (subIndex,
1140
                                       RETokenWordBoundary.
1141
                                       BEGIN | RETokenWordBoundary.END,
1142
                                       false);
1143
          }
1144
 
1145
        // WORD BEGIN OPERATOR
1146
        //  \< if ????
1147
        else if (unit.bk && (unit.ch == '<'))
1148
          {
1149
            addToken (currentToken);
1150
            currentToken =
1151
              new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN,
1152
                                       false);
1153
          }
1154
 
1155
        // WORD END OPERATOR
1156
        //  \> if ????
1157
        else if (unit.bk && (unit.ch == '>'))
1158
          {
1159
            addToken (currentToken);
1160
            currentToken =
1161
              new RETokenWordBoundary (subIndex, RETokenWordBoundary.END,
1162
                                       false);
1163
          }
1164
 
1165
        // NON-WORD BREAK OPERATOR
1166
        // \B if ????
1167
 
1168
        else if (unit.bk && (unit.ch == 'B')
1169
                 && syntax.get (RESyntax.RE_STRING_ANCHORS))
1170
          {
1171
            addToken (currentToken);
1172
            currentToken =
1173
              new RETokenWordBoundary (subIndex,
1174
                                       RETokenWordBoundary.
1175
                                       BEGIN | RETokenWordBoundary.END, true);
1176
          }
1177
 
1178
 
1179
        // DIGIT OPERATOR
1180
        //  \d if RE_CHAR_CLASS_ESCAPES is set
1181
 
1182
        else if (unit.bk && (unit.ch == 'd')
1183
                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1184
          {
1185
            addToken (currentToken);
1186
            currentToken =
1187
              new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false);
1188
            if (insensUSASCII)
1189
              currentToken.unicodeAware = false;
1190
          }
1191
 
1192
        // NON-DIGIT OPERATOR
1193
        //  \D
1194
 
1195
        else if (unit.bk && (unit.ch == 'D')
1196
                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1197
          {
1198
            addToken (currentToken);
1199
            currentToken =
1200
              new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true);
1201
            if (insensUSASCII)
1202
              currentToken.unicodeAware = false;
1203
          }
1204
 
1205
        // NEWLINE ESCAPE
1206
        //  \n
1207
 
1208
        else if (unit.bk && (unit.ch == 'n'))
1209
          {
1210
            addToken (currentToken);
1211
            currentToken = new RETokenChar (subIndex, '\n', false);
1212
          }
1213
 
1214
        // RETURN ESCAPE
1215
        //  \r
1216
 
1217
        else if (unit.bk && (unit.ch == 'r'))
1218
          {
1219
            addToken (currentToken);
1220
            currentToken = new RETokenChar (subIndex, '\r', false);
1221
          }
1222
 
1223
        // WHITESPACE OPERATOR
1224
        //  \s if RE_CHAR_CLASS_ESCAPES is set
1225
 
1226
        else if (unit.bk && (unit.ch == 's')
1227
                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1228
          {
1229
            addToken (currentToken);
1230
            currentToken =
1231
              new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false);
1232
            if (insensUSASCII)
1233
              currentToken.unicodeAware = false;
1234
          }
1235
 
1236
        // NON-WHITESPACE OPERATOR
1237
        //  \S
1238
 
1239
        else if (unit.bk && (unit.ch == 'S')
1240
                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1241
          {
1242
            addToken (currentToken);
1243
            currentToken =
1244
              new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true);
1245
            if (insensUSASCII)
1246
              currentToken.unicodeAware = false;
1247
          }
1248
 
1249
        // TAB ESCAPE
1250
        //  \t
1251
 
1252
        else if (unit.bk && (unit.ch == 't'))
1253
          {
1254
            addToken (currentToken);
1255
            currentToken = new RETokenChar (subIndex, '\t', false);
1256
          }
1257
 
1258
        // ALPHANUMERIC OPERATOR
1259
        //  \w
1260
 
1261
        else if (unit.bk && (unit.ch == 'w')
1262
                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1263
          {
1264
            addToken (currentToken);
1265
            currentToken =
1266
              new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false);
1267
            if (insensUSASCII)
1268
              currentToken.unicodeAware = false;
1269
          }
1270
 
1271
        // NON-ALPHANUMERIC OPERATOR
1272
        //  \W
1273
 
1274
        else if (unit.bk && (unit.ch == 'W')
1275
                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1276
          {
1277
            addToken (currentToken);
1278
            currentToken =
1279
              new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true);
1280
            if (insensUSASCII)
1281
              currentToken.unicodeAware = false;
1282
          }
1283
 
1284
        // END OF STRING OPERATOR
1285
        //  \Z, \z
1286
 
1287
        // FIXME: \Z and \z are different in that if the input string
1288
        // ends with a line terminator, \Z matches the position before
1289
        // the final terminator.  This special behavior of \Z is yet
1290
        // to be implemented.
1291
 
1292
        else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1293
                 syntax.get (RESyntax.RE_STRING_ANCHORS))
1294
          {
1295
            addToken (currentToken);
1296
            currentToken = new RETokenEnd (subIndex, null);
1297
          }
1298
 
1299
        // HEX CHARACTER, UNICODE CHARACTER
1300
        //  \x1B, \u1234
1301
 
1302
        else
1303
          if ((unit.bk && (unit.ch == 'x')
1304
               && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk
1305
                                                         && (unit.ch == 'u')
1306
                                                         && syntax.
1307
                                                         get (RESyntax.
1308
                                                              RE_UNICODE_CHAR)))
1309
          {
1310
            CharExpression ce =
1311
              getCharExpression (pattern, index - 2, pLength, syntax);
1312
            if (ce == null)
1313
              throw new REException ("invalid hex character",
1314
                                     REException.REG_ESCAPE, index);
1315
            index = index - 2 + ce.len;
1316
            addToken (currentToken);
1317
            currentToken = new RETokenChar (subIndex, ce.ch, insens);
1318
            if (insensUSASCII)
1319
              currentToken.unicodeAware = false;
1320
          }
1321
 
1322
        // NAMED PROPERTY
1323
        // \p{prop}, \P{prop}
1324
 
1325
        else
1326
          if ((unit.bk && (unit.ch == 'p')
1327
               && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk
1328
                                                               && (unit.ch ==
1329
                                                                   'P')
1330
                                                               && syntax.
1331
                                                               get (RESyntax.
1332
                                                                    RE_NAMED_PROPERTY)))
1333
          {
1334
            NamedProperty np = getNamedProperty (pattern, index - 2, pLength);
1335
            if (np == null)
1336
              throw new REException ("invalid escape sequence",
1337
                                     REException.REG_ESCAPE, index);
1338
            index = index - 2 + np.len;
1339
            addToken (currentToken);
1340
            currentToken =
1341
              getRETokenNamedProperty (subIndex, np, insens, index);
1342
            if (insensUSASCII)
1343
              currentToken.unicodeAware = false;
1344
          }
1345
 
1346
        // END OF PREVIOUS MATCH
1347
        //  \G
1348
 
1349
        else if (unit.bk && (unit.ch == 'G') &&
1350
                 syntax.get (RESyntax.RE_STRING_ANCHORS))
1351
          {
1352
            addToken (currentToken);
1353
            currentToken = new RETokenEndOfPreviousMatch (subIndex);
1354
          }
1355
 
1356
        // NON-SPECIAL CHARACTER (or escape to make literal)
1357
        //  c | \* for example
1358
 
1359
        else
1360
          {                     // not a special character
1361
            addToken (currentToken);
1362
            currentToken = new RETokenChar (subIndex, unit.ch, insens);
1363
            if (insensUSASCII)
1364
              currentToken.unicodeAware = false;
1365
          }
1366
      }                         // end while
1367
 
1368
    // Add final buffered token and an EndSub marker
1369
    addToken (currentToken);
1370
 
1371
    if (branches != null)
1372
      {
1373
        branches.
1374
          add (new
1375
               RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
1376
                   maximumLength));
1377
        branches.trimToSize (); // compact the Vector
1378
        minimumLength = 0;
1379
        maximumLength = 0;
1380
        firstToken = lastToken = null;
1381
        addToken (new RETokenOneOf (subIndex, branches, false));
1382
      }
1383
    else
1384
      addToken (new RETokenEndSub (subIndex));
1385
 
1386
  }
1387
 
1388
  private static class ParseCharClassResult
1389
  {
1390
    RETokenOneOf token;
1391
    int index;
1392
    boolean returnAtAndOperator = false;
1393
  }
1394
 
1395
  /**
1396
   * Parse [...] or [^...] and make an RETokenOneOf instance.
1397
   * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1398
   * @param pattern Input array of characters to be parsed.
1399
   * @param index Index pointing to the character next to the beginning '['.
1400
   * @param pLength Limit of the input array.
1401
   * @param cflags Compilation flags used to parse the pattern.
1402
   * @param pflags Flags that affect the behavior of this method.
1403
   * @param syntax Syntax used to parse the pattern.
1404
   */
1405
  private static ParseCharClassResult parseCharClass (int subIndex,
1406
                                                      char[]pattern,
1407
                                                      int index, int pLength,
1408
                                                      int cflags,
1409
                                                      RESyntax syntax,
1410
                                                      int pflags) throws
1411
    REException
1412
  {
1413
 
1414
    boolean insens = ((cflags & REG_ICASE) > 0);
1415
    boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1416
    final ArrayList < REToken > options = new ArrayList < REToken > ();
1417
      ArrayList < Object > addition = new ArrayList < Object > ();
1418
    boolean additionAndAppeared = false;
1419
    final int RETURN_AT_AND = 0x01;
1420
    boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1421
    boolean negative = false;
1422
    char ch;
1423
 
1424
    char lastChar = 0;
1425
    boolean lastCharIsSet = false;
1426
    if (index == pLength)
1427
      throw new REException (getLocalizedMessage ("unmatched.bracket"),
1428
                             REException.REG_EBRACK, index);
1429
 
1430
    // Check for initial caret, negation
1431
    if ((ch = pattern[index]) == '^')
1432
      {
1433
        negative = true;
1434
        if (++index == pLength)
1435
          throw new REException (getLocalizedMessage ("class.no.end"),
1436
                                 REException.REG_EBRACK, index);
1437
          ch = pattern[index];
1438
      }
1439
 
1440
    // Check for leading right bracket literal
1441
    if (ch == ']')
1442
      {
1443
        lastChar = ch;
1444
        lastCharIsSet = true;
1445
        if (++index == pLength)
1446
          throw new REException (getLocalizedMessage ("class.no.end"),
1447
                                 REException.REG_EBRACK, index);
1448
      }
1449
 
1450
    while ((ch = pattern[index++]) != ']')
1451
      {
1452
        if ((ch == '-') && (lastCharIsSet))
1453
          {
1454
            if (index == pLength)
1455
              throw new REException (getLocalizedMessage ("class.no.end"),
1456
                                     REException.REG_EBRACK, index);
1457
            if ((ch = pattern[index]) == ']')
1458
              {
1459
                RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1460
                if (insensUSASCII)
1461
                  t.unicodeAware = false;
1462
                options.add (t);
1463
                lastChar = '-';
1464
              }
1465
            else
1466
              {
1467
                if ((ch == '\\')
1468
                    && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1469
                  {
1470
                    CharExpression ce =
1471
                      getCharExpression (pattern, index, pLength, syntax);
1472
                    if (ce == null)
1473
                      throw new REException ("invalid escape sequence",
1474
                                             REException.REG_ESCAPE, index);
1475
                    ch = ce.ch;
1476
                    index = index + ce.len - 1;
1477
                  }
1478
                RETokenRange t =
1479
                  new RETokenRange (subIndex, lastChar, ch, insens);
1480
                if (insensUSASCII)
1481
                  t.unicodeAware = false;
1482
                options.add (t);
1483
                lastChar = 0;
1484
                lastCharIsSet = false;
1485
                index++;
1486
              }
1487
          }
1488
        else if ((ch == '\\')
1489
                 && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1490
          {
1491
            if (index == pLength)
1492
              throw new REException (getLocalizedMessage ("class.no.end"),
1493
                                     REException.REG_EBRACK, index);
1494
            int posixID = -1;
1495
            boolean negate = false;
1496
            char asciiEsc = 0;
1497
            boolean asciiEscIsSet = false;
1498
            NamedProperty np = null;
1499
            if (("dswDSW".indexOf (pattern[index]) != -1)
1500
                && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS))
1501
              {
1502
                switch (pattern[index])
1503
                  {
1504
                  case 'D':
1505
                    negate = true;
1506
                  case 'd':
1507
                    posixID = RETokenPOSIX.DIGIT;
1508
                    break;
1509
                  case 'S':
1510
                    negate = true;
1511
                  case 's':
1512
                    posixID = RETokenPOSIX.SPACE;
1513
                    break;
1514
                  case 'W':
1515
                    negate = true;
1516
                  case 'w':
1517
                    posixID = RETokenPOSIX.ALNUM;
1518
                    break;
1519
                  }
1520
              }
1521
            if (("pP".indexOf (pattern[index]) != -1)
1522
                && syntax.get (RESyntax.RE_NAMED_PROPERTY))
1523
              {
1524
                np = getNamedProperty (pattern, index - 1, pLength);
1525
                if (np == null)
1526
                  throw new REException ("invalid escape sequence",
1527
                                         REException.REG_ESCAPE, index);
1528
                index = index - 1 + np.len - 1;
1529
              }
1530
            else
1531
              {
1532
                CharExpression ce =
1533
                  getCharExpression (pattern, index - 1, pLength, syntax);
1534
                if (ce == null)
1535
                  throw new REException ("invalid escape sequence",
1536
                                         REException.REG_ESCAPE, index);
1537
                asciiEsc = ce.ch;
1538
                asciiEscIsSet = true;
1539
                index = index - 1 + ce.len - 1;
1540
              }
1541
            if (lastCharIsSet)
1542
              {
1543
                RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1544
                if (insensUSASCII)
1545
                  t.unicodeAware = false;
1546
                options.add (t);
1547
              }
1548
 
1549
            if (posixID != -1)
1550
              {
1551
                RETokenPOSIX t =
1552
                  new RETokenPOSIX (subIndex, posixID, insens, negate);
1553
                if (insensUSASCII)
1554
                  t.unicodeAware = false;
1555
                options.add (t);
1556
              }
1557
            else if (np != null)
1558
              {
1559
                RETokenNamedProperty t =
1560
                  getRETokenNamedProperty (subIndex, np, insens, index);
1561
                if (insensUSASCII)
1562
                  t.unicodeAware = false;
1563
                options.add (t);
1564
              }
1565
            else if (asciiEscIsSet)
1566
              {
1567
                lastChar = asciiEsc;
1568
                lastCharIsSet = true;
1569
              }
1570
            else
1571
              {
1572
                lastChar = pattern[index];
1573
                lastCharIsSet = true;
1574
              }
1575
            ++index;
1576
          }
1577
        else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES))
1578
                 && (index < pLength) && (pattern[index] == ':'))
1579
          {
1580
            CPStringBuilder posixSet = new CPStringBuilder ();
1581
            index = getPosixSet (pattern, index + 1, posixSet);
1582
            int posixId = RETokenPOSIX.intValue (posixSet.toString ());
1583
            if (posixId != -1)
1584
              {
1585
                RETokenPOSIX t =
1586
                  new RETokenPOSIX (subIndex, posixId, insens, false);
1587
                if (insensUSASCII)
1588
                  t.unicodeAware = false;
1589
                options.add (t);
1590
              }
1591
          }
1592
        else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS)))
1593
          {
1594
            ParseCharClassResult result =
1595
              parseCharClass (subIndex, pattern, index, pLength, cflags,
1596
                              syntax, 0);
1597
            addition.add (result.token);
1598
            addition.add ("|");
1599
            index = result.index;
1600
          }
1601
        else if ((ch == '&') &&
1602
                 (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) &&
1603
                 (index < pLength) && (pattern[index] == '&'))
1604
          {
1605
            if (returnAtAndOperator)
1606
              {
1607
                ParseCharClassResult result = new ParseCharClassResult ();
1608
                options.trimToSize ();
1609
                if (additionAndAppeared)
1610
                  addition.add ("&");
1611
                if (addition.size () == 0)
1612
                  addition = null;
1613
                result.token = new RETokenOneOf (subIndex,
1614
                                                 options, addition, negative);
1615
                result.index = index - 1;
1616
                result.returnAtAndOperator = true;
1617
                return result;
1618
              }
1619
            // The precedence of the operator "&&" is the lowest.
1620
            // So we postpone adding "&" until other elements
1621
            // are added. And we insert Boolean.FALSE at the
1622
            // beginning of the list of tokens following "&&".
1623
            // So, "&&[a-b][k-m]" will be stored in the list
1624
            // addition in this order:
1625
            //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1626
            if (additionAndAppeared)
1627
              addition.add ("&");
1628
            addition.add (Boolean.FALSE);
1629
            additionAndAppeared = true;
1630
 
1631
            // The part on which "&&" operates may be either
1632
            //   (1) explicitly enclosed by []
1633
            //   or
1634
            //   (2) not enclosed by [] and terminated by the
1635
            //       next "&&" or the end of the character list.
1636
            //  Let the preceding else if block do the case (1).
1637
            //  We must do something in case of (2).
1638
            if ((index + 1 < pLength) && (pattern[index + 1] != '['))
1639
              {
1640
                ParseCharClassResult result =
1641
                  parseCharClass (subIndex, pattern, index + 1, pLength,
1642
                                  cflags, syntax,
1643
                                  RETURN_AT_AND);
1644
                addition.add (result.token);
1645
                addition.add ("|");
1646
                // If the method returned at the next "&&", it is OK.
1647
                // Otherwise we have eaten the mark of the end of this
1648
                // character list "]".  In this case we must give back
1649
                // the end mark.
1650
                index = (result.returnAtAndOperator ?
1651
                         result.index : result.index - 1);
1652
              }
1653
          }
1654
        else
1655
          {
1656
            if (lastCharIsSet)
1657
              {
1658
                RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1659
                if (insensUSASCII)
1660
                  t.unicodeAware = false;
1661
                options.add (t);
1662
              }
1663
            lastChar = ch;
1664
            lastCharIsSet = true;
1665
          }
1666
        if (index == pLength)
1667
          throw new REException (getLocalizedMessage ("class.no.end"),
1668
                                 REException.REG_EBRACK, index);
1669
      }                         // while in list
1670
    // Out of list, index is one past ']'
1671
 
1672
    if (lastCharIsSet)
1673
      {
1674
        RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1675
        if (insensUSASCII)
1676
          t.unicodeAware = false;
1677
        options.add (t);
1678
      }
1679
 
1680
    ParseCharClassResult result = new ParseCharClassResult ();
1681
    // Create a new RETokenOneOf
1682
    options.trimToSize ();
1683
    if (additionAndAppeared)
1684
      addition.add ("&");
1685
    if (addition.size () == 0)
1686
      addition = null;
1687
    result.token = new RETokenOneOf (subIndex, options, addition, negative);
1688
    result.index = index;
1689
    return result;
1690
  }
1691
 
1692
  private static int getCharUnit (char[]input, int index, CharUnit unit,
1693
                                  boolean quot) throws REException
1694
  {
1695
    unit.ch = input[index++];
1696
    unit.bk = (unit.ch == '\\'
1697
               && (!quot || index >= input.length || input[index] == 'E'));
1698
    if (unit.bk)
1699
      if (index < input.length)
1700
        unit.ch = input[index++];
1701
      else
1702
        throw new REException (getLocalizedMessage ("ends.with.backslash"),
1703
                               REException.REG_ESCAPE, index);
1704
    return index;
1705
  }
1706
 
1707
  private static int parseInt (char[]input, int pos, int len, int radix)
1708
  {
1709
    int ret = 0;
1710
    for (int i = pos; i < pos + len; i++)
1711
      {
1712
        ret = ret * radix + Character.digit (input[i], radix);
1713
      }
1714
    return ret;
1715
  }
1716
 
1717
  /**
1718
   * This class represents various expressions for a character.
1719
   * "a"      : 'a' itself.
1720
   * "\0123"  : Octal char 0123
1721
   * "\x1b"   : Hex char 0x1b
1722
   * "\u1234" : Unicode char \u1234
1723
   */
1724
  private static class CharExpression
1725
  {
1726
    /** character represented by this expression */
1727
    char ch;
1728
    /** String expression */
1729
    String expr;
1730
    /** length of this expression */
1731
    int len;
1732
    public String toString ()
1733
    {
1734
      return expr;
1735
    }
1736
  }
1737
 
1738
  private static CharExpression getCharExpression (char[]input, int pos,
1739
                                                   int lim, RESyntax syntax)
1740
  {
1741
    CharExpression ce = new CharExpression ();
1742
    char c = input[pos];
1743
    if (c == '\\')
1744
      {
1745
        if (pos + 1 >= lim)
1746
          return null;
1747
        c = input[pos + 1];
1748
        switch (c)
1749
          {
1750
          case 't':
1751
            ce.ch = '\t';
1752
            ce.len = 2;
1753
            break;
1754
          case 'n':
1755
            ce.ch = '\n';
1756
            ce.len = 2;
1757
            break;
1758
          case 'r':
1759
            ce.ch = '\r';
1760
            ce.len = 2;
1761
            break;
1762
          case 'x':
1763
          case 'u':
1764
            if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) ||
1765
                (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR)))
1766
              {
1767
                int l = 0;
1768
                int expectedLength = (c == 'x' ? 2 : 4);
1769
                for (int i = pos + 2; i < pos + 2 + expectedLength; i++)
1770
                  {
1771
                    if (i >= lim)
1772
                      break;
1773
                    if (!((input[i] >= '0' && input[i] <= '9') ||
1774
                          (input[i] >= 'A' && input[i] <= 'F') ||
1775
                          (input[i] >= 'a' && input[i] <= 'f')))
1776
                      break;
1777
                    l++;
1778
                  }
1779
                if (l != expectedLength)
1780
                  return null;
1781
                ce.ch = (char) (parseInt (input, pos + 2, l, 16));
1782
                ce.len = l + 2;
1783
              }
1784
            else
1785
              {
1786
                ce.ch = c;
1787
                ce.len = 2;
1788
              }
1789
            break;
1790
          case '0':
1791
            if (syntax.get (RESyntax.RE_OCTAL_CHAR))
1792
              {
1793
                int l = 0;
1794
                for (int i = pos + 2; i < pos + 2 + 3; i++)
1795
                  {
1796
                    if (i >= lim)
1797
                      break;
1798
                    if (input[i] < '0' || input[i] > '7')
1799
                      break;
1800
                    l++;
1801
                  }
1802
                if (l == 3 && input[pos + 2] > '3')
1803
                  l--;
1804
                if (l <= 0)
1805
                  return null;
1806
                ce.ch = (char) (parseInt (input, pos + 2, l, 8));
1807
                ce.len = l + 2;
1808
              }
1809
            else
1810
              {
1811
                ce.ch = c;
1812
                ce.len = 2;
1813
              }
1814
            break;
1815
          default:
1816
            ce.ch = c;
1817
            ce.len = 2;
1818
            break;
1819
          }
1820
      }
1821
    else
1822
      {
1823
        ce.ch = input[pos];
1824
        ce.len = 1;
1825
      }
1826
    ce.expr = new String (input, pos, ce.len);
1827
    return ce;
1828
  }
1829
 
1830
  /**
1831
   * This class represents a substring in a pattern string expressing
1832
   * a named property.
1833
   * "\pA"      : Property named "A"
1834
   * "\p{prop}" : Property named "prop"
1835
   * "\PA"      : Property named "A" (Negated)
1836
   * "\P{prop}" : Property named "prop" (Negated)
1837
   */
1838
  private static class NamedProperty
1839
  {
1840
    /** Property name */
1841
    String name;
1842
    /** Negated or not */
1843
    boolean negate;
1844
    /** length of this expression */
1845
    int len;
1846
  }
1847
 
1848
  private static NamedProperty getNamedProperty (char[]input, int pos,
1849
                                                 int lim)
1850
  {
1851
    NamedProperty np = new NamedProperty ();
1852
    char c = input[pos];
1853
    if (c == '\\')
1854
      {
1855
        if (++pos >= lim)
1856
          return null;
1857
        c = input[pos++];
1858
        switch (c)
1859
          {
1860
          case 'p':
1861
            np.negate = false;
1862
            break;
1863
          case 'P':
1864
            np.negate = true;
1865
            break;
1866
          default:
1867
            return null;
1868
          }
1869
        c = input[pos++];
1870
        if (c == '{')
1871
          {
1872
            int p = -1;
1873
            for (int i = pos; i < lim; i++)
1874
              {
1875
                if (input[i] == '}')
1876
                  {
1877
                    p = i;
1878
                    break;
1879
                  }
1880
              }
1881
            if (p < 0)
1882
              return null;
1883
            int len = p - pos;
1884
            np.name = new String (input, pos, len);
1885
            np.len = len + 4;
1886
          }
1887
        else
1888
          {
1889
            np.name = new String (input, pos - 1, 1);
1890
            np.len = 3;
1891
          }
1892
        return np;
1893
      }
1894
    else
1895
      return null;
1896
  }
1897
 
1898
  private static RETokenNamedProperty getRETokenNamedProperty (int subIndex,
1899
                                                               NamedProperty
1900
                                                               np,
1901
                                                               boolean insens,
1902
                                                               int index)
1903
    throws REException
1904
  {
1905
    try
1906
    {
1907
      return new RETokenNamedProperty (subIndex, np.name, insens, np.negate);
1908
    }
1909
    catch (REException e)
1910
    {
1911
      REException ree;
1912
      ree = new REException (e.getMessage (), REException.REG_ESCAPE, index);
1913
      ree.initCause (e);
1914
      throw ree;
1915
    }
1916
  }
1917
 
1918
  /**
1919
   * Checks if the regular expression matches the input in its entirety.
1920
   *
1921
   * @param input The input text.
1922
   */
1923
  public boolean isMatch (Object input)
1924
  {
1925
    return isMatch (input, 0, 0);
1926
  }
1927
 
1928
  /**
1929
   * Checks if the input string, starting from index, is an exact match of
1930
   * this regular expression.
1931
   *
1932
   * @param input The input text.
1933
   * @param index The offset index at which the search should be begin.
1934
   */
1935
  public boolean isMatch (Object input, int index)
1936
  {
1937
    return isMatch (input, index, 0);
1938
  }
1939
 
1940
 
1941
  /**
1942
   * Checks if the input, starting from index and using the specified
1943
   * execution flags, is an exact match of this regular expression.
1944
   *
1945
   * @param input The input text.
1946
   * @param index The offset index at which the search should be begin.
1947
   * @param eflags The logical OR of any execution flags above.
1948
   */
1949
  public boolean isMatch (Object input, int index, int eflags)
1950
  {
1951
    return isMatchImpl (makeCharIndexed (input, index), index, eflags);
1952
  }
1953
 
1954
  private boolean isMatchImpl (CharIndexed input, int index, int eflags)
1955
  {
1956
    if (firstToken == null)     // Trivial case
1957
      return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS);
1958
    REMatch m = new REMatch (numSubs, index, eflags);
1959
    if (firstToken.match (input, m))
1960
      {
1961
        if (m != null)
1962
          {
1963
            if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS)
1964
              {
1965
                return true;
1966
              }
1967
          }
1968
      }
1969
    return false;
1970
  }
1971
 
1972
  /**
1973
   * Returns the maximum number of subexpressions in this regular expression.
1974
   * If the expression contains branches, the value returned will be the
1975
   * maximum subexpressions in any of the branches.
1976
   */
1977
  public int getNumSubs ()
1978
  {
1979
    return numSubs;
1980
  }
1981
 
1982
  // Overrides REToken.setUncle
1983
  void setUncle (REToken uncle)
1984
  {
1985
    if (lastToken != null)
1986
      {
1987
        lastToken.setUncle (uncle);
1988
      }
1989
    else
1990
      super.setUncle (uncle);   // to deal with empty subexpressions
1991
  }
1992
 
1993
  // Overrides REToken.chain
1994
 
1995
  boolean chain (REToken next)
1996
  {
1997
    super.chain (next);
1998
    setUncle (next);
1999
    return true;
2000
  }
2001
 
2002
  /**
2003
   * Returns the minimum number of characters that could possibly
2004
   * constitute a match of this regular expression.
2005
   */
2006
  public int getMinimumLength ()
2007
  {
2008
    return minimumLength;
2009
  }
2010
 
2011
  public int getMaximumLength ()
2012
  {
2013
    return maximumLength;
2014
  }
2015
 
2016
  /**
2017
   * Returns an array of all matches found in the input.
2018
   *
2019
   * If the regular expression allows the empty string to match, it will
2020
   * substitute matches at all positions except the end of the input.
2021
   *
2022
   * @param input The input text.
2023
   * @return a non-null (but possibly zero-length) array of matches
2024
   */
2025
  public REMatch[] getAllMatches (Object input)
2026
  {
2027
    return getAllMatches (input, 0, 0);
2028
  }
2029
 
2030
  /**
2031
   * Returns an array of all matches found in the input,
2032
   * beginning at the specified index position.
2033
   *
2034
   * If the regular expression allows the empty string to match, it will
2035
   * substitute matches at all positions except the end of the input.
2036
   *
2037
   * @param input The input text.
2038
   * @param index The offset index at which the search should be begin.
2039
   * @return a non-null (but possibly zero-length) array of matches
2040
   */
2041
  public REMatch[] getAllMatches (Object input, int index)
2042
  {
2043
    return getAllMatches (input, index, 0);
2044
  }
2045
 
2046
  /**
2047
   * Returns an array of all matches found in the input string,
2048
   * beginning at the specified index position and using the specified
2049
   * execution flags.
2050
   *
2051
   * If the regular expression allows the empty string to match, it will
2052
   * substitute matches at all positions except the end of the input.
2053
   *
2054
   * @param input The input text.
2055
   * @param index The offset index at which the search should be begin.
2056
   * @param eflags The logical OR of any execution flags above.
2057
   * @return a non-null (but possibly zero-length) array of matches
2058
   */
2059
  public REMatch[] getAllMatches (Object input, int index, int eflags)
2060
  {
2061
    return getAllMatchesImpl (makeCharIndexed (input, index), index, eflags);
2062
  }
2063
 
2064
  // this has been changed since 1.03 to be non-overlapping matches
2065
  private REMatch[] getAllMatchesImpl (CharIndexed input, int index,
2066
                                       int eflags)
2067
  {
2068
    List < REMatch > all = new ArrayList < REMatch > ();
2069
    REMatch m = null;
2070
    while ((m = getMatchImpl (input, index, eflags, null)) != null)
2071
      {
2072
        all.add (m);
2073
        index = m.getEndIndex ();
2074
        if (m.end[0] == 0)
2075
          {                     // handle pathological case of zero-length match
2076
            index++;
2077
            input.move (1);
2078
          }
2079
        else
2080
          {
2081
            input.move (m.end[0]);
2082
          }
2083
        if (!input.isValid ())
2084
          break;
2085
      }
2086
    return all.toArray (new REMatch[all.size ()]);
2087
  }
2088
 
2089
  /* Implements abstract method REToken.match() */
2090
  boolean match (CharIndexed input, REMatch mymatch)
2091
  {
2092
    input.setHitEnd (mymatch);
2093
    if (firstToken == null)
2094
      {
2095
        return next (input, mymatch);
2096
      }
2097
 
2098
    // Note the start of this subexpression
2099
    mymatch.start1[subIndex] = mymatch.index;
2100
 
2101
    return firstToken.match (input, mymatch);
2102
  }
2103
 
2104
  REMatch findMatch (CharIndexed input, REMatch mymatch)
2105
  {
2106
    if (mymatch.backtrackStack == null)
2107
      mymatch.backtrackStack = new BacktrackStack ();
2108
    boolean b = match (input, mymatch);
2109
    if (b)
2110
      {
2111
        return mymatch;
2112
      }
2113
    return null;
2114
  }
2115
 
2116
  /**
2117
   * Returns the first match found in the input.  If no match is found,
2118
   * null is returned.
2119
   *
2120
   * @param input The input text.
2121
   * @return An REMatch instance referencing the match, or null if none.
2122
   */
2123
  public REMatch getMatch (Object input)
2124
  {
2125
    return getMatch (input, 0, 0);
2126
  }
2127
 
2128
  /**
2129
   * Returns the first match found in the input, beginning
2130
   * the search at the specified index.  If no match is found,
2131
   * returns null.
2132
   *
2133
   * @param input The input text.
2134
   * @param index The offset within the text to begin looking for a match.
2135
   * @return An REMatch instance referencing the match, or null if none.
2136
   */
2137
  public REMatch getMatch (Object input, int index)
2138
  {
2139
    return getMatch (input, index, 0);
2140
  }
2141
 
2142
  /**
2143
   * Returns the first match found in the input, beginning
2144
   * the search at the specified index, and using the specified
2145
   * execution flags.  If no match is found, returns null.
2146
   *
2147
   * @param input The input text.
2148
   * @param index The offset index at which the search should be begin.
2149
   * @param eflags The logical OR of any execution flags above.
2150
   * @return An REMatch instance referencing the match, or null if none.
2151
   */
2152
  public REMatch getMatch (Object input, int index, int eflags)
2153
  {
2154
    return getMatch (input, index, eflags, null);
2155
  }
2156
 
2157
  /**
2158
   * Returns the first match found in the input, beginning the search
2159
   * at the specified index, and using the specified execution flags.
2160
   * If no match is found, returns null.  If a StringBuffer is
2161
   * provided and is non-null, the contents of the input text from the
2162
   * index to the beginning of the match (or to the end of the input,
2163
   * if there is no match) are appended to the StringBuffer.
2164
   *
2165
   * @param input The input text.
2166
   * @param index The offset index at which the search should be begin.
2167
   * @param eflags The logical OR of any execution flags above.
2168
   * @param buffer The StringBuffer to save pre-match text in.
2169
   * @return An REMatch instance referencing the match, or null if none.  */
2170
  public REMatch getMatch (Object input, int index, int eflags,
2171
                           CPStringBuilder buffer)
2172
  {
2173
    return getMatchImpl (makeCharIndexed (input, index), index, eflags,
2174
                         buffer);
2175
  }
2176
 
2177
  REMatch getMatchImpl (CharIndexed input, int anchor, int eflags,
2178
                        CPStringBuilder buffer)
2179
  {
2180
    boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
2181
    boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
2182
    RE re = (tryEntireMatch ? (RE) this.clone () : this);
2183
    if (tryEntireMatch)
2184
      {
2185
        RETokenEnd reEnd = new RETokenEnd (0, null);
2186
        reEnd.setFake (true);
2187
        re.chain (reEnd);
2188
      }
2189
    // Create a new REMatch to hold results
2190
    REMatch mymatch = new REMatch (numSubs, anchor, eflags);
2191
    do
2192
      {
2193
        /* The following potimization is commented out because
2194
           the matching should be tried even if the length of
2195
           input is obviously too short in order that
2196
           java.util.regex.Matcher#hitEnd() may work correctly.
2197
           // Optimization: check if anchor + minimumLength > length
2198
           if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
2199
         */
2200
        if (re.match (input, mymatch))
2201
          {
2202
            REMatch best = mymatch;
2203
            // We assume that the match that coms first is the best.
2204
            // And the following "The longer, the better" rule has
2205
            // been commented out. The longest is not neccesarily
2206
            // the best. For example, "a" out of "aaa" is the best
2207
            // match for /a+?/.
2208
            /*
2209
               // Find best match of them all to observe leftmost longest
2210
               while ((mymatch = mymatch.next) != null) {
2211
               if (mymatch.index > best.index) {
2212
               best = mymatch;
2213
               }
2214
               }
2215
             */
2216
            best.end[0] = best.index;
2217
            best.finish (input);
2218
            input.setLastMatch (best);
2219
            return best;
2220
          }
2221
        /* End of the optimization commented out
2222
           }
2223
         */
2224
        mymatch.clear (++anchor);
2225
        // Append character to buffer if needed
2226
        if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS)
2227
          {
2228
            buffer.append (input.charAt (0));
2229
          }
2230
        // java.util.regex.Matcher#hitEnd() requires that the search should
2231
        // be tried at the end of input, so we use move1(1) instead of move(1)
2232
      }
2233
    while (doMove && input.move1 (1));
2234
 
2235
    // Special handling at end of input for e.g. "$"
2236
    if (minimumLength == 0)
2237
      {
2238
        if (match (input, mymatch))
2239
          {
2240
            mymatch.finish (input);
2241
            return mymatch;
2242
          }
2243
      }
2244
 
2245
    return null;
2246
  }
2247
 
2248
  /**
2249
   * Returns an REMatchEnumeration that can be used to iterate over the
2250
   * matches found in the input text.
2251
   *
2252
   * @param input The input text.
2253
   * @return A non-null REMatchEnumeration instance.
2254
   */
2255
  public REMatchEnumeration getMatchEnumeration (Object input)
2256
  {
2257
    return getMatchEnumeration (input, 0, 0);
2258
  }
2259
 
2260
 
2261
  /**
2262
   * Returns an REMatchEnumeration that can be used to iterate over the
2263
   * matches found in the input text.
2264
   *
2265
   * @param input The input text.
2266
   * @param index The offset index at which the search should be begin.
2267
   * @return A non-null REMatchEnumeration instance, with its input cursor
2268
   *  set to the index position specified.
2269
   */
2270
  public REMatchEnumeration getMatchEnumeration (Object input, int index)
2271
  {
2272
    return getMatchEnumeration (input, index, 0);
2273
  }
2274
 
2275
  /**
2276
   * Returns an REMatchEnumeration that can be used to iterate over the
2277
   * matches found in the input text.
2278
   *
2279
   * @param input The input text.
2280
   * @param index The offset index at which the search should be begin.
2281
   * @param eflags The logical OR of any execution flags above.
2282
   * @return A non-null REMatchEnumeration instance, with its input cursor
2283
   *  set to the index position specified.
2284
   */
2285
  public REMatchEnumeration getMatchEnumeration (Object input, int index,
2286
                                                 int eflags)
2287
  {
2288
    return new REMatchEnumeration (this, makeCharIndexed (input, index),
2289
                                   index, eflags);
2290
  }
2291
 
2292
 
2293
  /**
2294
   * Substitutes the replacement text for the first match found in the input.
2295
   *
2296
   * @param input The input text.
2297
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2298
   * @return A String interpolating the substituted text.
2299
   * @see REMatch#substituteInto
2300
   */
2301
  public String substitute (Object input, String replace)
2302
  {
2303
    return substitute (input, replace, 0, 0);
2304
  }
2305
 
2306
  /**
2307
   * Substitutes the replacement text for the first match found in the input
2308
   * beginning at the specified index position.  Specifying an index
2309
   * effectively causes the regular expression engine to throw away the
2310
   * specified number of characters.
2311
   *
2312
   * @param input The input text.
2313
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2314
   * @param index The offset index at which the search should be begin.
2315
   * @return A String containing the substring of the input, starting
2316
   *   at the index position, and interpolating the substituted text.
2317
   * @see REMatch#substituteInto
2318
   */
2319
  public String substitute (Object input, String replace, int index)
2320
  {
2321
    return substitute (input, replace, index, 0);
2322
  }
2323
 
2324
  /**
2325
   * Substitutes the replacement text for the first match found in the input
2326
   * string, beginning at the specified index position and using the
2327
   * specified execution flags.
2328
   *
2329
   * @param input The input text.
2330
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2331
   * @param index The offset index at which the search should be begin.
2332
   * @param eflags The logical OR of any execution flags above.
2333
   * @return A String containing the substring of the input, starting
2334
   *   at the index position, and interpolating the substituted text.
2335
   * @see REMatch#substituteInto
2336
   */
2337
  public String substitute (Object input, String replace, int index,
2338
                            int eflags)
2339
  {
2340
    return substituteImpl (makeCharIndexed (input, index), replace, index,
2341
                           eflags);
2342
  }
2343
 
2344
  private String substituteImpl (CharIndexed input, String replace, int index,
2345
                                 int eflags)
2346
  {
2347
    CPStringBuilder buffer = new CPStringBuilder ();
2348
    REMatch m = getMatchImpl (input, index, eflags, buffer);
2349
    if (m == null)
2350
      return buffer.toString ();
2351
    buffer.append (getReplacement (replace, m, eflags));
2352
    if (input.move (m.end[0]))
2353
      {
2354
        do
2355
          {
2356
            buffer.append (input.charAt (0));
2357
          }
2358
        while (input.move (1));
2359
      }
2360
    return buffer.toString ();
2361
  }
2362
 
2363
  /**
2364
   * Substitutes the replacement text for each non-overlapping match found
2365
   * in the input text.
2366
   *
2367
   * @param input The input text.
2368
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2369
   * @return A String interpolating the substituted text.
2370
   * @see REMatch#substituteInto
2371
   */
2372
  public String substituteAll (Object input, String replace)
2373
  {
2374
    return substituteAll (input, replace, 0, 0);
2375
  }
2376
 
2377
  /**
2378
   * Substitutes the replacement text for each non-overlapping match found
2379
   * in the input text, starting at the specified index.
2380
   *
2381
   * If the regular expression allows the empty string to match, it will
2382
   * substitute matches at all positions except the end of the input.
2383
   *
2384
   * @param input The input text.
2385
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2386
   * @param index The offset index at which the search should be begin.
2387
   * @return A String containing the substring of the input, starting
2388
   *   at the index position, and interpolating the substituted text.
2389
   * @see REMatch#substituteInto
2390
   */
2391
  public String substituteAll (Object input, String replace, int index)
2392
  {
2393
    return substituteAll (input, replace, index, 0);
2394
  }
2395
 
2396
  /**
2397
   * Substitutes the replacement text for each non-overlapping match found
2398
   * in the input text, starting at the specified index and using the
2399
   * specified execution flags.
2400
   *
2401
   * @param input The input text.
2402
   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2403
   * @param index The offset index at which the search should be begin.
2404
   * @param eflags The logical OR of any execution flags above.
2405
   * @return A String containing the substring of the input, starting
2406
   *   at the index position, and interpolating the substituted text.
2407
   * @see REMatch#substituteInto
2408
   */
2409
  public String substituteAll (Object input, String replace, int index,
2410
                               int eflags)
2411
  {
2412
    return substituteAllImpl (makeCharIndexed (input, index), replace, index,
2413
                              eflags);
2414
  }
2415
 
2416
  private String substituteAllImpl (CharIndexed input, String replace,
2417
                                    int index, int eflags)
2418
  {
2419
    CPStringBuilder buffer = new CPStringBuilder ();
2420
    REMatch m;
2421
    while ((m = getMatchImpl (input, index, eflags, buffer)) != null)
2422
      {
2423
        buffer.append (getReplacement (replace, m, eflags));
2424
        index = m.getEndIndex ();
2425
        if (m.end[0] == 0)
2426
          {
2427
            char ch = input.charAt (0);
2428
            if (ch != CharIndexed.OUT_OF_BOUNDS)
2429
              buffer.append (ch);
2430
            input.move (1);
2431
          }
2432
        else
2433
          {
2434
            input.move (m.end[0]);
2435
          }
2436
 
2437
        if (!input.isValid ())
2438
          break;
2439
      }
2440
    return buffer.toString ();
2441
  }
2442
 
2443
  public static String getReplacement (String replace, REMatch m, int eflags)
2444
  {
2445
    if ((eflags & REG_NO_INTERPOLATE) > 0)
2446
      return replace;
2447
    else
2448
      {
2449
        if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0)
2450
          {
2451
            CPStringBuilder sb = new CPStringBuilder ();
2452
            int l = replace.length ();
2453
            for (int i = 0; i < l; i++)
2454
              {
2455
                char c = replace.charAt (i);
2456
                switch (c)
2457
                  {
2458
                  case '\\':
2459
                    i++;
2460
                    // Let StringIndexOutOfBoundsException be thrown.
2461
                    sb.append (replace.charAt (i));
2462
                    break;
2463
                  case '$':
2464
                    int i1 = i + 1;
2465
                    while (i1 < replace.length () &&
2466
                           Character.isDigit (replace.charAt (i1)))
2467
                      i1++;
2468
                    sb.append (m.substituteInto (replace.substring (i, i1)));
2469
                    i = i1 - 1;
2470
                    break;
2471
                  default:
2472
                    sb.append (c);
2473
                  }
2474
              }
2475
            return sb.toString ();
2476
          }
2477
        else
2478
          return m.substituteInto (replace);
2479
      }
2480
  }
2481
 
2482
  /* Helper function for constructor */
2483
  private void addToken (REToken next)
2484
  {
2485
    if (next == null)
2486
      return;
2487
    minimumLength += next.getMinimumLength ();
2488
    int nmax = next.getMaximumLength ();
2489
    if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2490
      maximumLength += nmax;
2491
    else
2492
      maximumLength = Integer.MAX_VALUE;
2493
 
2494
    if (firstToken == null)
2495
      {
2496
        lastToken = firstToken = next;
2497
      }
2498
    else
2499
      {
2500
        // if chain returns false, it "rejected" the token due to
2501
        // an optimization, and next was combined with lastToken
2502
        if (lastToken.chain (next))
2503
          {
2504
            lastToken = next;
2505
          }
2506
      }
2507
  }
2508
 
2509
  private static REToken setRepeated (REToken current, int min, int max,
2510
                                      int index) throws REException
2511
  {
2512
    if (current == null)
2513
      throw new REException (getLocalizedMessage ("repeat.no.token"),
2514
                             REException.REG_BADRPT, index);
2515
      return new RETokenRepeated (current.subIndex, current, min, max);
2516
  }
2517
 
2518
  private static int getPosixSet (char[]pattern, int index,
2519
                                  CPStringBuilder buf)
2520
  {
2521
    // Precondition: pattern[index-1] == ':'
2522
    // we will return pos of closing ']'.
2523
    int i;
2524
    for (i = index; i < (pattern.length - 1); i++)
2525
      {
2526
        if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
2527
          return i + 2;
2528
        buf.append (pattern[i]);
2529
      }
2530
    return index;               // didn't match up
2531
  }
2532
 
2533
  private int getMinMax (char[]input, int index, IntPair minMax,
2534
                         RESyntax syntax) throws REException
2535
  {
2536
    // Precondition: input[index-1] == '{', minMax != null
2537
 
2538
    boolean mustMatch = !syntax.get (RESyntax.RE_NO_BK_BRACES);
2539
    int startIndex = index;
2540
    if (index == input.length)
2541
      {
2542
        if (mustMatch)
2543
          throw new REException (getLocalizedMessage ("unmatched.brace"),
2544
                                 REException.REG_EBRACE, index);
2545
        else
2546
          return startIndex;
2547
      }
2548
 
2549
    int min, max = 0;
2550
    CharUnit unit = new CharUnit ();
2551
    CPStringBuilder buf = new CPStringBuilder ();
2552
 
2553
    // Read string of digits
2554
    do
2555
      {
2556
        index = getCharUnit (input, index, unit, false);
2557
        if (Character.isDigit (unit.ch))
2558
          buf.append (unit.ch);
2559
      }
2560
    while ((index != input.length) && Character.isDigit (unit.ch));
2561
 
2562
    // Check for {} tomfoolery
2563
    if (buf.length () == 0)
2564
      {
2565
        if (mustMatch)
2566
          throw new REException (getLocalizedMessage ("interval.error"),
2567
                                 REException.REG_EBRACE, index);
2568
        else
2569
        return startIndex;
2570
      }
2571
 
2572
    min = Integer.parseInt (buf.toString ());
2573
 
2574
    if ((unit.ch == '}') && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2575
      max = min;
2576
    else if (index == input.length)
2577
      if (mustMatch)
2578
        throw new REException (getLocalizedMessage ("interval.no.end"),
2579
                               REException.REG_EBRACE, index);
2580
    else
2581
    return startIndex;
2582
    else
2583
  if ((unit.ch == ',') && !unit.bk)
2584
    {
2585
      buf = new CPStringBuilder ();
2586
      // Read string of digits
2587
      while (((index =
2588
               getCharUnit (input, index, unit, false)) != input.length)
2589
             && Character.isDigit (unit.ch))
2590
        buf.append (unit.ch);
2591
 
2592
      if (!
2593
          ((unit.ch == '}')
2594
           && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2595
        if (mustMatch)
2596
          throw new REException (getLocalizedMessage ("interval.error"),
2597
                                 REException.REG_EBRACE, index);
2598
      else
2599
      return startIndex;
2600
 
2601
      // This is the case of {x,}
2602
      if (buf.length () == 0)
2603
        max = Integer.MAX_VALUE;
2604
      else
2605
        max = Integer.parseInt (buf.toString ());
2606
    }
2607
  else if (mustMatch)
2608
    throw new REException (getLocalizedMessage ("interval.error"),
2609
                           REException.REG_EBRACE, index);
2610
  else
2611
  return startIndex;
2612
 
2613
  // We know min and max now, and they are valid.
2614
 
2615
  minMax.first = min;
2616
  minMax.second = max;
2617
 
2618
  // return the index following the '}'
2619
  return index;
2620
  }
2621
 
2622
   /**
2623
    * Return a human readable form of the compiled regular expression,
2624
    * useful for debugging.
2625
    */
2626
  public String toString ()
2627
  {
2628
    CPStringBuilder sb = new CPStringBuilder ();
2629
    dump (sb);
2630
    return sb.toString ();
2631
  }
2632
 
2633
  void dump (CPStringBuilder os)
2634
  {
2635
    os.append ("(?#startRE subIndex=" + subIndex + ")");
2636
    if (subIndex == 0)
2637
      os.append ("?:");
2638
    if (firstToken != null)
2639
      firstToken.dumpAll (os);
2640
    if (subIndex == 0)
2641
      os.append (")");
2642
    os.append ("(?#endRE subIndex=" + subIndex + ")");
2643
  }
2644
 
2645
  // Cast input appropriately or throw exception
2646
  // This method was originally a private method, but has been made
2647
  // public because java.util.regex.Matcher uses this.
2648
  public static CharIndexed makeCharIndexed (Object input, int index)
2649
  {
2650
    // The case where input is already a CharIndexed is supposed
2651
    // be the most likely because this is the case with
2652
    // java.util.regex.Matcher.
2653
    // We could let a String or a CharSequence fall through
2654
    // to final input, but since it'a very likely input type,
2655
    // we check it first.
2656
    if (input instanceof CharIndexed)
2657
      {
2658
        CharIndexed ci = (CharIndexed) input;
2659
        ci.setAnchor (index);
2660
        return ci;
2661
      }
2662
    else if (input instanceof CharSequence)
2663
      return new CharIndexedCharSequence ((CharSequence) input, index);
2664
    else if (input instanceof String)
2665
      return new CharIndexedString ((String) input, index);
2666
    else if (input instanceof char[])
2667
      return new CharIndexedCharArray ((char[]) input, index);
2668
    else if (input instanceof StringBuffer)
2669
      return new CharIndexedStringBuffer ((StringBuffer) input, index);
2670
    else if (input instanceof InputStream)
2671
      return new CharIndexedInputStream ((InputStream) input, index);
2672
    else
2673
      return new CharIndexedString (input.toString (), index);
2674
  }
2675
}

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.