OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [insight/] [tcl/] [generic/] [regexp.c] - Blame information for rev 1780

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 578 markom
/*
2
 * TclRegComp and TclRegExec -- TclRegSub is elsewhere
3
 *
4
 *      Copyright (c) 1986 by University of Toronto.
5
 *      Written by Henry Spencer.  Not derived from licensed software.
6
 *
7
 *      Permission is granted to anyone to use this software for any
8
 *      purpose on any computer system, and to redistribute it freely,
9
 *      subject to the following restrictions:
10
 *
11
 *      1. The author is not responsible for the consequences of use of
12
 *              this software, no matter how awful, even if they arise
13
 *              from defects in it.
14
 *
15
 *      2. The origin of this software must not be misrepresented, either
16
 *              by explicit claim or by omission.
17
 *
18
 *      3. Altered versions must be plainly marked as such, and must not
19
 *              be misrepresented as being the original software.
20
 *
21
 * Beware that some of this code is subtly aware of the way operator
22
 * precedence is structured in regular expressions.  Serious changes in
23
 * regular-expression syntax might require a total rethink.
24
 *
25
 * *** NOTE: this code has been altered slightly for use in Tcl: ***
26
 * *** 1. Use ckalloc and ckfree instead of  malloc and free.    ***
27
 * *** 2. Add extra argument to regexp to specify the real       ***
28
 * ***    start of the string separately from the start of the   ***
29
 * ***    current search. This is needed to search for multiple  ***
30
 * ***    matches within a string.                               ***
31
 * *** 3. Names have been changed, e.g. from regcomp to          ***
32
 * ***    TclRegComp, to avoid clashes with other                ***
33
 * ***    regexp implementations used by applications.           ***
34
 * *** 4. Added errMsg declaration and TclRegError procedure     ***
35
 * *** 5. Various lint-like things, such as casting arguments    ***
36
 * ***    in procedure calls.                                    ***
37
 *
38
 * *** NOTE: This code has been altered for use in MT-Sturdy Tcl ***
39
 * *** 1. All use of static variables has been changed to access ***
40
 * ***    fields of a structure.                                 ***
41
 * *** 2. This in addition to changes to TclRegError makes the   ***
42
 * ***    code multi-thread safe.                                ***
43
 *
44
 * RCS: @(#) $Id: regexp.c,v 1.1.1.1 2002-01-16 10:25:25 markom Exp $
45
 */
46
 
47
#include "tclInt.h"
48
#include "tclPort.h"
49
 
50
/*
51
 * The variable below is set to NULL before invoking regexp functions
52
 * and checked after those functions.  If an error occurred then TclRegError
53
 * will set the variable to point to a (static) error message.  This
54
 * mechanism unfortunately does not support multi-threading, but the
55
 * procedures TclRegError and TclGetRegError can be modified to use
56
 * thread-specific storage for the variable and thereby make the code
57
 * thread-safe.
58
 */
59
 
60
static char *errMsg = NULL;
61
 
62
/*
63
 * The "internal use only" fields in regexp.h are present to pass info from
64
 * compile to execute that permits the execute phase to run lots faster on
65
 * simple cases.  They are:
66
 *
67
 * regstart     char that must begin a match; '\0' if none obvious
68
 * reganch      is the match anchored (at beginning-of-line only)?
69
 * regmust      string (pointer into program) that match must include, or NULL
70
 * regmlen      length of regmust string
71
 *
72
 * Regstart and reganch permit very fast decisions on suitable starting points
73
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
74
 * of lines that cannot possibly match.  The regmust tests are costly enough
75
 * that TclRegComp() supplies a regmust only if the r.e. contains something
76
 * potentially expensive (at present, the only such thing detected is * or +
77
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
78
 * supplied because the test in TclRegExec() needs it and TclRegComp() is
79
 * computing it anyway.
80
 */
81
 
82
/*
83
 * Structure for regexp "program".  This is essentially a linear encoding
84
 * of a nondeterministic finite-state machine (aka syntax charts or
85
 * "railroad normal form" in parsing technology).  Each node is an opcode
86
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
87
 * all nodes except BRANCH implement concatenation; a "next" pointer with
88
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
89
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
90
 * opposed to a collection of them) is never concatenated with anything
91
 * because of operator precedence.)  The operand of some types of node is
92
 * a literal string; for others, it is a node leading into a sub-FSM.  In
93
 * particular, the operand of a BRANCH node is the first node of the branch.
94
 * (NB this is *not* a tree structure:  the tail of the branch connects
95
 * to the thing following the set of BRANCHes.)  The opcodes are:
96
 */
97
 
98
/* definition   number  opnd?   meaning */
99
#define END     0        /* no   End of program. */
100
#define BOL     1       /* no   Match "" at beginning of line. */
101
#define EOL     2       /* no   Match "" at end of line. */
102
#define ANY     3       /* no   Match any one character. */
103
#define ANYOF   4       /* str  Match any character in this string. */
104
#define ANYBUT  5       /* str  Match any character not in this string. */
105
#define BRANCH  6       /* node Match this alternative, or the next... */
106
#define BACK    7       /* no   Match "", "next" ptr points backward. */
107
#define EXACTLY 8       /* str  Match this string. */
108
#define NOTHING 9       /* no   Match empty string. */
109
#define STAR    10      /* node Match this (simple) thing 0 or more times. */
110
#define PLUS    11      /* node Match this (simple) thing 1 or more times. */
111
#define OPEN    20      /* no   Mark this point in input as start of #n. */
112
                        /*      OPEN+1 is number 1, etc. */
113
#define CLOSE   (OPEN+NSUBEXP)  /* no   Analogous to OPEN. */
114
 
115
/*
116
 * Opcode notes:
117
 *
118
 * BRANCH       The set of branches constituting a single choice are hooked
119
 *              together with their "next" pointers, since precedence prevents
120
 *              anything being concatenated to any individual branch.  The
121
 *              "next" pointer of the last BRANCH in a choice points to the
122
 *              thing following the whole choice.  This is also where the
123
 *              final "next" pointer of each individual branch points; each
124
 *              branch starts with the operand node of a BRANCH node.
125
 *
126
 * BACK         Normal "next" pointers all implicitly point forward; BACK
127
 *              exists to make loop structures possible.
128
 *
129
 * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
130
 *              BRANCH structures using BACK.  Simple cases (one character
131
 *              per match) are implemented with STAR and PLUS for speed
132
 *              and to minimize recursive plunges.
133
 *
134
 * OPEN,CLOSE   ...are numbered at compile time.
135
 */
136
 
137
/*
138
 * A node is one char of opcode followed by two chars of "next" pointer.
139
 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
140
 * value is a positive offset from the opcode of the node containing it.
141
 * An operand, if any, simply follows the node.  (Note that much of the
142
 * code generation knows about this implicit relationship.)
143
 *
144
 * Using two bytes for the "next" pointer is vast overkill for most things,
145
 * but allows patterns to get big without disasters.
146
 */
147
#define OP(p)   (*(p))
148
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
149
#define OPERAND(p)      ((p) + 3)
150
 
151
/*
152
 * See regmagic.h for one further detail of program structure.
153
 */
154
 
155
 
156
/*
157
 * Utility definitions.
158
 */
159
#ifndef CHARBITS
160
#define UCHARAT(p)      ((int)*(unsigned char *)(p))
161
#else
162
#define UCHARAT(p)      ((int)*(p)&CHARBITS)
163
#endif
164
 
165
#define FAIL(m) { TclRegError(m); return(NULL); }
166
#define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
167
#define META    "^$.[()|?+*\\"
168
 
169
/*
170
 * Flags to be passed up and down.
171
 */
172
#define HASWIDTH        01      /* Known never to match null string. */
173
#define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
174
#define SPSTART         04      /* Starts with * or +. */
175
#define WORST           0        /* Worst case. */
176
 
177
/*
178
 * Global work variables for TclRegComp().
179
 */
180
struct regcomp_state  {
181
    char *regparse;             /* Input-scan pointer. */
182
    int regnpar;                /* () count. */
183
    char *regcode;              /* Code-emit pointer; &regdummy = don't. */
184
    long regsize;               /* Code size. */
185
};
186
 
187
static char regdummy;
188
 
189
/*
190
 * The first byte of the regexp internal "program" is actually this magic
191
 * number; the start node begins in the second byte.
192
 */
193
#define MAGIC   0234
194
 
195
 
196
/*
197
 * Forward declarations for TclRegComp()'s friends.
198
 */
199
 
200
static char *           reg _ANSI_ARGS_((int paren, int *flagp,
201
                            struct regcomp_state *rcstate));
202
static char *           regatom _ANSI_ARGS_((int *flagp,
203
                            struct regcomp_state *rcstate));
204
static char *           regbranch _ANSI_ARGS_((int *flagp,
205
                            struct regcomp_state *rcstate));
206
static void             regc _ANSI_ARGS_((int b,
207
                            struct regcomp_state *rcstate));
208
static void             reginsert _ANSI_ARGS_((int op, char *opnd,
209
                            struct regcomp_state *rcstate));
210
static char *           regnext _ANSI_ARGS_((char *p));
211
static char *           regnode _ANSI_ARGS_((int op,
212
                            struct regcomp_state *rcstate));
213
static void             regoptail _ANSI_ARGS_((char *p, char *val));
214
static char *           regpiece _ANSI_ARGS_((int *flagp,
215
                            struct regcomp_state *rcstate));
216
static void             regtail _ANSI_ARGS_((char *p, char *val));
217
 
218
#ifdef STRCSPN
219
static int strcspn _ANSI_ARGS_((char *s1, char *s2));
220
#endif
221
 
222
/*
223
 - TclRegComp - compile a regular expression into internal code
224
 *
225
 * We can't allocate space until we know how big the compiled form will be,
226
 * but we can't compile it (and thus know how big it is) until we've got a
227
 * place to put the code.  So we cheat:  we compile it twice, once with code
228
 * generation turned off and size counting turned on, and once "for real".
229
 * This also means that we don't allocate space until we are sure that the
230
 * thing really will compile successfully, and we never have to move the
231
 * code and thus invalidate pointers into it.  (Note that it has to be in
232
 * one piece because free() must be able to free it all.)
233
 *
234
 * Beware that the optimization-preparation code in here knows about some
235
 * of the structure of the compiled regexp.
236
 */
237
regexp *
238
TclRegComp(exp)
239
char *exp;
240
{
241
        register regexp *r;
242
        register char *scan;
243
        register char *longest;
244
        register int len;
245
        int flags;
246
        struct regcomp_state state;
247
        struct regcomp_state *rcstate= &state;
248
 
249
        if (exp == NULL)
250
                FAIL("NULL argument");
251
 
252
        /* First pass: determine size, legality. */
253
        rcstate->regparse = exp;
254
        rcstate->regnpar = 1;
255
        rcstate->regsize = 0L;
256
        rcstate->regcode = &regdummy;
257
        regc(MAGIC, rcstate);
258
        if (reg(0, &flags, rcstate) == NULL)
259
                return(NULL);
260
 
261
        /* Small enough for pointer-storage convention? */
262
        if (rcstate->regsize >= 32767L)         /* Probably could be 65535L. */
263
                FAIL("regexp too big");
264
 
265
        /* Allocate space. */
266
        r = (regexp *)ckalloc(sizeof(regexp) + (unsigned)rcstate->regsize);
267
        if (r == NULL)
268
                FAIL("out of space");
269
 
270
        /* Second pass: emit code. */
271
        rcstate->regparse = exp;
272
        rcstate->regnpar = 1;
273
        rcstate->regcode = r->program;
274
        regc(MAGIC, rcstate);
275
        if (reg(0, &flags, rcstate) == NULL)
276
                return(NULL);
277
 
278
        /* Dig out information for optimizations. */
279
        r->regstart = '\0';     /* Worst-case defaults. */
280
        r->reganch = 0;
281
        r->regmust = NULL;
282
        r->regmlen = 0;
283
        scan = r->program+1;                    /* First BRANCH. */
284
        if (OP(regnext(scan)) == END) {         /* Only one top-level choice. */
285
                scan = OPERAND(scan);
286
 
287
                /* Starting-point info. */
288
                if (OP(scan) == EXACTLY)
289
                        r->regstart = *OPERAND(scan);
290
                else if (OP(scan) == BOL)
291
                        r->reganch++;
292
 
293
                /*
294
                 * If there's something expensive in the r.e., find the
295
                 * longest literal string that must appear and make it the
296
                 * regmust.  Resolve ties in favor of later strings, since
297
                 * the regstart check works with the beginning of the r.e.
298
                 * and avoiding duplication strengthens checking.  Not a
299
                 * strong reason, but sufficient in the absence of others.
300
                 */
301
                if (flags&SPSTART) {
302
                        longest = NULL;
303
                        len = 0;
304
                        for (; scan != NULL; scan = regnext(scan))
305
                                if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
306
                                        longest = OPERAND(scan);
307
                                        len = strlen(OPERAND(scan));
308
                                }
309
                        r->regmust = longest;
310
                        r->regmlen = len;
311
                }
312
        }
313
 
314
        return(r);
315
}
316
 
317
/*
318
 - reg - regular expression, i.e. main body or parenthesized thing
319
 *
320
 * Caller must absorb opening parenthesis.
321
 *
322
 * Combining parenthesis handling with the base level of regular expression
323
 * is a trifle forced, but the need to tie the tails of the branches to what
324
 * follows makes it hard to avoid.
325
 */
326
static char *
327
reg(paren, flagp, rcstate)
328
int paren;                      /* Parenthesized? */
329
int *flagp;
330
struct regcomp_state *rcstate;
331
{
332
        register char *ret;
333
        register char *br;
334
        register char *ender;
335
        register int parno = 0;
336
        int flags;
337
 
338
        *flagp = HASWIDTH;      /* Tentatively. */
339
 
340
        /* Make an OPEN node, if parenthesized. */
341
        if (paren) {
342
                if (rcstate->regnpar >= NSUBEXP)
343
                        FAIL("too many ()");
344
                parno = rcstate->regnpar;
345
                rcstate->regnpar++;
346
                ret = regnode(OPEN+parno,rcstate);
347
        } else
348
                ret = NULL;
349
 
350
        /* Pick up the branches, linking them together. */
351
        br = regbranch(&flags,rcstate);
352
        if (br == NULL)
353
                return(NULL);
354
        if (ret != NULL)
355
                regtail(ret, br);       /* OPEN -> first. */
356
        else
357
                ret = br;
358
        if (!(flags&HASWIDTH))
359
                *flagp &= ~HASWIDTH;
360
        *flagp |= flags&SPSTART;
361
        while (*rcstate->regparse == '|') {
362
                rcstate->regparse++;
363
                br = regbranch(&flags,rcstate);
364
                if (br == NULL)
365
                        return(NULL);
366
                regtail(ret, br);       /* BRANCH -> BRANCH. */
367
                if (!(flags&HASWIDTH))
368
                        *flagp &= ~HASWIDTH;
369
                *flagp |= flags&SPSTART;
370
        }
371
 
372
        /* Make a closing node, and hook it on the end. */
373
        ender = regnode((paren) ? CLOSE+parno : END,rcstate);
374
        regtail(ret, ender);
375
 
376
        /* Hook the tails of the branches to the closing node. */
377
        for (br = ret; br != NULL; br = regnext(br))
378
                regoptail(br, ender);
379
 
380
        /* Check for proper termination. */
381
        if (paren && *rcstate->regparse++ != ')') {
382
                FAIL("unmatched ()");
383
        } else if (!paren && *rcstate->regparse != '\0') {
384
                if (*rcstate->regparse == ')') {
385
                        FAIL("unmatched ()");
386
                } else
387
                        FAIL("junk on end");    /* "Can't happen". */
388
                /* NOTREACHED */
389
        }
390
 
391
        return(ret);
392
}
393
 
394
/*
395
 - regbranch - one alternative of an | operator
396
 *
397
 * Implements the concatenation operator.
398
 */
399
static char *
400
regbranch(flagp, rcstate)
401
int *flagp;
402
struct regcomp_state *rcstate;
403
{
404
        register char *ret;
405
        register char *chain;
406
        register char *latest;
407
        int flags;
408
 
409
        *flagp = WORST;         /* Tentatively. */
410
 
411
        ret = regnode(BRANCH,rcstate);
412
        chain = NULL;
413
        while (*rcstate->regparse != '\0' && *rcstate->regparse != '|' &&
414
                                *rcstate->regparse != ')') {
415
                latest = regpiece(&flags, rcstate);
416
                if (latest == NULL)
417
                        return(NULL);
418
                *flagp |= flags&HASWIDTH;
419
                if (chain == NULL)      /* First piece. */
420
                        *flagp |= flags&SPSTART;
421
                else
422
                        regtail(chain, latest);
423
                chain = latest;
424
        }
425
        if (chain == NULL)      /* Loop ran zero times. */
426
                (void) regnode(NOTHING,rcstate);
427
 
428
        return(ret);
429
}
430
 
431
/*
432
 - regpiece - something followed by possible [*+?]
433
 *
434
 * Note that the branching code sequences used for ? and the general cases
435
 * of * and + are somewhat optimized:  they use the same NOTHING node as
436
 * both the endmarker for their branch list and the body of the last branch.
437
 * It might seem that this node could be dispensed with entirely, but the
438
 * endmarker role is not redundant.
439
 */
440
static char *
441
regpiece(flagp, rcstate)
442
int *flagp;
443
struct regcomp_state *rcstate;
444
{
445
        register char *ret;
446
        register char op;
447
        register char *next;
448
        int flags;
449
 
450
        ret = regatom(&flags,rcstate);
451
        if (ret == NULL)
452
                return(NULL);
453
 
454
        op = *rcstate->regparse;
455
        if (!ISMULT(op)) {
456
                *flagp = flags;
457
                return(ret);
458
        }
459
 
460
        if (!(flags&HASWIDTH) && op != '?')
461
                FAIL("*+ operand could be empty");
462
        *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
463
 
464
        if (op == '*' && (flags&SIMPLE))
465
                reginsert(STAR, ret, rcstate);
466
        else if (op == '*') {
467
                /* Emit x* as (x&|), where & means "self". */
468
                reginsert(BRANCH, ret, rcstate);                        /* Either x */
469
                regoptail(ret, regnode(BACK,rcstate));          /* and loop */
470
                regoptail(ret, ret);                    /* back */
471
                regtail(ret, regnode(BRANCH,rcstate));          /* or */
472
                regtail(ret, regnode(NOTHING,rcstate));         /* null. */
473
        } else if (op == '+' && (flags&SIMPLE))
474
                reginsert(PLUS, ret, rcstate);
475
        else if (op == '+') {
476
                /* Emit x+ as x(&|), where & means "self". */
477
                next = regnode(BRANCH,rcstate);                 /* Either */
478
                regtail(ret, next);
479
                regtail(regnode(BACK,rcstate), ret);            /* loop back */
480
                regtail(next, regnode(BRANCH,rcstate));         /* or */
481
                regtail(ret, regnode(NOTHING,rcstate));         /* null. */
482
        } else if (op == '?') {
483
                /* Emit x? as (x|) */
484
                reginsert(BRANCH, ret, rcstate);                        /* Either x */
485
                regtail(ret, regnode(BRANCH,rcstate));          /* or */
486
                next = regnode(NOTHING,rcstate);                /* null. */
487
                regtail(ret, next);
488
                regoptail(ret, next);
489
        }
490
        rcstate->regparse++;
491
        if (ISMULT(*rcstate->regparse))
492
                FAIL("nested *?+");
493
 
494
        return(ret);
495
}
496
 
497
/*
498
 - regatom - the lowest level
499
 *
500
 * Optimization:  gobbles an entire sequence of ordinary characters so that
501
 * it can turn them into a single node, which is smaller to store and
502
 * faster to run.  Backslashed characters are exceptions, each becoming a
503
 * separate node; the code is simpler that way and it's not worth fixing.
504
 */
505
static char *
506
regatom(flagp, rcstate)
507
int *flagp;
508
struct regcomp_state *rcstate;
509
{
510
        register char *ret;
511
        int flags;
512
 
513
        *flagp = WORST;         /* Tentatively. */
514
 
515
        switch (*rcstate->regparse++) {
516
        case '^':
517
                ret = regnode(BOL,rcstate);
518
                break;
519
        case '$':
520
                ret = regnode(EOL,rcstate);
521
                break;
522
        case '.':
523
                ret = regnode(ANY,rcstate);
524
                *flagp |= HASWIDTH|SIMPLE;
525
                break;
526
        case '[': {
527
                        register int clss;
528
                        register int classend;
529
 
530
                        if(rcstate->regparse[0] != '\\' &&
531
                           rcstate->regparse[0] != '^' &&
532
                           rcstate->regparse[1] == ']') {
533
                                ret = regnode(EXACTLY,rcstate);
534
                                regc(*rcstate->regparse++,rcstate);
535
                                regc('\0',rcstate);
536
                                rcstate->regparse++;
537
                                *flagp |= HASWIDTH|SIMPLE;
538
                                break;
539
                        }
540
                        if (*rcstate->regparse == '^') {        /* Complement of range. */
541
                                ret = regnode(ANYBUT,rcstate);
542
                                rcstate->regparse++;
543
                        } else
544
                                ret = regnode(ANYOF,rcstate);
545
                        if (*rcstate->regparse == ']' || *rcstate->regparse == '-')
546
                                regc(*rcstate->regparse++,rcstate);
547
                        while (*rcstate->regparse != '\0' && *rcstate->regparse != ']') {
548
                                if (*rcstate->regparse == '-') {
549
                                        rcstate->regparse++;
550
                                        if (*rcstate->regparse == ']' || *rcstate->regparse == '\0')
551
                                                regc('-',rcstate);
552
                                        else {
553
                                                clss = UCHARAT(rcstate->regparse-2)+1;
554
                                                classend = UCHARAT(rcstate->regparse);
555
                                                if (clss > classend+1)
556
                                                        FAIL("invalid [] range");
557
                                                for (; clss <= classend; clss++)
558
                                                        regc((char)clss,rcstate);
559
                                                rcstate->regparse++;
560
                                        }
561
                                } else
562
                                        regc(*rcstate->regparse++,rcstate);
563
                        }
564
                        regc('\0',rcstate);
565
                        if (*rcstate->regparse != ']')
566
                                FAIL("unmatched []");
567
                        rcstate->regparse++;
568
                        *flagp |= HASWIDTH|SIMPLE;
569
                }
570
                break;
571
        case '(':
572
                ret = reg(1, &flags, rcstate);
573
                if (ret == NULL)
574
                        return(NULL);
575
                *flagp |= flags&(HASWIDTH|SPSTART);
576
                break;
577
        case '\0':
578
        case '|':
579
        case ')':
580
                FAIL("internal urp");   /* Supposed to be caught earlier. */
581
                /* NOTREACHED */
582
        case '?':
583
        case '+':
584
        case '*':
585
                FAIL("?+* follows nothing");
586
                /* NOTREACHED */
587
        case '\\':
588
                if (*rcstate->regparse == '\0')
589
                        FAIL("trailing \\");
590
                ret = regnode(EXACTLY,rcstate);
591
                regc(*rcstate->regparse++,rcstate);
592
                regc('\0',rcstate);
593
                *flagp |= HASWIDTH|SIMPLE;
594
                break;
595
        default: {
596
                        register int len;
597
                        register char ender;
598
 
599
                        rcstate->regparse--;
600
                        len = strcspn(rcstate->regparse, META);
601
                        if (len <= 0)
602
                                FAIL("internal disaster");
603
                        ender = *(rcstate->regparse+len);
604
                        if (len > 1 && ISMULT(ender))
605
                                len--;          /* Back off clear of ?+* operand. */
606
                        *flagp |= HASWIDTH;
607
                        if (len == 1)
608
                                *flagp |= SIMPLE;
609
                        ret = regnode(EXACTLY,rcstate);
610
                        while (len > 0) {
611
                                regc(*rcstate->regparse++,rcstate);
612
                                len--;
613
                        }
614
                        regc('\0',rcstate);
615
                }
616
                break;
617
        }
618
 
619
        return(ret);
620
}
621
 
622
/*
623
 - regnode - emit a node
624
 */
625
static char *                   /* Location. */
626
regnode(op, rcstate)
627
int op;
628
struct regcomp_state *rcstate;
629
{
630
        register char *ret;
631
        register char *ptr;
632
 
633
        ret = rcstate->regcode;
634
        if (ret == &regdummy) {
635
                rcstate->regsize += 3;
636
                return(ret);
637
        }
638
 
639
        ptr = ret;
640
        *ptr++ = (char)op;
641
        *ptr++ = '\0';          /* Null "next" pointer. */
642
        *ptr++ = '\0';
643
        rcstate->regcode = ptr;
644
 
645
        return(ret);
646
}
647
 
648
/*
649
 - regc - emit (if appropriate) a byte of code
650
 */
651
static void
652
regc(b, rcstate)
653
int b;
654
struct regcomp_state *rcstate;
655
{
656
        if (rcstate->regcode != &regdummy)
657
                *rcstate->regcode++ = (char)b;
658
        else
659
                rcstate->regsize++;
660
}
661
 
662
/*
663
 - reginsert - insert an operator in front of already-emitted operand
664
 *
665
 * Means relocating the operand.
666
 */
667
static void
668
reginsert(op, opnd, rcstate)
669
int op;
670
char *opnd;
671
struct regcomp_state *rcstate;
672
{
673
        register char *src;
674
        register char *dst;
675
        register char *place;
676
 
677
        if (rcstate->regcode == &regdummy) {
678
                rcstate->regsize += 3;
679
                return;
680
        }
681
 
682
        src = rcstate->regcode;
683
        rcstate->regcode += 3;
684
        dst = rcstate->regcode;
685
        while (src > opnd)
686
                *--dst = *--src;
687
 
688
        place = opnd;           /* Op node, where operand used to be. */
689
        *place++ = (char)op;
690
        *place++ = '\0';
691
        *place = '\0';
692
}
693
 
694
/*
695
 - regtail - set the next-pointer at the end of a node chain
696
 */
697
static void
698
regtail(p, val)
699
char *p;
700
char *val;
701
{
702
        register char *scan;
703
        register char *temp;
704
        register int offset;
705
 
706
        if (p == &regdummy)
707
                return;
708
 
709
        /* Find last node. */
710
        scan = p;
711
        for (;;) {
712
                temp = regnext(scan);
713
                if (temp == NULL)
714
                        break;
715
                scan = temp;
716
        }
717
 
718
        if (OP(scan) == BACK)
719
                offset = scan - val;
720
        else
721
                offset = val - scan;
722
        *(scan+1) = (char)((offset>>8)&0377);
723
        *(scan+2) = (char)(offset&0377);
724
}
725
 
726
/*
727
 - regoptail - regtail on operand of first argument; nop if operandless
728
 */
729
static void
730
regoptail(p, val)
731
char *p;
732
char *val;
733
{
734
        /* "Operandless" and "op != BRANCH" are synonymous in practice. */
735
        if (p == NULL || p == &regdummy || OP(p) != BRANCH)
736
                return;
737
        regtail(OPERAND(p), val);
738
}
739
 
740
/*
741
 * TclRegExec and friends
742
 */
743
 
744
/*
745
 * Global work variables for TclRegExec().
746
 */
747
struct regexec_state  {
748
    char *reginput;             /* String-input pointer. */
749
    char *regbol;               /* Beginning of input, for ^ check. */
750
    char **regstartp;   /* Pointer to startp array. */
751
    char **regendp;             /* Ditto for endp. */
752
};
753
 
754
/*
755
 * Forwards.
756
 */
757
static int              regtry _ANSI_ARGS_((regexp *prog, char *string,
758
                            struct regexec_state *restate));
759
static int              regmatch _ANSI_ARGS_((char *prog,
760
                            struct regexec_state *restate));
761
static int              regrepeat _ANSI_ARGS_((char *p,
762
                            struct regexec_state *restate));
763
 
764
#ifdef DEBUG
765
int regnarrate = 0;
766
void regdump _ANSI_ARGS_((regexp *r));
767
static char *regprop _ANSI_ARGS_((char *op));
768
#endif
769
 
770
/*
771
 - TclRegExec - match a regexp against a string
772
 */
773
int
774
TclRegExec(prog, string, start)
775
register regexp *prog;
776
register char *string;
777
char *start;
778
{
779
        register char *s;
780
        struct regexec_state state;
781
        struct regexec_state *restate= &state;
782
 
783
        /* Be paranoid... */
784
        if (prog == NULL || string == NULL) {
785
                TclRegError("NULL parameter");
786
                return(0);
787
        }
788
 
789
        /* Check validity of program. */
790
        if (UCHARAT(prog->program) != MAGIC) {
791
                TclRegError("corrupted program");
792
                return(0);
793
        }
794
 
795
        /* If there is a "must appear" string, look for it. */
796
        if (prog->regmust != NULL) {
797
                s = string;
798
                while ((s = strchr(s, prog->regmust[0])) != NULL) {
799
                        if (strncmp(s, prog->regmust, (size_t) prog->regmlen)
800
                            == 0)
801
                                break;  /* Found it. */
802
                        s++;
803
                }
804
                if (s == NULL)  /* Not present. */
805
                        return(0);
806
        }
807
 
808
        /* Mark beginning of line for ^ . */
809
        restate->regbol = start;
810
 
811
        /* Simplest case:  anchored match need be tried only once. */
812
        if (prog->reganch)
813
                return(regtry(prog, string, restate));
814
 
815
        /* Messy cases:  unanchored match. */
816
        s = string;
817
        if (prog->regstart != '\0')
818
                /* We know what char it must start with. */
819
                while ((s = strchr(s, prog->regstart)) != NULL) {
820
                        if (regtry(prog, s, restate))
821
                                return(1);
822
                        s++;
823
                }
824
        else
825
                /* We don't -- general case. */
826
                do {
827
                        if (regtry(prog, s, restate))
828
                                return(1);
829
                } while (*s++ != '\0');
830
 
831
        /* Failure. */
832
        return(0);
833
}
834
 
835
/*
836
 - regtry - try match at specific point
837
 */
838
static int                      /* 0 failure, 1 success */
839
regtry(prog, string, restate)
840
regexp *prog;
841
char *string;
842
struct regexec_state *restate;
843
{
844
        register int i;
845
        register char **sp;
846
        register char **ep;
847
 
848
        restate->reginput = string;
849
        restate->regstartp = prog->startp;
850
        restate->regendp = prog->endp;
851
 
852
        sp = prog->startp;
853
        ep = prog->endp;
854
        for (i = NSUBEXP; i > 0; i--) {
855
                *sp++ = NULL;
856
                *ep++ = NULL;
857
        }
858
        if (regmatch(prog->program + 1,restate)) {
859
                prog->startp[0] = string;
860
                prog->endp[0] = restate->reginput;
861
                return(1);
862
        } else
863
                return(0);
864
}
865
 
866
/*
867
 - regmatch - main matching routine
868
 *
869
 * Conceptually the strategy is simple:  check to see whether the current
870
 * node matches, call self recursively to see whether the rest matches,
871
 * and then act accordingly.  In practice we make some effort to avoid
872
 * recursion, in particular by going through "ordinary" nodes (that don't
873
 * need to know whether the rest of the match failed) by a loop instead of
874
 * by recursion.
875
 */
876
static int                      /* 0 failure, 1 success */
877
regmatch(prog, restate)
878
char *prog;
879
struct regexec_state *restate;
880
{
881
    register char *scan;        /* Current node. */
882
    char *next;         /* Next node. */
883
 
884
    scan = prog;
885
#ifdef DEBUG
886
    if (scan != NULL && regnarrate)
887
        fprintf(stderr, "%s(\n", regprop(scan));
888
#endif
889
    while (scan != NULL) {
890
#ifdef DEBUG
891
        if (regnarrate)
892
            fprintf(stderr, "%s...\n", regprop(scan));
893
#endif
894
        next = regnext(scan);
895
 
896
        switch (OP(scan)) {
897
            case BOL:
898
                if (restate->reginput != restate->regbol) {
899
                    return 0;
900
                }
901
                break;
902
            case EOL:
903
                if (*restate->reginput != '\0') {
904
                    return 0;
905
                }
906
                break;
907
            case ANY:
908
                if (*restate->reginput == '\0') {
909
                    return 0;
910
                }
911
                restate->reginput++;
912
                break;
913
            case EXACTLY: {
914
                register int len;
915
                register char *opnd;
916
 
917
                opnd = OPERAND(scan);
918
                /* Inline the first character, for speed. */
919
                if (*opnd != *restate->reginput) {
920
                    return 0 ;
921
                }
922
                len = strlen(opnd);
923
                if (len > 1 && strncmp(opnd, restate->reginput, (size_t) len)
924
                        != 0) {
925
                    return 0;
926
                }
927
                restate->reginput += len;
928
                break;
929
            }
930
            case ANYOF:
931
                if (*restate->reginput == '\0'
932
                        || strchr(OPERAND(scan), *restate->reginput) == NULL) {
933
                    return 0;
934
                }
935
                restate->reginput++;
936
                break;
937
            case ANYBUT:
938
                if (*restate->reginput == '\0'
939
                        || strchr(OPERAND(scan), *restate->reginput) != NULL) {
940
                    return 0;
941
                }
942
                restate->reginput++;
943
                break;
944
            case NOTHING:
945
                break;
946
            case BACK:
947
                break;
948
            case OPEN+1:
949
            case OPEN+2:
950
            case OPEN+3:
951
            case OPEN+4:
952
            case OPEN+5:
953
            case OPEN+6:
954
            case OPEN+7:
955
            case OPEN+8:
956
            case OPEN+9: {
957
                register int no;
958
                register char *save;
959
 
960
        doOpen:
961
                no = OP(scan) - OPEN;
962
                save = restate->reginput;
963
 
964
                if (regmatch(next,restate)) {
965
                    /*
966
                     * Don't set startp if some later invocation of the
967
                     * same parentheses already has.
968
                     */
969
                    if (restate->regstartp[no] == NULL) {
970
                        restate->regstartp[no] = save;
971
                    }
972
                    return 1;
973
                } else {
974
                    return 0;
975
                }
976
            }
977
            case CLOSE+1:
978
            case CLOSE+2:
979
            case CLOSE+3:
980
            case CLOSE+4:
981
            case CLOSE+5:
982
            case CLOSE+6:
983
            case CLOSE+7:
984
            case CLOSE+8:
985
            case CLOSE+9: {
986
                register int no;
987
                register char *save;
988
 
989
        doClose:
990
                no = OP(scan) - CLOSE;
991
                save = restate->reginput;
992
 
993
                if (regmatch(next,restate)) {
994
                                /*
995
                                 * Don't set endp if some later
996
                                 * invocation of the same parentheses
997
                                 * already has.
998
                                 */
999
                    if (restate->regendp[no] == NULL)
1000
                        restate->regendp[no] = save;
1001
                    return 1;
1002
                } else {
1003
                    return 0;
1004
                }
1005
            }
1006
            case BRANCH: {
1007
                register char *save;
1008
 
1009
                if (OP(next) != BRANCH) { /* No choice. */
1010
                    next = OPERAND(scan); /* Avoid recursion. */
1011
                } else {
1012
                    do {
1013
                        save = restate->reginput;
1014
                        if (regmatch(OPERAND(scan),restate))
1015
                            return(1);
1016
                        restate->reginput = save;
1017
                        scan = regnext(scan);
1018
                    } while (scan != NULL && OP(scan) == BRANCH);
1019
                    return 0;
1020
                }
1021
                break;
1022
            }
1023
            case STAR:
1024
            case PLUS: {
1025
                register char nextch;
1026
                register int no;
1027
                register char *save;
1028
                register int min;
1029
 
1030
                /*
1031
                 * Lookahead to avoid useless match attempts
1032
                 * when we know what character comes next.
1033
                 */
1034
                nextch = '\0';
1035
                if (OP(next) == EXACTLY)
1036
                    nextch = *OPERAND(next);
1037
                min = (OP(scan) == STAR) ? 0 : 1;
1038
                save = restate->reginput;
1039
                no = regrepeat(OPERAND(scan),restate);
1040
                while (no >= min) {
1041
                    /* If it could work, try it. */
1042
                    if (nextch == '\0' || *restate->reginput == nextch)
1043
                        if (regmatch(next,restate))
1044
                            return(1);
1045
                    if (nextch != '\0' && no > (min + 1)) {
1046
                        char tmp = *(save + no);
1047
                        char *p;
1048
                        *(save + no) = 0;
1049
                        p = strrchr(save, nextch);
1050
                        *(save + no) = tmp;
1051
                        if (p != NULL)
1052
                            no = p - save + 1;
1053
                        else
1054
                            no = 0;
1055
                    }
1056
 
1057
                    /* Couldn't or didn't -- back up. */
1058
                    no--;
1059
                    restate->reginput = save + no;
1060
                }
1061
                return(0);
1062
            }
1063
            case END:
1064
                return(1);      /* Success! */
1065
            default:
1066
                if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) {
1067
                    goto doOpen;
1068
                } else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) {
1069
                    goto doClose;
1070
                }
1071
                TclRegError("memory corruption");
1072
                return 0;
1073
        }
1074
 
1075
        scan = next;
1076
    }
1077
 
1078
    /*
1079
     * We get here only if there's trouble -- normally "case END" is
1080
     * the terminating point.
1081
     */
1082
    TclRegError("corrupted pointers");
1083
    return(0);
1084
}
1085
 
1086
/*
1087
 - regrepeat - repeatedly match something simple, report how many
1088
 */
1089
static int
1090
regrepeat(p, restate)
1091
char *p;
1092
struct regexec_state *restate;
1093
{
1094
        register int count = 0;
1095
        register char *scan;
1096
        register char *opnd;
1097
 
1098
        scan = restate->reginput;
1099
        opnd = OPERAND(p);
1100
        switch (OP(p)) {
1101
        case ANY:
1102
                count = strlen(scan);
1103
                scan += count;
1104
                break;
1105
        case EXACTLY:
1106
                while (*opnd == *scan) {
1107
                        count++;
1108
                        scan++;
1109
                }
1110
                break;
1111
        case ANYOF:
1112
                while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1113
                        count++;
1114
                        scan++;
1115
                }
1116
                break;
1117
        case ANYBUT:
1118
                while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1119
                        count++;
1120
                        scan++;
1121
                }
1122
                break;
1123
        default:                /* Oh dear.  Called inappropriately. */
1124
                TclRegError("internal foulup");
1125
                count = 0;       /* Best compromise. */
1126
                break;
1127
        }
1128
        restate->reginput = scan;
1129
 
1130
        return(count);
1131
}
1132
 
1133
/*
1134
 - regnext - dig the "next" pointer out of a node
1135
 */
1136
static char *
1137
regnext(p)
1138
register char *p;
1139
{
1140
        register int offset;
1141
 
1142
        if (p == &regdummy)
1143
                return(NULL);
1144
 
1145
        offset = NEXT(p);
1146
        if (offset == 0)
1147
                return(NULL);
1148
 
1149
        if (OP(p) == BACK)
1150
                return(p-offset);
1151
        else
1152
                return(p+offset);
1153
}
1154
 
1155
#ifdef DEBUG
1156
 
1157
static char *regprop();
1158
 
1159
/*
1160
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1161
 */
1162
void
1163
regdump(r)
1164
regexp *r;
1165
{
1166
        register char *s;
1167
        register char op = EXACTLY;     /* Arbitrary non-END op. */
1168
        register char *next;
1169
 
1170
 
1171
        s = r->program + 1;
1172
        while (op != END) {     /* While that wasn't END last time... */
1173
                op = OP(s);
1174
                printf("%2d%s", s-r->program, regprop(s));      /* Where, what. */
1175
                next = regnext(s);
1176
                if (next == NULL)               /* Next ptr. */
1177
                        printf("(0)");
1178
                else
1179
                        printf("(%d)", (s-r->program)+(next-s));
1180
                s += 3;
1181
                if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1182
                        /* Literal string, where present. */
1183
                        while (*s != '\0') {
1184
                                putchar(*s);
1185
                                s++;
1186
                        }
1187
                        s++;
1188
                }
1189
                putchar('\n');
1190
        }
1191
 
1192
        /* Header fields of interest. */
1193
        if (r->regstart != '\0')
1194
                printf("start `%c' ", r->regstart);
1195
        if (r->reganch)
1196
                printf("anchored ");
1197
        if (r->regmust != NULL)
1198
                printf("must have \"%s\"", r->regmust);
1199
        printf("\n");
1200
}
1201
 
1202
/*
1203
 - regprop - printable representation of opcode
1204
 */
1205
static char *
1206
regprop(op)
1207
char *op;
1208
{
1209
        register char *p;
1210
        static char buf[50];
1211
 
1212
        (void) strcpy(buf, ":");
1213
 
1214
        switch (OP(op)) {
1215
        case BOL:
1216
                p = "BOL";
1217
                break;
1218
        case EOL:
1219
                p = "EOL";
1220
                break;
1221
        case ANY:
1222
                p = "ANY";
1223
                break;
1224
        case ANYOF:
1225
                p = "ANYOF";
1226
                break;
1227
        case ANYBUT:
1228
                p = "ANYBUT";
1229
                break;
1230
        case BRANCH:
1231
                p = "BRANCH";
1232
                break;
1233
        case EXACTLY:
1234
                p = "EXACTLY";
1235
                break;
1236
        case NOTHING:
1237
                p = "NOTHING";
1238
                break;
1239
        case BACK:
1240
                p = "BACK";
1241
                break;
1242
        case END:
1243
                p = "END";
1244
                break;
1245
        case OPEN+1:
1246
        case OPEN+2:
1247
        case OPEN+3:
1248
        case OPEN+4:
1249
        case OPEN+5:
1250
        case OPEN+6:
1251
        case OPEN+7:
1252
        case OPEN+8:
1253
        case OPEN+9:
1254
                sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1255
                p = NULL;
1256
                break;
1257
        case CLOSE+1:
1258
        case CLOSE+2:
1259
        case CLOSE+3:
1260
        case CLOSE+4:
1261
        case CLOSE+5:
1262
        case CLOSE+6:
1263
        case CLOSE+7:
1264
        case CLOSE+8:
1265
        case CLOSE+9:
1266
                sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1267
                p = NULL;
1268
                break;
1269
        case STAR:
1270
                p = "STAR";
1271
                break;
1272
        case PLUS:
1273
                p = "PLUS";
1274
                break;
1275
        default:
1276
                if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) {
1277
                    sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1278
                    p = NULL;
1279
                    break;
1280
                } else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) {
1281
                    sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1282
                    p = NULL;
1283
                } else {
1284
                    TclRegError("corrupted opcode");
1285
                }
1286
                break;
1287
        }
1288
        if (p != NULL)
1289
                (void) strcat(buf, p);
1290
        return(buf);
1291
}
1292
#endif
1293
 
1294
/*
1295
 * The following is provided for those people who do not have strcspn() in
1296
 * their C libraries.  They should get off their butts and do something
1297
 * about it; at least one public-domain implementation of those (highly
1298
 * useful) string routines has been published on Usenet.
1299
 */
1300
#ifdef STRCSPN
1301
/*
1302
 * strcspn - find length of initial segment of s1 consisting entirely
1303
 * of characters not from s2
1304
 */
1305
 
1306
static int
1307
strcspn(s1, s2)
1308
char *s1;
1309
char *s2;
1310
{
1311
        register char *scan1;
1312
        register char *scan2;
1313
        register int count;
1314
 
1315
        count = 0;
1316
        for (scan1 = s1; *scan1 != '\0'; scan1++) {
1317
                for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
1318
                        if (*scan1 == *scan2++)
1319
                                return(count);
1320
                count++;
1321
        }
1322
        return(count);
1323
}
1324
#endif
1325
 
1326
/*
1327
 *----------------------------------------------------------------------
1328
 *
1329
 * TclRegError --
1330
 *
1331
 *      This procedure is invoked by the regexp code when an error
1332
 *      occurs.  It saves the error message so it can be seen by the
1333
 *      code that called Spencer's code.
1334
 *
1335
 * Results:
1336
 *      None.
1337
 *
1338
 * Side effects:
1339
 *      The value of "string" is saved in "errMsg".
1340
 *
1341
 *----------------------------------------------------------------------
1342
 */
1343
 
1344
void
1345
TclRegError(string)
1346
    char *string;                       /* Error message. */
1347
{
1348
    errMsg = string;
1349
}
1350
 
1351
char *
1352
TclGetRegError()
1353
{
1354
    return errMsg;
1355
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.