OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [insight/] [expect/] [exp_regexp.c] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 578 markom
/*
2
 * regcomp and regexec -- regsub and regerror are elsewhere
3
 *
4
 *      Copyright (c) 1986 by University of Toronto.
5
 *      Written by Henry Spencer.  Not derived from licensed software.
6
 *
7
 *      Permission is granted to anyone to use this software for any
8
 *      purpose on any computer system, and to redistribute it freely,
9
 *      subject to the following restrictions:
10
 *
11
 *      1. The author is not responsible for the consequences of use of
12
 *              this software, no matter how awful, even if they arise
13
 *              from defects in it.
14
 *
15
 *      2. The origin of this software must not be misrepresented, either
16
 *              by explicit claim or by omission.
17
 *
18
 *      3. Altered versions must be plainly marked as such, and must not
19
 *              be misrepresented as being the original software.
20
 *
21
 * Beware that some of this code is subtly aware of the way operator
22
 * precedence is structured in regular expressions.  Serious changes in
23
 * regular-expression syntax might require a total rethink.
24
 *
25
 * *** NOTE: this code has been altered slightly for use in Tcl. ***
26
 * *** The only change is to use ckalloc and ckfree instead of   ***
27
 * *** malloc and free.                                          ***
28
 
29
 * *** and again for Expect!!! - DEL
30
 
31
 * *** More minor corrections stolen from tcl7.5p1/regexp.c - DEL
32
 
33
 */
34
 
35
#include "tcl.h"
36
#include "expect_cf.h"
37
#include "exp_prog.h"
38
#include "tclRegexp.h"
39
#include "exp_regexp.h"
40
#include "string.h"
41
 
42
#define NOTSTATIC       /* was at one time, but Expect needs access */
43
 
44
/*
45
 * The "internal use only" fields in regexp.h are present to pass info from
46
 * compile to execute that permits the execute phase to run lots faster on
47
 * simple cases.  They are:
48
 *
49
 * regstart     char that must begin a match; '\0' if none obvious
50
 * reganch      is the match anchored (at beginning-of-line only)?
51
 * regmust      string (pointer into program) that match must include, or NULL
52
 * regmlen      length of regmust string
53
 *
54
 * Regstart and reganch permit very fast decisions on suitable starting points
55
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
56
 * of lines that cannot possibly match.  The regmust tests are costly enough
57
 * that regcomp() supplies a regmust only if the r.e. contains something
58
 * potentially expensive (at present, the only such thing detected is * or +
59
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
60
 * supplied because the test in regexec() needs it and regcomp() is computing
61
 * it anyway.
62
 */
63
 
64
/*
65
 * Structure for regexp "program".  This is essentially a linear encoding
66
 * of a nondeterministic finite-state machine (aka syntax charts or
67
 * "railroad normal form" in parsing technology).  Each node is an opcode
68
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
69
 * all nodes except BRANCH implement concatenation; a "next" pointer with
70
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
71
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
72
 * opposed to a collection of them) is never concatenated with anything
73
 * because of operator precedence.)  The operand of some types of node is
74
 * a literal string; for others, it is a node leading into a sub-FSM.  In
75
 * particular, the operand of a BRANCH node is the first node of the branch.
76
 * (NB this is *not* a tree structure:  the tail of the branch connects
77
 * to the thing following the set of BRANCHes.)  The opcodes are:
78
 */
79
 
80
/* definition   number  opnd?   meaning */
81
#define END     0        /* no   End of program. */
82
#define BOL     1       /* no   Match "" at beginning of line. */
83
#define EOL     2       /* no   Match "" at end of line. */
84
#define ANY     3       /* no   Match any one character. */
85
#define ANYOF   4       /* str  Match any character in this string. */
86
#define ANYBUT  5       /* str  Match any character not in this string. */
87
#define BRANCH  6       /* node Match this alternative, or the next... */
88
#define BACK    7       /* no   Match "", "next" ptr points backward. */
89
#define EXACTLY 8       /* str  Match this string. */
90
#define NOTHING 9       /* no   Match empty string. */
91
#define STAR    10      /* node Match this (simple) thing 0 or more times. */
92
#define PLUS    11      /* node Match this (simple) thing 1 or more times. */
93
#define OPEN    20      /* no   Mark this point in input as start of #n. */
94
                        /*      OPEN+1 is number 1, etc. */
95
#define CLOSE   (OPEN+NSUBEXP)  /* no   Analogous to OPEN. */
96
 
97
/*
98
 * Opcode notes:
99
 *
100
 * BRANCH       The set of branches constituting a single choice are hooked
101
 *              together with their "next" pointers, since precedence prevents
102
 *              anything being concatenated to any individual branch.  The
103
 *              "next" pointer of the last BRANCH in a choice points to the
104
 *              thing following the whole choice.  This is also where the
105
 *              final "next" pointer of each individual branch points; each
106
 *              branch starts with the operand node of a BRANCH node.
107
 *
108
 * BACK         Normal "next" pointers all implicitly point forward; BACK
109
 *              exists to make loop structures possible.
110
 *
111
 * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
112
 *              BRANCH structures using BACK.  Simple cases (one character
113
 *              per match) are implemented with STAR and PLUS for speed
114
 *              and to minimize recursive plunges.
115
 *
116
 * OPEN,CLOSE   ...are numbered at compile time.
117
 */
118
 
119
/*
120
 * A node is one char of opcode followed by two chars of "next" pointer.
121
 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
122
 * value is a positive offset from the opcode of the node containing it.
123
 * An operand, if any, simply follows the node.  (Note that much of the
124
 * code generation knows about this implicit relationship.)
125
 *
126
 * Using two bytes for the "next" pointer is vast overkill for most things,
127
 * but allows patterns to get big without disasters.
128
 */
129
#define OP(p)   (*(p))
130
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
131
#define OPERAND(p)      ((p) + 3)
132
 
133
/*
134
 * See regmagic.h for one further detail of program structure.
135
 */
136
 
137
 
138
/*
139
 * Utility definitions.
140
 */
141
#ifndef CHARBITS
142
#define UCHARAT(p)      ((int)*(unsigned char *)(p))
143
#else
144
#define UCHARAT(p)      ((int)*(p)&CHARBITS)
145
#endif
146
 
147
#define FAIL(m) { regerror(m); return(NULL); }
148
#define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
149
#define META    "^$.[()|?+*\\"
150
 
151
/*
152
 * Flags to be passed up and down.
153
 */
154
#define HASWIDTH        01      /* Known never to match null string. */
155
#define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
156
#define SPSTART         04      /* Starts with * or +. */
157
#define WORST           0        /* Worst case. */
158
 
159
/*
160
 * Global work variables for regcomp().
161
 */
162
static char *regparse;          /* Input-scan pointer. */
163
static int regnpar;             /* () count. */
164
static char regdummy;
165
static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
166
static long regsize;            /* Code size. */
167
 
168
/*
169
 * The first byte of the regexp internal "program" is actually this magic
170
 * number; the start node begins in the second byte.
171
 */
172
#define MAGIC   0234
173
 
174
 
175
/*
176
 * Forward declarations for regcomp()'s friends.
177
 */
178
#ifndef STATIC
179
#define STATIC  static
180
#endif
181
STATIC char *reg();
182
STATIC char *regbranch();
183
STATIC char *regpiece();
184
STATIC char *regatom();
185
STATIC char *regnode();
186
STATIC char *regnext();
187
STATIC void regc();
188
STATIC void reginsert();
189
STATIC void regtail();
190
STATIC void regoptail();
191
#ifdef STRCSPN
192
STATIC int strcspn();
193
#endif
194
 
195
/* regcomp originally appeared here - DEL */
196
 
197
/*
198
 - reg - regular expression, i.e. main body or parenthesized thing
199
 *
200
 * Caller must absorb opening parenthesis.
201
 *
202
 * Combining parenthesis handling with the base level of regular expression
203
 * is a trifle forced, but the need to tie the tails of the branches to what
204
 * follows makes it hard to avoid.
205
 */
206
static char *
207
reg(paren, flagp)
208
int paren;                      /* Parenthesized? */
209
int *flagp;
210
{
211
        register char *ret;
212
        register char *br;
213
        register char *ender;
214
        register int parno = 0;
215
        int flags;
216
 
217
        *flagp = HASWIDTH;      /* Tentatively. */
218
 
219
        /* Make an OPEN node, if parenthesized. */
220
        if (paren) {
221
                if (regnpar >= NSUBEXP)
222
                        FAIL("too many ()");
223
                parno = regnpar;
224
                regnpar++;
225
                ret = regnode(OPEN+parno);
226
        } else
227
                ret = NULL;
228
 
229
        /* Pick up the branches, linking them together. */
230
        br = regbranch(&flags);
231
        if (br == NULL)
232
                return(NULL);
233
        if (ret != NULL)
234
                regtail(ret, br);       /* OPEN -> first. */
235
        else
236
                ret = br;
237
        if (!(flags&HASWIDTH))
238
                *flagp &= ~HASWIDTH;
239
        *flagp |= flags&SPSTART;
240
        while (*regparse == '|') {
241
                regparse++;
242
                br = regbranch(&flags);
243
                if (br == NULL)
244
                        return(NULL);
245
                regtail(ret, br);       /* BRANCH -> BRANCH. */
246
                if (!(flags&HASWIDTH))
247
                        *flagp &= ~HASWIDTH;
248
                *flagp |= flags&SPSTART;
249
        }
250
 
251
        /* Make a closing node, and hook it on the end. */
252
        ender = regnode((paren) ? CLOSE+parno : END);
253
        regtail(ret, ender);
254
 
255
        /* Hook the tails of the branches to the closing node. */
256
        for (br = ret; br != NULL; br = regnext(br))
257
                regoptail(br, ender);
258
 
259
        /* Check for proper termination. */
260
        if (paren && *regparse++ != ')') {
261
                FAIL("unmatched ()");
262
        } else if (!paren && *regparse != '\0') {
263
                if (*regparse == ')') {
264
                        FAIL("unmatched ()");
265
                } else
266
                        FAIL("junk on end");    /* "Can't happen". */
267
                /* NOTREACHED */
268
        }
269
 
270
        return(ret);
271
}
272
 
273
/*
274
 - regbranch - one alternative of an | operator
275
 *
276
 * Implements the concatenation operator.
277
 */
278
static char *
279
regbranch(flagp)
280
int *flagp;
281
{
282
        register char *ret;
283
        register char *chain;
284
        register char *latest;
285
        int flags;
286
 
287
        *flagp = WORST;         /* Tentatively. */
288
 
289
        ret = regnode(BRANCH);
290
        chain = NULL;
291
        while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
292
                latest = regpiece(&flags);
293
                if (latest == NULL)
294
                        return(NULL);
295
                *flagp |= flags&HASWIDTH;
296
                if (chain == NULL)      /* First piece. */
297
                        *flagp |= flags&SPSTART;
298
                else
299
                        regtail(chain, latest);
300
                chain = latest;
301
        }
302
        if (chain == NULL)      /* Loop ran zero times. */
303
                (void) regnode(NOTHING);
304
 
305
        return(ret);
306
}
307
 
308
/*
309
 - regpiece - something followed by possible [*+?]
310
 *
311
 * Note that the branching code sequences used for ? and the general cases
312
 * of * and + are somewhat optimized:  they use the same NOTHING node as
313
 * both the endmarker for their branch list and the body of the last branch.
314
 * It might seem that this node could be dispensed with entirely, but the
315
 * endmarker role is not redundant.
316
 */
317
static char *
318
regpiece(flagp)
319
int *flagp;
320
{
321
        register char *ret;
322
        register char op;
323
        register char *next;
324
        int flags;
325
 
326
        ret = regatom(&flags);
327
        if (ret == NULL)
328
                return(NULL);
329
 
330
        op = *regparse;
331
        if (!ISMULT(op)) {
332
                *flagp = flags;
333
                return(ret);
334
        }
335
 
336
        if (!(flags&HASWIDTH) && op != '?')
337
                FAIL("*+ operand could be empty");
338
        *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
339
 
340
        if (op == '*' && (flags&SIMPLE))
341
                reginsert(STAR, ret);
342
        else if (op == '*') {
343
                /* Emit x* as (x&|), where & means "self". */
344
                reginsert(BRANCH, ret);                 /* Either x */
345
                regoptail(ret, regnode(BACK));          /* and loop */
346
                regoptail(ret, ret);                    /* back */
347
                regtail(ret, regnode(BRANCH));          /* or */
348
                regtail(ret, regnode(NOTHING));         /* null. */
349
        } else if (op == '+' && (flags&SIMPLE))
350
                reginsert(PLUS, ret);
351
        else if (op == '+') {
352
                /* Emit x+ as x(&|), where & means "self". */
353
                next = regnode(BRANCH);                 /* Either */
354
                regtail(ret, next);
355
                regtail(regnode(BACK), ret);            /* loop back */
356
                regtail(next, regnode(BRANCH));         /* or */
357
                regtail(ret, regnode(NOTHING));         /* null. */
358
        } else if (op == '?') {
359
                /* Emit x? as (x|) */
360
                reginsert(BRANCH, ret);                 /* Either x */
361
                regtail(ret, regnode(BRANCH));          /* or */
362
                next = regnode(NOTHING);                /* null. */
363
                regtail(ret, next);
364
                regoptail(ret, next);
365
        }
366
        regparse++;
367
        if (ISMULT(*regparse))
368
                FAIL("nested *?+");
369
 
370
        return(ret);
371
}
372
 
373
/*
374
 - regatom - the lowest level
375
 *
376
 * Optimization:  gobbles an entire sequence of ordinary characters so that
377
 * it can turn them into a single node, which is smaller to store and
378
 * faster to run.  Backslashed characters are exceptions, each becoming a
379
 * separate node; the code is simpler that way and it's not worth fixing.
380
 */
381
static char *
382
regatom(flagp)
383
int *flagp;
384
{
385
        register char *ret;
386
        int flags;
387
 
388
        *flagp = WORST;         /* Tentatively. */
389
 
390
        switch (*regparse++) {
391
        case '^':
392
                ret = regnode(BOL);
393
                break;
394
        case '$':
395
                ret = regnode(EOL);
396
                break;
397
        case '.':
398
                ret = regnode(ANY);
399
                *flagp |= HASWIDTH|SIMPLE;
400
                break;
401
        case '[': {
402
                        register int clss;
403
                        register int classend;
404
 
405
                        if (*regparse == '^') { /* Complement of range. */
406
                                ret = regnode(ANYBUT);
407
                                regparse++;
408
                        } else
409
                                ret = regnode(ANYOF);
410
                        if (*regparse == ']' || *regparse == '-')
411
                                regc(*regparse++);
412
                        while (*regparse != '\0' && *regparse != ']') {
413
                                if (*regparse == '-') {
414
                                        regparse++;
415
                                        if (*regparse == ']' || *regparse == '\0')
416
                                                regc('-');
417
                                        else {
418
                                                clss = UCHARAT(regparse-2)+1;
419
                                                classend = UCHARAT(regparse);
420
                                                if (clss > classend+1)
421
                                                        FAIL("invalid [] range");
422
                                                for (; clss <= classend; clss++)
423
                                                        regc((char)clss);
424
                                                regparse++;
425
                                        }
426
                                } else
427
                                        regc(*regparse++);
428
                        }
429
                        regc('\0');
430
                        if (*regparse != ']')
431
                                FAIL("unmatched []");
432
                        regparse++;
433
                        *flagp |= HASWIDTH|SIMPLE;
434
                }
435
                break;
436
        case '(':
437
                ret = reg(1, &flags);
438
                if (ret == NULL)
439
                        return(NULL);
440
                *flagp |= flags&(HASWIDTH|SPSTART);
441
                break;
442
        case '\0':
443
        case '|':
444
        case ')':
445
                FAIL("internal urp");   /* Supposed to be caught earlier. */
446
                /* NOTREACHED */
447
                break;
448
        case '?':
449
        case '+':
450
        case '*':
451
                FAIL("?+* follows nothing");
452
                /* NOTREACHED */
453
                break;
454
        case '\\':
455
                if (*regparse == '\0')
456
                        FAIL("trailing \\");
457
                ret = regnode(EXACTLY);
458
                regc(*regparse++);
459
                regc('\0');
460
                *flagp |= HASWIDTH|SIMPLE;
461
                break;
462
        default: {
463
                        register int len;
464
                        register char ender;
465
 
466
                        regparse--;
467
                        len = strcspn(regparse, META);
468
                        if (len <= 0)
469
                                FAIL("internal disaster");
470
                        ender = *(regparse+len);
471
                        if (len > 1 && ISMULT(ender))
472
                                len--;          /* Back off clear of ?+* operand. */
473
                        *flagp |= HASWIDTH;
474
                        if (len == 1)
475
                                *flagp |= SIMPLE;
476
                        ret = regnode(EXACTLY);
477
                        while (len > 0) {
478
                                regc(*regparse++);
479
                                len--;
480
                        }
481
                        regc('\0');
482
                }
483
                break;
484
        }
485
 
486
        return(ret);
487
}
488
 
489
/*
490
 - regnode - emit a node
491
 */
492
static char *                   /* Location. */
493
regnode(op)
494
int op;
495
{
496
        register char *ret;
497
        register char *ptr;
498
 
499
        ret = regcode;
500
        if (ret == &regdummy) {
501
                regsize += 3;
502
                return(ret);
503
        }
504
 
505
        ptr = ret;
506
        *ptr++ = (char)op;
507
        *ptr++ = '\0';          /* Null "next" pointer. */
508
        *ptr++ = '\0';
509
        regcode = ptr;
510
 
511
        return(ret);
512
}
513
 
514
/*
515
 - regc - emit (if appropriate) a byte of code
516
 */
517
static void
518
regc(b)
519
int b;
520
{
521
        if (regcode != &regdummy)
522
                *regcode++ = (char)b;
523
        else
524
                regsize++;
525
}
526
 
527
/*
528
 - reginsert - insert an operator in front of already-emitted operand
529
 *
530
 * Means relocating the operand.
531
 */
532
static void
533
reginsert(op, opnd)
534
int op;
535
char *opnd;
536
{
537
        register char *src;
538
        register char *dst;
539
        register char *place;
540
 
541
        if (regcode == &regdummy) {
542
                regsize += 3;
543
                return;
544
        }
545
 
546
        src = regcode;
547
        regcode += 3;
548
        dst = regcode;
549
        while (src > opnd)
550
                *--dst = *--src;
551
 
552
        place = opnd;           /* Op node, where operand used to be. */
553
        *place++ = (char)op;
554
        *place++ = '\0';
555
        *place = '\0';
556
}
557
 
558
/*
559
 - regtail - set the next-pointer at the end of a node chain
560
 */
561
static void
562
regtail(p, val)
563
char *p;
564
char *val;
565
{
566
        register char *scan;
567
        register char *temp;
568
        register int offset;
569
 
570
        if (p == &regdummy)
571
                return;
572
 
573
        /* Find last node. */
574
        scan = p;
575
        for (;;) {
576
                temp = regnext(scan);
577
                if (temp == NULL)
578
                        break;
579
                scan = temp;
580
        }
581
 
582
        if (OP(scan) == BACK)
583
                offset = scan - val;
584
        else
585
                offset = val - scan;
586
        *(scan+1) = (char)(offset>>8)&0377;
587
        *(scan+2) = (char)offset&0377;
588
}
589
 
590
/*
591
 - regoptail - regtail on operand of first argument; nop if operandless
592
 */
593
static void
594
regoptail(p, val)
595
char *p;
596
char *val;
597
{
598
        /* "Operandless" and "op != BRANCH" are synonymous in practice. */
599
        if (p == NULL || p == &regdummy || OP(p) != BRANCH)
600
                return;
601
        regtail(OPERAND(p), val);
602
}
603
 
604
/*
605
 * regexec and friends
606
 */
607
 
608
/*
609
 * Global work variables for regexec().
610
 */
611
static char *reginput;          /* String-input pointer. */
612
NOTSTATIC char *regbol;         /* Beginning of input, for ^ check. */
613
static char **regstartp;        /* Pointer to startp array. */
614
static char **regendp;          /* Ditto for endp. */
615
 
616
/*
617
 * Forwards.
618
 */
619
 
620
NOTSTATIC int regtry();
621
STATIC int regmatch();
622
STATIC int regrepeat();
623
 
624
#ifdef DEBUG
625
int regnarrate = 0;
626
void regdump();
627
STATIC char *regprop();
628
#endif
629
 
630
#if 0
631
/*
632
 - regexec - match a regexp against a string
633
 */
634
int
635
regexec(prog, string, stringlength, matchlength)
636
register regexp *prog;
637
register char *string;  /* note: CURRENTLY ASSUMED TO BE NULL-TERMINATED!!! */
638
int stringlength;       /* length of string */
639
int *matchlength;       /* number of chars matched (or to be skipped) */
640
                        /* set when MATCH or CANT_MATCH */
641
{
642
        register char *s;
643
        extern char *strchr();
644
 
645
        /* Be paranoid... */
646
        if (prog == NULL || string == NULL) {
647
                regerror("NULL parameter");
648
                return(EXP_TCLERROR);
649
        }
650
 
651
        /* Check validity of program. */
652
        if (UCHARAT(prog->program) != MAGIC) {
653
                regerror("corrupted program");
654
                return(EXP_KM_ERROR);
655
        }
656
 
657
#if THIS_RUINS_EXP
658
/* no need for this shortcut anyway */
659
        /* If there is a "must appear" string, look for it. */
660
        if (prog->regmust != NULL) {
661
                s = string;
662
                while ((s = strchr(s, prog->regmust[0])) != NULL) {
663
                        if (strncmp(s, prog->regmust, prog->regmlen) == 0)
664
                                break;  /* Found it. */
665
                        s++;
666
                }
667
                if (s == NULL)  /* Not present. */
668
                        return(0);
669
        }
670
#endif
671
 
672
        /* Mark beginning of line for ^ . */
673
        regbol = string;
674
 
675
        /* Simplest case:  anchored match need be tried only once. */
676
        if (prog->reganch) {
677
                int r = regtry(prog,string,matchlength);
678
                if (r == CANT_MATCH) *matchlength = stringlength;
679
                return(r);
680
        }
681
 
682
        /* Messy cases:  unanchored match. */
683
        s = string;
684
        if (prog->regstart != '\0') {
685
                register char *s2 = s;
686
 
687
                /* We know what char it must start with. */
688
                while (1) {
689
                        int r;
690
 
691
                        s2 = strchr(s2,prog->regstart);
692
                        if (s2 == 0) {
693
                                *matchlength = stringlength;
694
                                return(CANT_MATCH);
695
                        }
696
                        r = regtry(prog,s2,matchlength);
697
                        if (r == CANT_MATCH) {
698
                                s2++;
699
                                continue;
700
                        }
701
                        if (s2 == s) return(r);
702
                        *matchlength = s2-s;
703
                        return CANT_MATCH;
704
                }
705
        } else {
706
                /* We don't -- general case. */
707
                register char *s2 = s;
708
                int r = regtry(prog,s,matchlength);
709
                if (r == EXP_MATCH) return(r);
710
                else if (r == EXP_CANMATCH) return(r);
711
                /* at this point, we know some characters at front */
712
                /* of string don't match */
713
                for (s2++;*s2;s2++) {
714
                        r = regtry(prog,s2,matchlength);
715
                        if (r == CANT_MATCH) continue;
716
                        /* if we match or can_match, say cant_match and */
717
                        /* record the number of chars at front that don't match */
718
                        *matchlength = s2-s;
719
                        return(CANT_MATCH);
720
                }
721
                /* made it thru string with CANT_MATCH all the way */
722
                *matchlength = stringlength;
723
                return(CANT_MATCH);
724
        }
725
}
726
#endif
727
 
728
/*
729
 - regtry - try match at specific point
730
 */
731
/* return CAN_MATCH, CANT_MATCH or MATCH */
732
int                     /* 0 failure, 1 success */
733
regtry(prog, string, matchlength)
734
regexp *prog;
735
char *string;
736
int *matchlength;       /* only set for MATCH */
737
{
738
        register int i;
739
        register char **sp;
740
        register char **ep;
741
        int r;          /* result of regmatch */
742
 
743
        reginput = string;
744
        regstartp = prog->startp;
745
        regendp = prog->endp;
746
 
747
        sp = prog->startp;
748
        ep = prog->endp;
749
        for (i = NSUBEXP; i > 0; i--) {
750
                *sp++ = NULL;
751
                *ep++ = NULL;
752
        }
753
        r = regmatch(prog->program + 1);
754
        if (EXP_MATCH == r) {
755
                prog->startp[0] = string;
756
                prog->endp[0] = reginput;
757
                *matchlength = reginput-string;
758
                return(EXP_MATCH);
759
        }
760
        return(r);      /* CAN_MATCH or CANT_MATCH */
761
}
762
 
763
/*
764
 - regmatch - main matching routine
765
 *
766
 * Conceptually the strategy is simple:  check to see whether the current
767
 * node matches, call self recursively to see whether the rest matches,
768
 * and then act accordingly.  In practice we make some effort to avoid
769
 * recursion, in particular by going through "ordinary" nodes (that don't
770
 * need to know whether the rest of the match failed) by a loop instead of
771
 * by recursion.
772
 */
773
/* returns CAN, CANT or MATCH */
774
static int                      /* 0 failure, 1 success */
775
regmatch(prog)
776
char *prog;
777
{
778
        register char *scan;    /* Current node. */
779
        char *next;             /* Next node. */
780
#ifndef strchr  /* May be #defined to something else */
781
        extern char *strchr();
782
#endif
783
 
784
        scan = prog;
785
#ifdef DEBUG
786
        if (scan != NULL && regnarrate)
787
                fprintf(stderr, "%s(\n", regprop(scan));
788
#endif
789
        while (scan != NULL) {
790
#ifdef DEBUG
791
                if (regnarrate)
792
                        fprintf(stderr, "%s...\n", regprop(scan));
793
#endif
794
                next = regnext(scan);
795
 
796
                switch (OP(scan)) {
797
                case BOL:
798
                        if (reginput != regbol)
799
/*                              return(0);*/
800
                                return(EXP_CANTMATCH);
801
                        break;
802
                case EOL:
803
                        if (*reginput != '\0')
804
/*                              return(0);*/
805
/* note this implies that "$" must match everything received to this point! */
806
                                return(EXP_CANTMATCH);
807
                        break;
808
                case ANY:
809
                        if (*reginput == '\0')
810
/*                              return(0);*/
811
                                return(EXP_CANMATCH);
812
                        reginput++;
813
                        break;
814
                case EXACTLY: {
815
/*                              register int len;*/
816
                                register char *opnd;
817
 
818
                                opnd = OPERAND(scan);
819
 
820
                                /* this section of code is totally rewritten - DEL */
821
                                /* group of literal chars in pattern */
822
                                /* compare each one */
823
                                do {
824
                                        if (*opnd != *reginput) {
825
                                                if (*reginput == '\0') {
826
                                                        return EXP_CANMATCH;
827
                                                } else  return EXP_CANTMATCH;
828
                                        }
829
 
830
                                        reginput++;
831
                                        opnd++;
832
                                } while (*opnd != '\0');
833
                        }
834
                        break;
835
                case ANYOF:
836
/*                      if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
837
                                return(0);
838
*/
839
                        if (*reginput == '\0')
840
                                return(EXP_CANMATCH);
841
                        if (strchr(OPERAND(scan),*reginput) == NULL)
842
                                return(EXP_CANTMATCH);
843
                        reginput++;
844
                        break;
845
                case ANYBUT:
846
/*                      if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
847
                                return(0);
848
*/
849
                        if (*reginput == '\0')
850
                                return(EXP_CANMATCH);
851
                        if (strchr(OPERAND(scan),*reginput) != NULL)
852
                                return(EXP_CANTMATCH);
853
                        reginput++;
854
                        break;
855
                case NOTHING:
856
                        break;
857
                case BACK:
858
                        break;
859
                case OPEN+1:
860
                case OPEN+2:
861
                case OPEN+3:
862
                case OPEN+4:
863
                case OPEN+5:
864
                case OPEN+6:
865
                case OPEN+7:
866
                case OPEN+8:
867
                case OPEN+9: {
868
                                register int no;
869
                                register char *save;
870
                                int r;  /* result of regmatch */
871
 
872
        doOpen:
873
                                no = OP(scan) - OPEN;
874
                                save = reginput;
875
 
876
                                r = regmatch(next);
877
                                if (r == EXP_MATCH) {
878
                                        /*
879
                                         * Don't set startp if some later
880
                                         * invocation of the same parentheses
881
                                         * already has.
882
                                         */
883
                                        if (regstartp[no] == NULL)
884
                                                regstartp[no] = save;
885
                                }
886
                                return(r);
887
                        }
888
                        /* NOTREACHED */
889
                        break;
890
                case CLOSE+1:
891
                case CLOSE+2:
892
                case CLOSE+3:
893
                case CLOSE+4:
894
                case CLOSE+5:
895
                case CLOSE+6:
896
                case CLOSE+7:
897
                case CLOSE+8:
898
                case CLOSE+9: {
899
                                register int no;
900
                                register char *save;
901
                                int r;  /* result of regmatch */
902
 
903
        doClose:
904
                                no = OP(scan) - CLOSE;
905
                                save = reginput;
906
 
907
                                r = regmatch(next);
908
                                if (r == EXP_MATCH) {
909
                                        /*
910
                                         * Don't set endp if some later
911
                                         * invocation of the same parentheses
912
                                         * already has.
913
                                         */
914
                                        if (regendp[no] == NULL)
915
                                                regendp[no] = save;
916
                                }
917
                                return(r);
918
                        }
919
                        /* NOTREACHED */
920
                        break;
921
                case BRANCH: {
922
                                register char *save;
923
                                int match_status;
924
 
925
                                if (OP(next) != BRANCH)         /* No choice. */
926
                                        next = OPERAND(scan);   /* Avoid recursion. */
927
                                else {
928
                                        match_status = EXP_CANTMATCH;
929
 
930
                                        do {
931
                                                int r;
932
 
933
                                                save = reginput;
934
                                                r = regmatch(OPERAND(scan));
935
                                                if (r == EXP_MATCH) return(r);
936
                                                if (r == EXP_CANMATCH) {
937
                                                        match_status = r;
938
                                                }
939
                                                reginput = save;
940
                                                scan = regnext(scan);
941
                                        } while (scan != NULL && OP(scan) == BRANCH);
942
                                        return(match_status);
943
                                        /* NOTREACHED */
944
                                }
945
                        }
946
                        /* NOTREACHED */
947
                        break;
948
                case STAR:
949
                case PLUS: {
950
                                register char nextch;
951
                                register int no;
952
                                register char *save;
953
                                register int min;
954
                                int match_status;
955
 
956
                                /*
957
                                 * Lookahead to avoid useless match attempts
958
                                 * when we know what character comes next.
959
                                 */
960
                                match_status = EXP_CANTMATCH;
961
                                nextch = '\0';
962
                                if (OP(next) == EXACTLY)
963
                                        nextch = *OPERAND(next);
964
                                min = (OP(scan) == STAR) ? 0 : 1;
965
                                save = reginput;
966
                                no = regrepeat(OPERAND(scan));
967
                                while (no >= min) {
968
                                        /* If it could work, try it. */
969
                                        /* 3rd condition allows for CAN_MATCH */
970
                                        if (nextch == '\0' || *reginput == nextch || *reginput == '\0') {
971
                                                int r = regmatch(next);
972
                                                if (r == EXP_MATCH)
973
                                                        return(EXP_MATCH);
974
                                                if (r == EXP_CANMATCH)
975
                                                        match_status = r;
976
                                        }
977
                                        /* Couldn't or didn't -- back up. */
978
                                        no--;
979
                                        reginput = save + no;
980
                                }
981
                                return(match_status);
982
                        }
983
                        /* NOTREACHED */
984
                        break;
985
                case END:
986
                        return(EXP_MATCH);      /* Success! */
987
                        /* NOTREACHED */
988
                        break;
989
                default:
990
                        if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) {
991
                                goto doOpen;
992
                        } else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) {
993
                                goto doClose;
994
                        }
995
                        regerror("memory corruption");
996
                        return(EXP_TCLERROR);
997
                        /* NOTREACHED */
998
                        break;
999
                }
1000
 
1001
                scan = next;
1002
        }
1003
 
1004
        /*
1005
         * We get here only if there's trouble -- normally "case END" is
1006
         * the terminating point.
1007
         */
1008
        regerror("corrupted pointers");
1009
        return(EXP_TCLERROR);
1010
}
1011
 
1012
/*
1013
 - regrepeat - repeatedly match something simple, report how many
1014
 */
1015
static int
1016
regrepeat(p)
1017
char *p;
1018
{
1019
        register int count = 0;
1020
        register char *scan;
1021
        register char *opnd;
1022
#ifndef strchr  /* May be #defined to something else */
1023
/*DEL*/ extern char *strchr();
1024
#endif
1025
 
1026
        scan = reginput;
1027
        opnd = OPERAND(p);
1028
        switch (OP(p)) {
1029
        case ANY:
1030
                count = strlen(scan);
1031
                scan += count;
1032
                break;
1033
        case EXACTLY:
1034
                while (*opnd == *scan) {
1035
                        count++;
1036
                        scan++;
1037
                }
1038
                break;
1039
        case ANYOF:
1040
                while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1041
                        count++;
1042
                        scan++;
1043
                }
1044
                break;
1045
        case ANYBUT:
1046
                while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1047
                        count++;
1048
                        scan++;
1049
                }
1050
                break;
1051
        default:                /* Oh dear.  Called inappropriately. */
1052
                regerror("internal foulup");
1053
                count = 0;       /* Best compromise. */
1054
                break;
1055
        }
1056
        reginput = scan;
1057
 
1058
        return(count);
1059
}
1060
 
1061
/*
1062
 - regnext - dig the "next" pointer out of a node
1063
 */
1064
static char *
1065
regnext(p)
1066
register char *p;
1067
{
1068
        register int offset;
1069
 
1070
        if (p == &regdummy)
1071
                return(NULL);
1072
 
1073
        offset = NEXT(p);
1074
        if (offset == 0)
1075
                return(NULL);
1076
 
1077
        if (OP(p) == BACK)
1078
                return(p-offset);
1079
        else
1080
                return(p+offset);
1081
}
1082
 
1083
#ifdef DEBUG
1084
 
1085
STATIC char *regprop();
1086
 
1087
/*
1088
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1089
 */
1090
void
1091
regdump(r)
1092
regexp *r;
1093
{
1094
        register char *s;
1095
        register char op = EXACTLY;     /* Arbitrary non-END op. */
1096
        register char *next;
1097
        extern char *strchr();
1098
 
1099
 
1100
        s = r->program + 1;
1101
        while (op != END) {     /* While that wasn't END last time... */
1102
                op = OP(s);
1103
                printf("%2d%s", s-r->program, regprop(s));      /* Where, what. */
1104
                next = regnext(s);
1105
                if (next == NULL)               /* Next ptr. */
1106
                        printf("(0)");
1107
                else
1108
                        printf("(%d)", (s-r->program)+(next-s));
1109
                s += 3;
1110
                if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1111
                        /* Literal string, where present. */
1112
                        while (*s != '\0') {
1113
                                putchar(*s);
1114
                                s++;
1115
                        }
1116
                        s++;
1117
                }
1118
                putchar('\n');
1119
        }
1120
 
1121
        /* Header fields of interest. */
1122
        if (r->regstart != '\0')
1123
                printf("start `%c' ", r->regstart);
1124
        if (r->reganch)
1125
                printf("anchored ");
1126
        if (r->regmust != NULL)
1127
                printf("must have \"%s\"", r->regmust);
1128
        printf("\n");
1129
}
1130
 
1131
/*
1132
 - regprop - printable representation of opcode
1133
 */
1134
static char *
1135
regprop(op)
1136
char *op;
1137
{
1138
        register char *p;
1139
        static char buf[50];
1140
 
1141
        (void) strcpy(buf, ":");
1142
 
1143
        switch (OP(op)) {
1144
        case BOL:
1145
                p = "BOL";
1146
                break;
1147
        case EOL:
1148
                p = "EOL";
1149
                break;
1150
        case ANY:
1151
                p = "ANY";
1152
                break;
1153
        case ANYOF:
1154
                p = "ANYOF";
1155
                break;
1156
        case ANYBUT:
1157
                p = "ANYBUT";
1158
                break;
1159
        case BRANCH:
1160
                p = "BRANCH";
1161
                break;
1162
        case EXACTLY:
1163
                p = "EXACTLY";
1164
                break;
1165
        case NOTHING:
1166
                p = "NOTHING";
1167
                break;
1168
        case BACK:
1169
                p = "BACK";
1170
                break;
1171
        case END:
1172
                p = "END";
1173
                break;
1174
        case OPEN+1:
1175
        case OPEN+2:
1176
        case OPEN+3:
1177
        case OPEN+4:
1178
        case OPEN+5:
1179
        case OPEN+6:
1180
        case OPEN+7:
1181
        case OPEN+8:
1182
        case OPEN+9:
1183
                sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1184
                p = NULL;
1185
                break;
1186
        case CLOSE+1:
1187
        case CLOSE+2:
1188
        case CLOSE+3:
1189
        case CLOSE+4:
1190
        case CLOSE+5:
1191
        case CLOSE+6:
1192
        case CLOSE+7:
1193
        case CLOSE+8:
1194
        case CLOSE+9:
1195
                sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1196
                p = NULL;
1197
                break;
1198
        case STAR:
1199
                p = "STAR";
1200
                break;
1201
        case PLUS:
1202
                p = "PLUS";
1203
                break;
1204
        default:
1205
                if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) {
1206
                    sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1207
                    p = NULL;
1208
                    break;
1209
                } else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) {
1210
                    sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1211
                    p = NULL;
1212
                } else {
1213
                    TclRegError("corrupted opcode");
1214
                }
1215
                break;
1216
        }
1217
        if (p != NULL)
1218
                (void) strcat(buf, p);
1219
        return(buf);
1220
}
1221
#endif
1222
 
1223
/*
1224
 * The following is provided for those people who do not have strcspn() in
1225
 * their C libraries.  They should get off their butts and do something
1226
 * about it; at least one public-domain implementation of those (highly
1227
 * useful) string routines has been published on Usenet.
1228
 */
1229
#ifdef STRCSPN
1230
/*
1231
 * strcspn - find length of initial segment of s1 consisting entirely
1232
 * of characters not from s2
1233
 */
1234
 
1235
static int
1236
strcspn(s1, s2)
1237
char *s1;
1238
char *s2;
1239
{
1240
        register char *scan1;
1241
        register char *scan2;
1242
        register int count;
1243
 
1244
        count = 0;
1245
        for (scan1 = s1; *scan1 != '\0'; scan1++) {
1246
                for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
1247
                        if (*scan1 == *scan2++)
1248
                                return(count);
1249
                count++;
1250
        }
1251
        return(count);
1252
}
1253
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.