OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [uclinux/] [uC-libc/] [regexp/] [regexp.c] - Blame information for rev 1768

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 199 simons
/*
2
 * regcomp and regexec -- regsub and regerror are elsewhere
3
 *
4
 *      Copyright (c) 1986 by University of Toronto.
5
 *      Written by Henry Spencer.  Not derived from licensed software.
6
 *
7
 *      Permission is granted to anyone to use this software for any
8
 *      purpose on any computer system, and to redistribute it freely,
9
 *      subject to the following restrictions:
10
 *
11
 *      1. The author is not responsible for the consequences of use of
12
 *              this software, no matter how awful, even if they arise
13
 *              from defects in it.
14
 *
15
 *      2. The origin of this software must not be misrepresented, either
16
 *              by explicit claim or by omission.
17
 *
18
 *      3. Altered versions must be plainly marked as such, and must not
19
 *              be misrepresented as being the original software.
20
 *
21
 * Beware that some of this code is subtly aware of the way operator
22
 * precedence is structured in regular expressions.  Serious changes in
23
 * regular-expression syntax might require a total rethink.
24
 */
25
#include <stdio.h>
26
#include <regexp.h>
27
#include <malloc.h>
28
#include <string.h>
29
#include "regmagic.h"
30
 
31
/*
32
 * The "internal use only" fields in regexp.h are present to pass info from
33
 * compile to execute that permits the execute phase to run lots faster on
34
 * simple cases.  They are:
35
 *
36
 * regstart     char that must begin a match; '\0' if none obvious
37
 * reganch      is the match anchored (at beginning-of-line only)?
38
 * regmust      string (pointer into program) that match must include, or NULL
39
 * regmlen      length of regmust string
40
 *
41
 * Regstart and reganch permit very fast decisions on suitable starting points
42
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
43
 * of lines that cannot possibly match.  The regmust tests are costly enough
44
 * that regcomp() supplies a regmust only if the r.e. contains something
45
 * potentially expensive (at present, the only such thing detected is * or +
46
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
47
 * supplied because the test in regexec() needs it and regcomp() is computing
48
 * it anyway.
49
 */
50
 
51
/*
52
 * Structure for regexp "program".  This is essentially a linear encoding
53
 * of a nondeterministic finite-state machine (aka syntax charts or
54
 * "railroad normal form" in parsing technology).  Each node is an opcode
55
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
56
 * all nodes except BRANCH implement concatenation; a "next" pointer with
57
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
58
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
59
 * opposed to a collection of them) is never concatenated with anything
60
 * because of operator precedence.)  The operand of some types of node is
61
 * a literal string; for others, it is a node leading into a sub-FSM.  In
62
 * particular, the operand of a BRANCH node is the first node of the branch.
63
 * (NB this is *not* a tree structure:  the tail of the branch connects
64
 * to the thing following the set of BRANCHes.)  The opcodes are:
65
 */
66
 
67
/* definition   number  opnd?   meaning */
68
#define END     0        /* no   End of program. */
69
#define BOL     1       /* no   Match "" at beginning of line. */
70
#define EOL     2       /* no   Match "" at end of line. */
71
#define ANY     3       /* no   Match any one character. */
72
#define ANYOF   4       /* str  Match any character in this string. */
73
#define ANYBUT  5       /* str  Match any character not in this string. */
74
#define BRANCH  6       /* node Match this alternative, or the next... */
75
#define BACK    7       /* no   Match "", "next" ptr points backward. */
76
#define EXACTLY 8       /* str  Match this string. */
77
#define NOTHING 9       /* no   Match empty string. */
78
#define STAR    10      /* node Match this (simple) thing 0 or more times. */
79
#define PLUS    11      /* node Match this (simple) thing 1 or more times. */
80
#define OPEN    20      /* no   Mark this point in input as start of #n. */
81
                        /*      OPEN+1 is number 1, etc. */
82
#define CLOSE   30      /* no   Analogous to OPEN. */
83
 
84
/*
85
 * Opcode notes:
86
 *
87
 * BRANCH       The set of branches constituting a single choice are hooked
88
 *              together with their "next" pointers, since precedence prevents
89
 *              anything being concatenated to any individual branch.  The
90
 *              "next" pointer of the last BRANCH in a choice points to the
91
 *              thing following the whole choice.  This is also where the
92
 *              final "next" pointer of each individual branch points; each
93
 *              branch starts with the operand node of a BRANCH node.
94
 *
95
 * BACK         Normal "next" pointers all implicitly point forward; BACK
96
 *              exists to make loop structures possible.
97
 *
98
 * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
99
 *              BRANCH structures using BACK.  Simple cases (one character
100
 *              per match) are implemented with STAR and PLUS for speed
101
 *              and to minimize recursive plunges.
102
 *
103
 * OPEN,CLOSE   ...are numbered at compile time.
104
 */
105
 
106
/*
107
 * A node is one char of opcode followed by two chars of "next" pointer.
108
 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
109
 * value is a positive offset from the opcode of the node containing it.
110
 * An operand, if any, simply follows the node.  (Note that much of the
111
 * code generation knows about this implicit relationship.)
112
 *
113
 * Using two bytes for the "next" pointer is vast overkill for most things,
114
 * but allows patterns to get big without disasters.
115
 */
116
#define OP(p)   (*(p))
117
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
118
#define OPERAND(p)      ((p) + 3)
119
 
120
/*
121
 * See regmagic.h for one further detail of program structure.
122
 */
123
 
124
 
125
/*
126
 * Utility definitions.
127
 */
128
#ifndef CHARBITS
129
#define UCHARAT(p)      ((int)*(unsigned char *)(p))
130
#else
131
#define UCHARAT(p)      ((int)*(p)&CHARBITS)
132
#endif
133
 
134
#define FAIL(m) { regerror(m); return(NULL); }
135
#define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
136
#define META    "^$.[()|?+*\\"
137
 
138
/*
139
 * Flags to be passed up and down.
140
 */
141
#define HASWIDTH        01      /* Known never to match null string. */
142
#define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
143
#define SPSTART         04      /* Starts with * or +. */
144
#define WORST           0        /* Worst case. */
145
 
146
/*
147
 * Global work variables for regcomp().
148
 */
149
static char *regparse;          /* Input-scan pointer. */
150
static int regnpar;             /* () count. */
151
static char regdummy;
152
static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
153
static long regsize;            /* Code size. */
154
 
155
/*
156
 * Forward declarations for regcomp()'s friends.
157
 */
158
#ifndef STATIC
159
#define STATIC  static
160
#endif
161
STATIC char *reg();
162
STATIC char *regbranch();
163
STATIC char *regpiece();
164
STATIC char *regatom();
165
STATIC char *regnode();
166
STATIC char *regnext();
167
STATIC void regc();
168
STATIC void reginsert();
169
STATIC void regtail();
170
STATIC void regoptail();
171
#ifdef STRCSPN
172
STATIC int strcspn();
173
#endif
174
 
175
/*
176
 - regcomp - compile a regular expression into internal code
177
 *
178
 * We can't allocate space until we know how big the compiled form will be,
179
 * but we can't compile it (and thus know how big it is) until we've got a
180
 * place to put the code.  So we cheat:  we compile it twice, once with code
181
 * generation turned off and size counting turned on, and once "for real".
182
 * This also means that we don't allocate space until we are sure that the
183
 * thing really will compile successfully, and we never have to move the
184
 * code and thus invalidate pointers into it.  (Note that it has to be in
185
 * one piece because free() must be able to free it all.)
186
 *
187
 * Beware that the optimization-preparation code in here knows about some
188
 * of the structure of the compiled regexp.
189
 */
190
regexp *
191
regcomp(exp)
192
char *exp;
193
{
194
        register regexp *r;
195
        register char *scan;
196
        register char *longest;
197
        register int len;
198
        int flags;
199
 
200
        if (exp == NULL)
201
                FAIL("NULL argument");
202
 
203
        /* First pass: determine size, legality. */
204
        regparse = exp;
205
        regnpar = 1;
206
        regsize = 0L;
207
        regcode = &regdummy;
208
        regc(MAGIC);
209
        if (reg(0, &flags) == NULL)
210
                return(NULL);
211
 
212
        /* Small enough for pointer-storage convention? */
213
        if (regsize >= 32767L)          /* Probably could be 65535L. */
214
                FAIL("regexp too big");
215
 
216
        /* Allocate space. */
217
        r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
218
        if (r == NULL)
219
                FAIL("out of space");
220
 
221
        /* Second pass: emit code. */
222
        regparse = exp;
223
        regnpar = 1;
224
        regcode = r->program;
225
        regc(MAGIC);
226
        if (reg(0, &flags) == NULL)
227
                return(NULL);
228
 
229
        /* Dig out information for optimizations. */
230
        r->regstart = '\0';     /* Worst-case defaults. */
231
        r->reganch = 0;
232
        r->regmust = NULL;
233
        r->regmlen = 0;
234
        scan = r->program+1;                    /* First BRANCH. */
235
        if (OP(regnext(scan)) == END) {         /* Only one top-level choice. */
236
                scan = OPERAND(scan);
237
 
238
                /* Starting-point info. */
239
                if (OP(scan) == EXACTLY)
240
                        r->regstart = *OPERAND(scan);
241
                else if (OP(scan) == BOL)
242
                        r->reganch++;
243
 
244
                /*
245
                 * If there's something expensive in the r.e., find the
246
                 * longest literal string that must appear and make it the
247
                 * regmust.  Resolve ties in favor of later strings, since
248
                 * the regstart check works with the beginning of the r.e.
249
                 * and avoiding duplication strengthens checking.  Not a
250
                 * strong reason, but sufficient in the absence of others.
251
                 */
252
                if (flags&SPSTART) {
253
                        longest = NULL;
254
                        len = 0;
255
                        for (; scan != NULL; scan = regnext(scan))
256
                                if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
257
                                        longest = OPERAND(scan);
258
                                        len = strlen(OPERAND(scan));
259
                                }
260
                        r->regmust = longest;
261
                        r->regmlen = len;
262
                }
263
        }
264
 
265
        return(r);
266
}
267
 
268
/*
269
 - reg - regular expression, i.e. main body or parenthesized thing
270
 *
271
 * Caller must absorb opening parenthesis.
272
 *
273
 * Combining parenthesis handling with the base level of regular expression
274
 * is a trifle forced, but the need to tie the tails of the branches to what
275
 * follows makes it hard to avoid.
276
 */
277
static char *
278
reg(paren, flagp)
279
int paren;                      /* Parenthesized? */
280
int *flagp;
281
{
282
        register char *ret;
283
        register char *br;
284
        register char *ender;
285
        register int parno;
286
        int flags;
287
 
288
        *flagp = HASWIDTH;      /* Tentatively. */
289
 
290
        /* Make an OPEN node, if parenthesized. */
291
        if (paren) {
292
                if (regnpar >= NSUBEXP)
293
                        FAIL("too many ()");
294
                parno = regnpar;
295
                regnpar++;
296
                ret = regnode(OPEN+parno);
297
        } else
298
                ret = NULL;
299
 
300
        /* Pick up the branches, linking them together. */
301
        br = regbranch(&flags);
302
        if (br == NULL)
303
                return(NULL);
304
        if (ret != NULL)
305
                regtail(ret, br);       /* OPEN -> first. */
306
        else
307
                ret = br;
308
        if (!(flags&HASWIDTH))
309
                *flagp &= ~HASWIDTH;
310
        *flagp |= flags&SPSTART;
311
        while (*regparse == '|') {
312
                regparse++;
313
                br = regbranch(&flags);
314
                if (br == NULL)
315
                        return(NULL);
316
                regtail(ret, br);       /* BRANCH -> BRANCH. */
317
                if (!(flags&HASWIDTH))
318
                        *flagp &= ~HASWIDTH;
319
                *flagp |= flags&SPSTART;
320
        }
321
 
322
        /* Make a closing node, and hook it on the end. */
323
        ender = regnode((paren) ? CLOSE+parno : END);
324
        regtail(ret, ender);
325
 
326
        /* Hook the tails of the branches to the closing node. */
327
        for (br = ret; br != NULL; br = regnext(br))
328
                regoptail(br, ender);
329
 
330
        /* Check for proper termination. */
331
        if (paren && *regparse++ != ')') {
332
                FAIL("unmatched ()");
333
        } else if (!paren && *regparse != '\0') {
334
                if (*regparse == ')') {
335
                        FAIL("unmatched ()");
336
                } else
337
                        FAIL("junk on end");    /* "Can't happen". */
338
                /* NOTREACHED */
339
        }
340
 
341
        return(ret);
342
}
343
 
344
/*
345
 - regbranch - one alternative of an | operator
346
 *
347
 * Implements the concatenation operator.
348
 */
349
static char *
350
regbranch(flagp)
351
int *flagp;
352
{
353
        register char *ret;
354
        register char *chain;
355
        register char *latest;
356
        int flags;
357
 
358
        *flagp = WORST;         /* Tentatively. */
359
 
360
        ret = regnode(BRANCH);
361
        chain = NULL;
362
        while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
363
                latest = regpiece(&flags);
364
                if (latest == NULL)
365
                        return(NULL);
366
                *flagp |= flags&HASWIDTH;
367
                if (chain == NULL)      /* First piece. */
368
                        *flagp |= flags&SPSTART;
369
                else
370
                        regtail(chain, latest);
371
                chain = latest;
372
        }
373
        if (chain == NULL)      /* Loop ran zero times. */
374
                (void) regnode(NOTHING);
375
 
376
        return(ret);
377
}
378
 
379
/*
380
 - regpiece - something followed by possible [*+?]
381
 *
382
 * Note that the branching code sequences used for ? and the general cases
383
 * of * and + are somewhat optimized:  they use the same NOTHING node as
384
 * both the endmarker for their branch list and the body of the last branch.
385
 * It might seem that this node could be dispensed with entirely, but the
386
 * endmarker role is not redundant.
387
 */
388
static char *
389
regpiece(flagp)
390
int *flagp;
391
{
392
        register char *ret;
393
        register char op;
394
        register char *next;
395
        int flags;
396
 
397
        ret = regatom(&flags);
398
        if (ret == NULL)
399
                return(NULL);
400
 
401
        op = *regparse;
402
        if (!ISMULT(op)) {
403
                *flagp = flags;
404
                return(ret);
405
        }
406
 
407
        if (!(flags&HASWIDTH) && op != '?')
408
                FAIL("*+ operand could be empty");
409
        *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
410
 
411
        if (op == '*' && (flags&SIMPLE))
412
                reginsert(STAR, ret);
413
        else if (op == '*') {
414
                /* Emit x* as (x&|), where & means "self". */
415
                reginsert(BRANCH, ret);                 /* Either x */
416
                regoptail(ret, regnode(BACK));          /* and loop */
417
                regoptail(ret, ret);                    /* back */
418
                regtail(ret, regnode(BRANCH));          /* or */
419
                regtail(ret, regnode(NOTHING));         /* null. */
420
        } else if (op == '+' && (flags&SIMPLE))
421
                reginsert(PLUS, ret);
422
        else if (op == '+') {
423
                /* Emit x+ as x(&|), where & means "self". */
424
                next = regnode(BRANCH);                 /* Either */
425
                regtail(ret, next);
426
                regtail(regnode(BACK), ret);            /* loop back */
427
                regtail(next, regnode(BRANCH));         /* or */
428
                regtail(ret, regnode(NOTHING));         /* null. */
429
        } else if (op == '?') {
430
                /* Emit x? as (x|) */
431
                reginsert(BRANCH, ret);                 /* Either x */
432
                regtail(ret, regnode(BRANCH));          /* or */
433
                next = regnode(NOTHING);                /* null. */
434
                regtail(ret, next);
435
                regoptail(ret, next);
436
        }
437
        regparse++;
438
        if (ISMULT(*regparse))
439
                FAIL("nested *?+");
440
 
441
        return(ret);
442
}
443
 
444
/*
445
 - regatom - the lowest level
446
 *
447
 * Optimization:  gobbles an entire sequence of ordinary characters so that
448
 * it can turn them into a single node, which is smaller to store and
449
 * faster to run.  Backslashed characters are exceptions, each becoming a
450
 * separate node; the code is simpler that way and it's not worth fixing.
451
 */
452
static char *
453
regatom(flagp)
454
int *flagp;
455
{
456
        register char *ret;
457
        int flags;
458
 
459
        *flagp = WORST;         /* Tentatively. */
460
 
461
        switch (*regparse++) {
462
        case '^':
463
                ret = regnode(BOL);
464
                break;
465
        case '$':
466
                ret = regnode(EOL);
467
                break;
468
        case '.':
469
                ret = regnode(ANY);
470
                *flagp |= HASWIDTH|SIMPLE;
471
                break;
472
        case '[': {
473
                        register int class;
474
                        register int classend;
475
 
476
                        if (*regparse == '^') { /* Complement of range. */
477
                                ret = regnode(ANYBUT);
478
                                regparse++;
479
                        } else
480
                                ret = regnode(ANYOF);
481
                        if (*regparse == ']' || *regparse == '-')
482
                                regc(*regparse++);
483
                        while (*regparse != '\0' && *regparse != ']') {
484
                                if (*regparse == '-') {
485
                                        regparse++;
486
                                        if (*regparse == ']' || *regparse == '\0')
487
                                                regc('-');
488
                                        else {
489
                                                class = UCHARAT(regparse-2)+1;
490
                                                classend = UCHARAT(regparse);
491
                                                if (class > classend+1)
492
                                                        FAIL("invalid [] range");
493
                                                for (; class <= classend; class++)
494
                                                        regc(class);
495
                                                regparse++;
496
                                        }
497
                                } else
498
                                        regc(*regparse++);
499
                        }
500
                        regc('\0');
501
                        if (*regparse != ']')
502
                                FAIL("unmatched []");
503
                        regparse++;
504
                        *flagp |= HASWIDTH|SIMPLE;
505
                }
506
                break;
507
        case '(':
508
                ret = reg(1, &flags);
509
                if (ret == NULL)
510
                        return(NULL);
511
                *flagp |= flags&(HASWIDTH|SPSTART);
512
                break;
513
        case '\0':
514
        case '|':
515
        case ')':
516
                FAIL("internal urp");   /* Supposed to be caught earlier. */
517
                break;
518
        case '?':
519
        case '+':
520
        case '*':
521
                FAIL("?+* follows nothing");
522
                break;
523
        case '\\':
524
                if (*regparse == '\0')
525
                        FAIL("trailing \\");
526
                ret = regnode(EXACTLY);
527
                regc(*regparse++);
528
                regc('\0');
529
                *flagp |= HASWIDTH|SIMPLE;
530
                break;
531
        default: {
532
                        register int len;
533
                        register char ender;
534
 
535
                        regparse--;
536
                        len = strcspn(regparse, META);
537
                        if (len <= 0)
538
                                FAIL("internal disaster");
539
                        ender = *(regparse+len);
540
                        if (len > 1 && ISMULT(ender))
541
                                len--;          /* Back off clear of ?+* operand. */
542
                        *flagp |= HASWIDTH;
543
                        if (len == 1)
544
                                *flagp |= SIMPLE;
545
                        ret = regnode(EXACTLY);
546
                        while (len > 0) {
547
                                regc(*regparse++);
548
                                len--;
549
                        }
550
                        regc('\0');
551
                }
552
                break;
553
        }
554
 
555
        return(ret);
556
}
557
 
558
/*
559
 - regnode - emit a node
560
 */
561
static char *                   /* Location. */
562
regnode(op)
563
char op;
564
{
565
        register char *ret;
566
        register char *ptr;
567
 
568
        ret = regcode;
569
        if (ret == &regdummy) {
570
                regsize += 3;
571
                return(ret);
572
        }
573
 
574
        ptr = ret;
575
        *ptr++ = op;
576
        *ptr++ = '\0';          /* Null "next" pointer. */
577
        *ptr++ = '\0';
578
        regcode = ptr;
579
 
580
        return(ret);
581
}
582
 
583
/*
584
 - regc - emit (if appropriate) a byte of code
585
 */
586
static void
587
regc(b)
588
char b;
589
{
590
        if (regcode != &regdummy)
591
                *regcode++ = b;
592
        else
593
                regsize++;
594
}
595
 
596
/*
597
 - reginsert - insert an operator in front of already-emitted operand
598
 *
599
 * Means relocating the operand.
600
 */
601
static void
602
reginsert(op, opnd)
603
char op;
604
char *opnd;
605
{
606
        register char *src;
607
        register char *dst;
608
        register char *place;
609
 
610
        if (regcode == &regdummy) {
611
                regsize += 3;
612
                return;
613
        }
614
 
615
        src = regcode;
616
        regcode += 3;
617
        dst = regcode;
618
        while (src > opnd)
619
                *--dst = *--src;
620
 
621
        place = opnd;           /* Op node, where operand used to be. */
622
        *place++ = op;
623
        *place++ = '\0';
624
        *place++ = '\0';
625
}
626
 
627
/*
628
 - regtail - set the next-pointer at the end of a node chain
629
 */
630
static void
631
regtail(p, val)
632
char *p;
633
char *val;
634
{
635
        register char *scan;
636
        register char *temp;
637
        register int offset;
638
 
639
        if (p == &regdummy)
640
                return;
641
 
642
        /* Find last node. */
643
        scan = p;
644
        for (;;) {
645
                temp = regnext(scan);
646
                if (temp == NULL)
647
                        break;
648
                scan = temp;
649
        }
650
 
651
        if (OP(scan) == BACK)
652
                offset = scan - val;
653
        else
654
                offset = val - scan;
655
        *(scan+1) = (offset>>8)&0377;
656
        *(scan+2) = offset&0377;
657
}
658
 
659
/*
660
 - regoptail - regtail on operand of first argument; nop if operandless
661
 */
662
static void
663
regoptail(p, val)
664
char *p;
665
char *val;
666
{
667
        /* "Operandless" and "op != BRANCH" are synonymous in practice. */
668
        if (p == NULL || p == &regdummy || OP(p) != BRANCH)
669
                return;
670
        regtail(OPERAND(p), val);
671
}
672
 
673
/*
674
 * regexec and friends
675
 */
676
 
677
/*
678
 * Global work variables for regexec().
679
 */
680
static char *reginput;          /* String-input pointer. */
681
static char *regbol;            /* Beginning of input, for ^ check. */
682
static char **regstartp;        /* Pointer to startp array. */
683
static char **regendp;          /* Ditto for endp. */
684
 
685
/*
686
 * Forwards.
687
 */
688
STATIC int regtry();
689
STATIC int regmatch();
690
STATIC int regrepeat();
691
 
692
#ifdef DEBUG
693
int regnarrate = 0;
694
void regdump();
695
STATIC char *regprop();
696
#endif
697
 
698
/*
699
 - regexec - match a regexp against a string
700
 */
701
int
702
regexec(prog, string)
703
register regexp *prog;
704
register char *string;
705
{
706
        register char *s;
707
 
708
        /* Be paranoid... */
709
        if (prog == NULL || string == NULL) {
710
                regerror("NULL parameter");
711
                return(0);
712
        }
713
 
714
        /* Check validity of program. */
715
        if (UCHARAT(prog->program) != MAGIC) {
716
                regerror("corrupted program");
717
                return(0);
718
        }
719
 
720
        /* If there is a "must appear" string, look for it. */
721
        if (prog->regmust != NULL) {
722
                s = string;
723
                while ((s = strchr(s, prog->regmust[0])) != NULL) {
724
                        if (strncmp(s, prog->regmust, prog->regmlen) == 0)
725
                                break;  /* Found it. */
726
                        s++;
727
                }
728
                if (s == NULL)  /* Not present. */
729
                        return(0);
730
        }
731
 
732
        /* Mark beginning of line for ^ . */
733
        regbol = string;
734
 
735
        /* Simplest case:  anchored match need be tried only once. */
736
        if (prog->reganch)
737
                return(regtry(prog, string));
738
 
739
        /* Messy cases:  unanchored match. */
740
        s = string;
741
        if (prog->regstart != '\0')
742
                /* We know what char it must start with. */
743
                while ((s = strchr(s, prog->regstart)) != NULL) {
744
                        if (regtry(prog, s))
745
                                return(1);
746
                        s++;
747
                }
748
        else
749
                /* We don't -- general case. */
750
                do {
751
                        if (regtry(prog, s))
752
                                return(1);
753
                } while (*s++ != '\0');
754
 
755
        /* Failure. */
756
        return(0);
757
}
758
 
759
/*
760
 - regtry - try match at specific point
761
 */
762
static int                      /* 0 failure, 1 success */
763
regtry(prog, string)
764
regexp *prog;
765
char *string;
766
{
767
        register int i;
768
        register char **sp;
769
        register char **ep;
770
 
771
        reginput = string;
772
        regstartp = prog->startp;
773
        regendp = prog->endp;
774
 
775
        sp = prog->startp;
776
        ep = prog->endp;
777
        for (i = NSUBEXP; i > 0; i--) {
778
                *sp++ = NULL;
779
                *ep++ = NULL;
780
        }
781
        if (regmatch(prog->program + 1)) {
782
                prog->startp[0] = string;
783
                prog->endp[0] = reginput;
784
                return(1);
785
        } else
786
                return(0);
787
}
788
 
789
/*
790
 - regmatch - main matching routine
791
 *
792
 * Conceptually the strategy is simple:  check to see whether the current
793
 * node matches, call self recursively to see whether the rest matches,
794
 * and then act accordingly.  In practice we make some effort to avoid
795
 * recursion, in particular by going through "ordinary" nodes (that don't
796
 * need to know whether the rest of the match failed) by a loop instead of
797
 * by recursion.
798
 */
799
static int                      /* 0 failure, 1 success */
800
regmatch(prog)
801
char *prog;
802
{
803
        register char *scan;    /* Current node. */
804
        char *next;             /* Next node. */
805
        extern char *strchr();
806
 
807
        scan = prog;
808
#ifdef DEBUG
809
        if (scan != NULL && regnarrate)
810
                fprintf(stderr, "%s(\n", regprop(scan));
811
#endif
812
        while (scan != NULL) {
813
#ifdef DEBUG
814
                if (regnarrate)
815
                        fprintf(stderr, "%s...\n", regprop(scan));
816
#endif
817
                next = regnext(scan);
818
 
819
                switch (OP(scan)) {
820
                case BOL:
821
                        if (reginput != regbol)
822
                                return(0);
823
                        break;
824
                case EOL:
825
                        if (*reginput != '\0')
826
                                return(0);
827
                        break;
828
                case ANY:
829
                        if (*reginput == '\0')
830
                                return(0);
831
                        reginput++;
832
                        break;
833
                case EXACTLY: {
834
                                register int len;
835
                                register char *opnd;
836
 
837
                                opnd = OPERAND(scan);
838
                                /* Inline the first character, for speed. */
839
                                if (*opnd != *reginput)
840
                                        return(0);
841
                                len = strlen(opnd);
842
                                if (len > 1 && strncmp(opnd, reginput, len) != 0)
843
                                        return(0);
844
                                reginput += len;
845
                        }
846
                        break;
847
                case ANYOF:
848
                        if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
849
                                return(0);
850
                        reginput++;
851
                        break;
852
                case ANYBUT:
853
                        if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
854
                                return(0);
855
                        reginput++;
856
                        break;
857
                case NOTHING:
858
                        break;
859
                case BACK:
860
                        break;
861
                case OPEN+1:
862
                case OPEN+2:
863
                case OPEN+3:
864
                case OPEN+4:
865
                case OPEN+5:
866
                case OPEN+6:
867
                case OPEN+7:
868
                case OPEN+8:
869
                case OPEN+9: {
870
                                register int no;
871
                                register char *save;
872
 
873
                                no = OP(scan) - OPEN;
874
                                save = reginput;
875
 
876
                                if (regmatch(next)) {
877
                                        /*
878
                                         * Don't set startp if some later
879
                                         * invocation of the same parentheses
880
                                         * already has.
881
                                         */
882
                                        if (regstartp[no] == NULL)
883
                                                regstartp[no] = save;
884
                                        return(1);
885
                                } else
886
                                        return(0);
887
                        }
888
                        break;
889
                case CLOSE+1:
890
                case CLOSE+2:
891
                case CLOSE+3:
892
                case CLOSE+4:
893
                case CLOSE+5:
894
                case CLOSE+6:
895
                case CLOSE+7:
896
                case CLOSE+8:
897
                case CLOSE+9: {
898
                                register int no;
899
                                register char *save;
900
 
901
                                no = OP(scan) - CLOSE;
902
                                save = reginput;
903
 
904
                                if (regmatch(next)) {
905
                                        /*
906
                                         * Don't set endp if some later
907
                                         * invocation of the same parentheses
908
                                         * already has.
909
                                         */
910
                                        if (regendp[no] == NULL)
911
                                                regendp[no] = save;
912
                                        return(1);
913
                                } else
914
                                        return(0);
915
                        }
916
                        break;
917
                case BRANCH: {
918
                                register char *save;
919
 
920
                                if (OP(next) != BRANCH)         /* No choice. */
921
                                        next = OPERAND(scan);   /* Avoid recursion. */
922
                                else {
923
                                        do {
924
                                                save = reginput;
925
                                                if (regmatch(OPERAND(scan)))
926
                                                        return(1);
927
                                                reginput = save;
928
                                                scan = regnext(scan);
929
                                        } while (scan != NULL && OP(scan) == BRANCH);
930
                                        return(0);
931
                                        /* NOTREACHED */
932
                                }
933
                        }
934
                        break;
935
                case STAR:
936
                case PLUS: {
937
                                register char nextch;
938
                                register int no;
939
                                register char *save;
940
                                register int min;
941
 
942
                                /*
943
                                 * Lookahead to avoid useless match attempts
944
                                 * when we know what character comes next.
945
                                 */
946
                                nextch = '\0';
947
                                if (OP(next) == EXACTLY)
948
                                        nextch = *OPERAND(next);
949
                                min = (OP(scan) == STAR) ? 0 : 1;
950
                                save = reginput;
951
                                no = regrepeat(OPERAND(scan));
952
                                while (no >= min) {
953
                                        /* If it could work, try it. */
954
                                        if (nextch == '\0' || *reginput == nextch)
955
                                                if (regmatch(next))
956
                                                        return(1);
957
                                        /* Couldn't or didn't -- back up. */
958
                                        no--;
959
                                        reginput = save + no;
960
                                }
961
                                return(0);
962
                        }
963
                        break;
964
                case END:
965
                        return(1);      /* Success! */
966
                        break;
967
                default:
968
                        regerror("memory corruption");
969
                        return(0);
970
                        break;
971
                }
972
 
973
                scan = next;
974
        }
975
 
976
        /*
977
         * We get here only if there's trouble -- normally "case END" is
978
         * the terminating point.
979
         */
980
        regerror("corrupted pointers");
981
        return(0);
982
}
983
 
984
/*
985
 - regrepeat - repeatedly match something simple, report how many
986
 */
987
static int
988
regrepeat(p)
989
char *p;
990
{
991
        register int count = 0;
992
        register char *scan;
993
        register char *opnd;
994
 
995
        scan = reginput;
996
        opnd = OPERAND(p);
997
        switch (OP(p)) {
998
        case ANY:
999
                count = strlen(scan);
1000
                scan += count;
1001
                break;
1002
        case EXACTLY:
1003
                while (*opnd == *scan) {
1004
                        count++;
1005
                        scan++;
1006
                }
1007
                break;
1008
        case ANYOF:
1009
                while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1010
                        count++;
1011
                        scan++;
1012
                }
1013
                break;
1014
        case ANYBUT:
1015
                while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1016
                        count++;
1017
                        scan++;
1018
                }
1019
                break;
1020
        default:                /* Oh dear.  Called inappropriately. */
1021
                regerror("internal foulup");
1022
                count = 0;       /* Best compromise. */
1023
                break;
1024
        }
1025
        reginput = scan;
1026
 
1027
        return(count);
1028
}
1029
 
1030
/*
1031
 - regnext - dig the "next" pointer out of a node
1032
 */
1033
static char *
1034
regnext(p)
1035
register char *p;
1036
{
1037
        register int offset;
1038
 
1039
        if (p == &regdummy)
1040
                return(NULL);
1041
 
1042
        offset = NEXT(p);
1043
        if (offset == 0)
1044
                return(NULL);
1045
 
1046
        if (OP(p) == BACK)
1047
                return(p-offset);
1048
        else
1049
                return(p+offset);
1050
}
1051
 
1052
#ifdef DEBUG
1053
 
1054
STATIC char *regprop();
1055
 
1056
/*
1057
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1058
 */
1059
void
1060
regdump(r)
1061
regexp *r;
1062
{
1063
        register char *s;
1064
        register char op = EXACTLY;     /* Arbitrary non-END op. */
1065
        register char *next;
1066
        extern char *strchr();
1067
 
1068
 
1069
        s = r->program + 1;
1070
        while (op != END) {     /* While that wasn't END last time... */
1071
                op = OP(s);
1072
                printf("%2d%s", s-r->program, regprop(s));      /* Where, what. */
1073
                next = regnext(s);
1074
                if (next == NULL)               /* Next ptr. */
1075
                        printf("(0)");
1076
                else
1077
                        printf("(%d)", (s-r->program)+(next-s));
1078
                s += 3;
1079
                if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1080
                        /* Literal string, where present. */
1081
                        while (*s != '\0') {
1082
                                putchar(*s);
1083
                                s++;
1084
                        }
1085
                        s++;
1086
                }
1087
                putchar('\n');
1088
        }
1089
 
1090
        /* Header fields of interest. */
1091
        if (r->regstart != '\0')
1092
                printf("start `%c' ", r->regstart);
1093
        if (r->reganch)
1094
                printf("anchored ");
1095
        if (r->regmust != NULL)
1096
                printf("must have \"%s\"", r->regmust);
1097
        printf("\n");
1098
}
1099
 
1100
/*
1101
 - regprop - printable representation of opcode
1102
 */
1103
static char *
1104
regprop(op)
1105
char *op;
1106
{
1107
        register char *p;
1108
        static char buf[50];
1109
 
1110
        (void) strcpy(buf, ":");
1111
 
1112
        switch (OP(op)) {
1113
        case BOL:
1114
                p = "BOL";
1115
                break;
1116
        case EOL:
1117
                p = "EOL";
1118
                break;
1119
        case ANY:
1120
                p = "ANY";
1121
                break;
1122
        case ANYOF:
1123
                p = "ANYOF";
1124
                break;
1125
        case ANYBUT:
1126
                p = "ANYBUT";
1127
                break;
1128
        case BRANCH:
1129
                p = "BRANCH";
1130
                break;
1131
        case EXACTLY:
1132
                p = "EXACTLY";
1133
                break;
1134
        case NOTHING:
1135
                p = "NOTHING";
1136
                break;
1137
        case BACK:
1138
                p = "BACK";
1139
                break;
1140
        case END:
1141
                p = "END";
1142
                break;
1143
        case OPEN+1:
1144
        case OPEN+2:
1145
        case OPEN+3:
1146
        case OPEN+4:
1147
        case OPEN+5:
1148
        case OPEN+6:
1149
        case OPEN+7:
1150
        case OPEN+8:
1151
        case OPEN+9:
1152
                sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1153
                p = NULL;
1154
                break;
1155
        case CLOSE+1:
1156
        case CLOSE+2:
1157
        case CLOSE+3:
1158
        case CLOSE+4:
1159
        case CLOSE+5:
1160
        case CLOSE+6:
1161
        case CLOSE+7:
1162
        case CLOSE+8:
1163
        case CLOSE+9:
1164
                sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1165
                p = NULL;
1166
                break;
1167
        case STAR:
1168
                p = "STAR";
1169
                break;
1170
        case PLUS:
1171
                p = "PLUS";
1172
                break;
1173
        default:
1174
                regerror("corrupted opcode");
1175
                break;
1176
        }
1177
        if (p != NULL)
1178
                (void) strcat(buf, p);
1179
        return(buf);
1180
}
1181
#endif
1182
 
1183
/*
1184
 * The following is provided for those people who do not have strcspn() in
1185
 * their C libraries.  They should get off their butts and do something
1186
 * about it; at least one public-domain implementation of those (highly
1187
 * useful) string routines has been published on Usenet.
1188
 */
1189
#ifdef STRCSPN
1190
/*
1191
 * strcspn - find length of initial segment of s1 consisting entirely
1192
 * of characters not from s2
1193
 */
1194
 
1195
static int
1196
strcspn(s1, s2)
1197
char *s1;
1198
char *s2;
1199
{
1200
        register char *scan1;
1201
        register char *scan2;
1202
        register int count;
1203
 
1204
        count = 0;
1205
        for (scan1 = s1; *scan1 != '\0'; scan1++) {
1206
                for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
1207
                        if (*scan1 == *scan2++)
1208
                                return(count);
1209
                count++;
1210
        }
1211
        return(count);
1212
}
1213
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.