OpenCores
URL https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk

Subversion Repositories openrisc_me

[/] [openrisc/] [trunk/] [gnu-src/] [gdb-7.1/] [gdb/] [gnulib/] [str-two-way.h] - Blame information for rev 309

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 227 jeremybenn
/* Byte-wise substring search, using the Two-Way algorithm.
2
   Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Written by Eric Blake <ebb9@byu.net>, 2008.
5
 
6
   This program is free software; you can redistribute it and/or modify
7
   it under the terms of the GNU General Public License as published by
8
   the Free Software Foundation; either version 3, or (at your option)
9
   any later version.
10
 
11
   This program is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
   GNU General Public License for more details.
15
 
16
   You should have received a copy of the GNU General Public License along
17
   with this program; if not, write to the Free Software Foundation,
18
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19
 
20
/* Before including this file, you need to include <config.h> and
21
   <string.h>, and define:
22
     RESULT_TYPE             A macro that expands to the return type.
23
     AVAILABLE(h, h_l, j, n_l)
24
                             A macro that returns nonzero if there are
25
                             at least N_L bytes left starting at H[J].
26
                             H is 'unsigned char *', H_L, J, and N_L
27
                             are 'size_t'; H_L is an lvalue.  For
28
                             NUL-terminated searches, H_L can be
29
                             modified each iteration to avoid having
30
                             to compute the end of H up front.
31
 
32
  For case-insensitivity, you may optionally define:
33
     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
34
                             characters of P1 and P2 are equal.
35
     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
36
                             it has been fetched from one of the two strings.
37
                             The argument is an 'unsigned char'; the result
38
                             must be an 'unsigned char' as well.
39
 
40
  This file undefines the macros documented above, and defines
41
  LONG_NEEDLE_THRESHOLD.
42
*/
43
 
44
#include <limits.h>
45
#include <stdint.h>
46
 
47
/* We use the Two-Way string matching algorithm, which guarantees
48
   linear complexity with constant space.  Additionally, for long
49
   needles, we also use a bad character shift table similar to the
50
   Boyer-Moore algorithm to achieve improved (potentially sub-linear)
51
   performance.
52
 
53
   See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
54
   and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
55
*/
56
 
57
/* Point at which computing a bad-byte shift table is likely to be
58
   worthwhile.  Small needles should not compute a table, since it
59
   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
60
   speedup no greater than a factor of NEEDLE_LEN.  The larger the
61
   needle, the better the potential performance gain.  On the other
62
   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
63
   memory required for the table is prohibitive.  */
64
#if CHAR_BIT < 10
65
# define LONG_NEEDLE_THRESHOLD 32U
66
#else
67
# define LONG_NEEDLE_THRESHOLD SIZE_MAX
68
#endif
69
 
70
#define MAX(a, b) ((a < b) ? (b) : (a))
71
 
72
#ifndef CANON_ELEMENT
73
# define CANON_ELEMENT(c) c
74
#endif
75
#ifndef CMP_FUNC
76
# define CMP_FUNC memcmp
77
#endif
78
 
79
/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
80
   Return the index of the first byte in the right half, and set
81
   *PERIOD to the global period of the right half.
82
 
83
   The global period of a string is the smallest index (possibly its
84
   length) at which all remaining bytes in the string are repetitions
85
   of the prefix (the last repetition may be a subset of the prefix).
86
 
87
   When NEEDLE is factored into two halves, a local period is the
88
   length of the smallest word that shares a suffix with the left half
89
   and shares a prefix with the right half.  All factorizations of a
90
   non-empty NEEDLE have a local period of at least 1 and no greater
91
   than NEEDLE_LEN.
92
 
93
   A critical factorization has the property that the local period
94
   equals the global period.  All strings have at least one critical
95
   factorization with the left half smaller than the global period.
96
 
97
   Given an ordered alphabet, a critical factorization can be computed
98
   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
99
   larger of two ordered maximal suffixes.  The ordered maximal
100
   suffixes are determined by lexicographic comparison of
101
   periodicity.  */
102
static size_t
103
critical_factorization (const unsigned char *needle, size_t needle_len,
104
                        size_t *period)
105
{
106
  /* Index of last byte of left half, or SIZE_MAX.  */
107
  size_t max_suffix, max_suffix_rev;
108
  size_t j; /* Index into NEEDLE for current candidate suffix.  */
109
  size_t k; /* Offset into current period.  */
110
  size_t p; /* Intermediate period.  */
111
  unsigned char a, b; /* Current comparison bytes.  */
112
 
113
  /* Invariants:
114
 
115
     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
116
     min(max_suffix, max_suffix_rev) < global period of NEEDLE
117
     1 <= p <= global period of NEEDLE
118
     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
119
     1 <= k <= p
120
  */
121
 
122
  /* Perform lexicographic search.  */
123
  max_suffix = SIZE_MAX;
124
  j = 0;
125
  k = p = 1;
126
  while (j + k < needle_len)
127
    {
128
      a = CANON_ELEMENT (needle[j + k]);
129
      b = CANON_ELEMENT (needle[max_suffix + k]);
130
      if (a < b)
131
        {
132
          /* Suffix is smaller, period is entire prefix so far.  */
133
          j += k;
134
          k = 1;
135
          p = j - max_suffix;
136
        }
137
      else if (a == b)
138
        {
139
          /* Advance through repetition of the current period.  */
140
          if (k != p)
141
            ++k;
142
          else
143
            {
144
              j += p;
145
              k = 1;
146
            }
147
        }
148
      else /* b < a */
149
        {
150
          /* Suffix is larger, start over from current location.  */
151
          max_suffix = j++;
152
          k = p = 1;
153
        }
154
    }
155
  *period = p;
156
 
157
  /* Perform reverse lexicographic search.  */
158
  max_suffix_rev = SIZE_MAX;
159
  j = 0;
160
  k = p = 1;
161
  while (j + k < needle_len)
162
    {
163
      a = CANON_ELEMENT (needle[j + k]);
164
      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
165
      if (b < a)
166
        {
167
          /* Suffix is smaller, period is entire prefix so far.  */
168
          j += k;
169
          k = 1;
170
          p = j - max_suffix_rev;
171
        }
172
      else if (a == b)
173
        {
174
          /* Advance through repetition of the current period.  */
175
          if (k != p)
176
            ++k;
177
          else
178
            {
179
              j += p;
180
              k = 1;
181
            }
182
        }
183
      else /* a < b */
184
        {
185
          /* Suffix is larger, start over from current location.  */
186
          max_suffix_rev = j++;
187
          k = p = 1;
188
        }
189
    }
190
 
191
  /* Choose the longer suffix.  Return the first byte of the right
192
     half, rather than the last byte of the left half.  */
193
  if (max_suffix_rev + 1 < max_suffix + 1)
194
    return max_suffix + 1;
195
  *period = p;
196
  return max_suffix_rev + 1;
197
}
198
 
199
/* Return the first location of non-empty NEEDLE within HAYSTACK, or
200
   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
201
   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
202
   Performance is guaranteed to be linear, with an initialization cost
203
   of 2 * NEEDLE_LEN comparisons.
204
 
205
   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
206
   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
207
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
208
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
209
static RETURN_TYPE
210
two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
211
                      const unsigned char *needle, size_t needle_len)
212
{
213
  size_t i; /* Index into current byte of NEEDLE.  */
214
  size_t j; /* Index into current window of HAYSTACK.  */
215
  size_t period; /* The period of the right half of needle.  */
216
  size_t suffix; /* The index of the right half of needle.  */
217
 
218
  /* Factor the needle into two halves, such that the left half is
219
     smaller than the global period, and the right half is
220
     periodic (with a period as large as NEEDLE_LEN - suffix).  */
221
  suffix = critical_factorization (needle, needle_len, &period);
222
 
223
  /* Perform the search.  Each iteration compares the right half
224
     first.  */
225
  if (CMP_FUNC (needle, needle + period, suffix) == 0)
226
    {
227
      /* Entire needle is periodic; a mismatch can only advance by the
228
         period, so use memory to avoid rescanning known occurrences
229
         of the period.  */
230
      size_t memory = 0;
231
      j = 0;
232
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
233
        {
234
          /* Scan for matches in right half.  */
235
          i = MAX (suffix, memory);
236
          while (i < needle_len && (CANON_ELEMENT (needle[i])
237
                                    == CANON_ELEMENT (haystack[i + j])))
238
            ++i;
239
          if (needle_len <= i)
240
            {
241
              /* Scan for matches in left half.  */
242
              i = suffix - 1;
243
              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
244
                                        == CANON_ELEMENT (haystack[i + j])))
245
                --i;
246
              if (i + 1 < memory + 1)
247
                return (RETURN_TYPE) (haystack + j);
248
              /* No match, so remember how many repetitions of period
249
                 on the right half were scanned.  */
250
              j += period;
251
              memory = needle_len - period;
252
            }
253
          else
254
            {
255
              j += i - suffix + 1;
256
              memory = 0;
257
            }
258
        }
259
    }
260
  else
261
    {
262
      /* The two halves of needle are distinct; no extra memory is
263
         required, and any mismatch results in a maximal shift.  */
264
      period = MAX (suffix, needle_len - suffix) + 1;
265
      j = 0;
266
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
267
        {
268
          /* Scan for matches in right half.  */
269
          i = suffix;
270
          while (i < needle_len && (CANON_ELEMENT (needle[i])
271
                                    == CANON_ELEMENT (haystack[i + j])))
272
            ++i;
273
          if (needle_len <= i)
274
            {
275
              /* Scan for matches in left half.  */
276
              i = suffix - 1;
277
              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
278
                                       == CANON_ELEMENT (haystack[i + j])))
279
                --i;
280
              if (i == SIZE_MAX)
281
                return (RETURN_TYPE) (haystack + j);
282
              j += period;
283
            }
284
          else
285
            j += i - suffix + 1;
286
        }
287
    }
288
  return NULL;
289
}
290
 
291
/* Return the first location of non-empty NEEDLE within HAYSTACK, or
292
   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
293
   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
294
   Performance is guaranteed to be linear, with an initialization cost
295
   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
296
 
297
   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
298
   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
299
   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
300
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
301
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
302
   sublinear performance is not possible.  */
303
static RETURN_TYPE
304
two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
305
                     const unsigned char *needle, size_t needle_len)
306
{
307
  size_t i; /* Index into current byte of NEEDLE.  */
308
  size_t j; /* Index into current window of HAYSTACK.  */
309
  size_t period; /* The period of the right half of needle.  */
310
  size_t suffix; /* The index of the right half of needle.  */
311
  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
312
 
313
  /* Factor the needle into two halves, such that the left half is
314
     smaller than the global period, and the right half is
315
     periodic (with a period as large as NEEDLE_LEN - suffix).  */
316
  suffix = critical_factorization (needle, needle_len, &period);
317
 
318
  /* Populate shift_table.  For each possible byte value c,
319
     shift_table[c] is the distance from the last occurrence of c to
320
     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
321
     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
322
  for (i = 0; i < 1U << CHAR_BIT; i++)
323
    shift_table[i] = needle_len;
324
  for (i = 0; i < needle_len; i++)
325
    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
326
 
327
  /* Perform the search.  Each iteration compares the right half
328
     first.  */
329
  if (CMP_FUNC (needle, needle + period, suffix) == 0)
330
    {
331
      /* Entire needle is periodic; a mismatch can only advance by the
332
         period, so use memory to avoid rescanning known occurrences
333
         of the period.  */
334
      size_t memory = 0;
335
      size_t shift;
336
      j = 0;
337
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
338
        {
339
          /* Check the last byte first; if it does not match, then
340
             shift to the next possible match location.  */
341
          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
342
          if (0 < shift)
343
            {
344
              if (memory && shift < period)
345
                {
346
                  /* Since needle is periodic, but the last period has
347
                     a byte out of place, there can be no match until
348
                     after the mismatch.  */
349
                  shift = needle_len - period;
350
                  memory = 0;
351
                }
352
              j += shift;
353
              continue;
354
            }
355
          /* Scan for matches in right half.  The last byte has
356
             already been matched, by virtue of the shift table.  */
357
          i = MAX (suffix, memory);
358
          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
359
                                        == CANON_ELEMENT (haystack[i + j])))
360
            ++i;
361
          if (needle_len - 1 <= i)
362
            {
363
              /* Scan for matches in left half.  */
364
              i = suffix - 1;
365
              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
366
                                        == CANON_ELEMENT (haystack[i + j])))
367
                --i;
368
              if (i + 1 < memory + 1)
369
                return (RETURN_TYPE) (haystack + j);
370
              /* No match, so remember how many repetitions of period
371
                 on the right half were scanned.  */
372
              j += period;
373
              memory = needle_len - period;
374
            }
375
          else
376
            {
377
              j += i - suffix + 1;
378
              memory = 0;
379
            }
380
        }
381
    }
382
  else
383
    {
384
      /* The two halves of needle are distinct; no extra memory is
385
         required, and any mismatch results in a maximal shift.  */
386
      size_t shift;
387
      period = MAX (suffix, needle_len - suffix) + 1;
388
      j = 0;
389
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
390
        {
391
          /* Check the last byte first; if it does not match, then
392
             shift to the next possible match location.  */
393
          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
394
          if (0 < shift)
395
            {
396
              j += shift;
397
              continue;
398
            }
399
          /* Scan for matches in right half.  The last byte has
400
             already been matched, by virtue of the shift table.  */
401
          i = suffix;
402
          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
403
                                        == CANON_ELEMENT (haystack[i + j])))
404
            ++i;
405
          if (needle_len - 1 <= i)
406
            {
407
              /* Scan for matches in left half.  */
408
              i = suffix - 1;
409
              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
410
                                       == CANON_ELEMENT (haystack[i + j])))
411
                --i;
412
              if (i == SIZE_MAX)
413
                return (RETURN_TYPE) (haystack + j);
414
              j += period;
415
            }
416
          else
417
            j += i - suffix + 1;
418
        }
419
    }
420
  return NULL;
421
}
422
 
423
#undef AVAILABLE
424
#undef CANON_ELEMENT
425
#undef CMP_FUNC
426
#undef MAX
427
#undef RETURN_TYPE

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.