OpenCores
URL https://opencores.org/ocsvn/an-fpga-implementation-of-low-latency-noc-based-mpsoc/an-fpga-implementation-of-low-latency-noc-based-mpsoc/trunk

Subversion Repositories an-fpga-implementation-of-low-latency-noc-based-mpsoc

[/] [an-fpga-implementation-of-low-latency-noc-based-mpsoc/] [trunk/] [mpsoc/] [Integration_test/] [synthetic_sim/] [perl_lib/] [String/] [fstrcmp.c] - Blame information for rev 56

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 56 alirezamon
/* Functions to make fuzzy comparisons between strings
2
   Copyright (C) 1988, 1989, 1992, 1993, 1995 Free Software Foundation, Inc.
3
 
4
   This program is free software; you can redistribute it and/or modify
5
   it under the terms of the GNU General Public License as published by
6
   the Free Software Foundation; either version 2 of the License, or (at
7
   your option) any later version.
8
 
9
   This program is distributed in the hope that it will be useful, but
10
   WITHOUT ANY WARRANTY; without even the implied warranty of
11
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
   General Public License for more details.
13
 
14
   You should have received a copy of the GNU General Public License
15
   along with this program; if not, write to the Free Software
16
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
 
18
 
19
   Derived from GNU diff 2.7, analyze.c et al.
20
 
21
   The basic algorithm is described in:
22
   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23
   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24
   see especially section 4.2, which describes the variation used below.
25
 
26
   The basic algorithm was independently discovered as described in:
27
   "Algorithms for Approximate String Matching", E. Ukkonen,
28
   Information and Control Vol. 64, 1985, pp. 100-118.
29
 
30
   Modified to work on strings rather than files
31
   by Peter Miller <pmiller@agso.gov.au>, October 1995
32
 
33
   Modified to accept a "minimum similarity limit" to stop analyzing the
34
   string when the similarity drops below the given limit by Marc Lehmann
35
   <schmorp@schmorp.de>.
36
 
37
   Modified to work on unicode (actually 31 bit are allowed) by Marc Lehmann
38
   <schmorp@schmorp.de>.
39
*/
40
 
41
#include <string.h>
42
#include <stdio.h>
43
#include <stdlib.h>
44
#include <limits.h>
45
 
46
#include "fstrcmp.h"
47
 
48
#define PARAMS(proto) proto
49
 
50
/*
51
 * Data on one input string being compared.
52
 */
53
struct string_data
54
{
55
  /* The string to be compared. */
56
  const UV *data;
57
 
58
  /* The length of the string to be compared. */
59
  int data_length;
60
 
61
  /* The number of characters inserted or deleted. */
62
  int edit_count;
63
};
64
 
65
static struct string_data string[2];
66
 
67
static int max_edits; /* compareseq stops when edits > max_edits */
68
 
69
#ifdef MINUS_H_FLAG
70
 
71
/* This corresponds to the diff -H flag.  With this heuristic, for
72
   strings with a constant small density of changes, the algorithm is
73
   linear in the strings size.  This is unlikely in typical uses of
74
   fstrcmp, and so is usually compiled out.  Besides, there is no
75
   interface to set it true.  */
76
static int heuristic;
77
 
78
#endif
79
 
80
 
81
/* Vector, indexed by diagonal, containing 1 + the X coordinate of the
82
   point furthest along the given diagonal in the forward search of the
83
   edit matrix.  */
84
static int *fdiag;
85
 
86
/* Vector, indexed by diagonal, containing the X coordinate of the point
87
   furthest along the given diagonal in the backward search of the edit
88
   matrix.  */
89
static int *bdiag;
90
 
91
/* Edit scripts longer than this are too expensive to compute.  */
92
static int too_expensive;
93
 
94
/* Snakes bigger than this are considered `big'.  */
95
#define SNAKE_LIMIT     20
96
 
97
struct partition
98
{
99
  /* Midpoints of this partition.  */
100
  int xmid, ymid;
101
 
102
  /* Nonzero if low half will be analyzed minimally.  */
103
  int lo_minimal;
104
 
105
  /* Likewise for high half.  */
106
  int hi_minimal;
107
};
108
 
109
 
110
/* NAME
111
        diag - find diagonal path
112
 
113
   SYNOPSIS
114
        int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
115
                struct partition *part);
116
 
117
   DESCRIPTION
118
        Find the midpoint of the shortest edit script for a specified
119
        portion of the two strings.
120
 
121
        Scan from the beginnings of the strings, and simultaneously from
122
        the ends, doing a breadth-first search through the space of
123
        edit-sequence.  When the two searches meet, we have found the
124
        midpoint of the shortest edit sequence.
125
 
126
        If MINIMAL is nonzero, find the minimal edit script regardless
127
        of expense.  Otherwise, if the search is too expensive, use
128
        heuristics to stop the search and report a suboptimal answer.
129
 
130
   RETURNS
131
        Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
132
        number XMID - YMID equals the number of inserted characters
133
        minus the number of deleted characters (counting only characters
134
        before the midpoint).  Return the approximate edit cost; this is
135
        the total number of characters inserted or deleted (counting
136
        only characters before the midpoint), unless a heuristic is used
137
        to terminate the search prematurely.
138
 
139
        Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
140
        for the left half of the partition is known; similarly for
141
        PART->RIGHT_MINIMAL.
142
 
143
   CAVEAT
144
        This function assumes that the first characters of the specified
145
        portions of the two strings do not match, and likewise that the
146
        last characters do not match.  The caller must trim matching
147
        characters from the beginning and end of the portions it is
148
        going to specify.
149
 
150
        If we return the "wrong" partitions, the worst this can do is
151
        cause suboptimal diff output.  It cannot cause incorrect diff
152
        output.  */
153
 
154
static int diag PARAMS ((int, int, int, int, int, struct partition *));
155
 
156
static int
157
diag (xoff, xlim, yoff, ylim, minimal, part)
158
     int xoff;
159
     int xlim;
160
     int yoff;
161
     int ylim;
162
     int minimal;
163
     struct partition *part;
164
{
165
  int *const fd = fdiag;        /* Give the compiler a chance. */
166
  int *const bd = bdiag;        /* Additional help for the compiler. */
167
  const UV *const xv = string[0].data;   /* Still more help for the compiler. */
168
  const UV *const yv = string[1].data;  /* And more and more . . . */
169
  const int dmin = xoff - ylim; /* Minimum valid diagonal. */
170
  const int dmax = xlim - yoff; /* Maximum valid diagonal. */
171
  const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
172
  const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
173
  int fmin = fmid;
174
  int fmax = fmid;              /* Limits of top-down search. */
175
  int bmin = bmid;
176
  int bmax = bmid;              /* Limits of bottom-up search. */
177
  int c;                        /* Cost. */
178
  int odd = (fmid - bmid) & 1;
179
 
180
  /*
181
         * True if southeast corner is on an odd diagonal with respect
182
         * to the northwest.
183
         */
184
  fd[fmid] = xoff;
185
  bd[bmid] = xlim;
186
  for (c = 1;; ++c)
187
    {
188
      int d;                    /* Active diagonal. */
189
      int big_snake;
190
 
191
      big_snake = 0;
192
      /* Extend the top-down search by an edit step in each diagonal. */
193
      if (fmin > dmin)
194
        fd[--fmin - 1] = -1;
195
      else
196
        ++fmin;
197
      if (fmax < dmax)
198
        fd[++fmax + 1] = -1;
199
      else
200
        --fmax;
201
      for (d = fmax; d >= fmin; d -= 2)
202
        {
203
          int x;
204
          int y;
205
          int oldx;
206
          int tlo;
207
          int thi;
208
 
209
          tlo = fd[d - 1],
210
            thi = fd[d + 1];
211
 
212
          if (tlo >= thi)
213
            x = tlo + 1;
214
          else
215
            x = thi;
216
          oldx = x;
217
          y = x - d;
218
          while (x < xlim && y < ylim && xv[x] == yv[y])
219
            {
220
              ++x;
221
              ++y;
222
            }
223
          if (x - oldx > SNAKE_LIMIT)
224
            big_snake = 1;
225
          fd[d] = x;
226
          if (odd && bmin <= d && d <= bmax && bd[d] <= x)
227
            {
228
              part->xmid = x;
229
              part->ymid = y;
230
              part->lo_minimal = part->hi_minimal = 1;
231
              return 2 * c - 1;
232
            }
233
        }
234
      /* Similarly extend the bottom-up search.  */
235
      if (bmin > dmin)
236
        bd[--bmin - 1] = INT_MAX;
237
      else
238
        ++bmin;
239
      if (bmax < dmax)
240
        bd[++bmax + 1] = INT_MAX;
241
      else
242
        --bmax;
243
      for (d = bmax; d >= bmin; d -= 2)
244
        {
245
          int x;
246
          int y;
247
          int oldx;
248
          int tlo;
249
          int thi;
250
 
251
          tlo = bd[d - 1],
252
            thi = bd[d + 1];
253
          if (tlo < thi)
254
            x = tlo;
255
          else
256
            x = thi - 1;
257
          oldx = x;
258
          y = x - d;
259
          while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
260
            {
261
              --x;
262
              --y;
263
            }
264
          if (oldx - x > SNAKE_LIMIT)
265
            big_snake = 1;
266
          bd[d] = x;
267
          if (!odd && fmin <= d && d <= fmax && x <= fd[d])
268
            {
269
              part->xmid = x;
270
              part->ymid = y;
271
              part->lo_minimal = part->hi_minimal = 1;
272
              return 2 * c;
273
            }
274
        }
275
 
276
      if (minimal)
277
        continue;
278
 
279
#ifdef MINUS_H_FLAG
280
      /* Heuristic: check occasionally for a diagonal that has made lots
281
         of progress compared with the edit distance.  If we have any
282
         such, find the one that has made the most progress and return
283
         it as if it had succeeded.
284
 
285
         With this heuristic, for strings with a constant small density
286
         of changes, the algorithm is linear in the strings size.  */
287
      if (c > 200 && big_snake && heuristic)
288
        {
289
          int best;
290
 
291
          best = 0;
292
          for (d = fmax; d >= fmin; d -= 2)
293
            {
294
              int dd;
295
              int x;
296
              int y;
297
              int v;
298
 
299
              dd = d - fmid;
300
              x = fd[d];
301
              y = x - d;
302
              v = (x - xoff) * 2 - dd;
303
 
304
              if (v > 12 * (c + (dd < 0 ? -dd : dd)))
305
                {
306
                  if
307
                    (
308
                      v > best
309
                      &&
310
                      xoff + SNAKE_LIMIT <= x
311
                      &&
312
                      x < xlim
313
                      &&
314
                      yoff + SNAKE_LIMIT <= y
315
                      &&
316
                      y < ylim
317
                    )
318
                    {
319
                      /* We have a good enough best diagonal; now insist
320
                         that it end with a significant snake.  */
321
                      int k;
322
 
323
                      for (k = 1; xv[x - k] == yv[y - k]; k++)
324
                        {
325
                          if (k == SNAKE_LIMIT)
326
                            {
327
                              best = v;
328
                              part->xmid = x;
329
                              part->ymid = y;
330
                              break;
331
                            }
332
                        }
333
                    }
334
                }
335
            }
336
          if (best > 0)
337
            {
338
              part->lo_minimal = 1;
339
              part->hi_minimal = 0;
340
              return 2 * c - 1;
341
            }
342
          best = 0;
343
          for (d = bmax; d >= bmin; d -= 2)
344
            {
345
              int dd;
346
              int x;
347
              int y;
348
              int v;
349
 
350
              dd = d - bmid;
351
              x = bd[d];
352
              y = x - d;
353
              v = (xlim - x) * 2 + dd;
354
 
355
              if (v > 12 * (c + (dd < 0 ? -dd : dd)))
356
                {
357
                  if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
358
                      yoff < y && y <= ylim - SNAKE_LIMIT)
359
                    {
360
                      /* We have a good enough best diagonal; now insist
361
                         that it end with a significant snake.  */
362
                      int k;
363
 
364
                      for (k = 0; xv[x + k] == yv[y + k]; k++)
365
                        {
366
                          if (k == SNAKE_LIMIT - 1)
367
                            {
368
                              best = v;
369
                              part->xmid = x;
370
                              part->ymid = y;
371
                              break;
372
                            }
373
                        }
374
                    }
375
                }
376
            }
377
          if (best > 0)
378
            {
379
              part->lo_minimal = 0;
380
              part->hi_minimal = 1;
381
              return 2 * c - 1;
382
            }
383
        }
384
#endif /* MINUS_H_FLAG */
385
 
386
      /* Heuristic: if we've gone well beyond the call of duty, give up
387
         and report halfway between our best results so far.  */
388
      if (c >= too_expensive)
389
        {
390
          int fxybest;
391
          int fxbest;
392
          int bxybest;
393
          int bxbest;
394
 
395
          /* Pacify `gcc -Wall'. */
396
          fxbest = 0;
397
          bxbest = 0;
398
 
399
          /* Find forward diagonal that maximizes X + Y.  */
400
          fxybest = -1;
401
          for (d = fmax; d >= fmin; d -= 2)
402
            {
403
              int x;
404
              int y;
405
 
406
              x = fd[d] < xlim ? fd[d] : xlim;
407
              y = x - d;
408
 
409
              if (ylim < y)
410
                {
411
                  x = ylim + d;
412
                  y = ylim;
413
                }
414
              if (fxybest < x + y)
415
                {
416
                  fxybest = x + y;
417
                  fxbest = x;
418
                }
419
            }
420
          /* Find backward diagonal that minimizes X + Y.  */
421
          bxybest = INT_MAX;
422
          for (d = bmax; d >= bmin; d -= 2)
423
            {
424
              int x;
425
              int y;
426
 
427
              x = xoff > bd[d] ? xoff : bd[d];
428
              y = x - d;
429
 
430
              if (y < yoff)
431
                {
432
                  x = yoff + d;
433
                  y = yoff;
434
                }
435
              if (x + y < bxybest)
436
                {
437
                  bxybest = x + y;
438
                  bxbest = x;
439
                }
440
            }
441
          /* Use the better of the two diagonals.  */
442
          if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
443
            {
444
              part->xmid = fxbest;
445
              part->ymid = fxybest - fxbest;
446
              part->lo_minimal = 1;
447
              part->hi_minimal = 0;
448
            }
449
          else
450
            {
451
              part->xmid = bxbest;
452
              part->ymid = bxybest - bxbest;
453
              part->lo_minimal = 0;
454
              part->hi_minimal = 1;
455
            }
456
          return 2 * c - 1;
457
        }
458
    }
459
}
460
 
461
 
462
/* NAME
463
        compareseq - find edit sequence
464
 
465
   SYNOPSIS
466
        void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
467
 
468
   DESCRIPTION
469
        Compare in detail contiguous subsequences of the two strings
470
        which are known, as a whole, to match each other.
471
 
472
        The subsequence of string 0 is [XOFF, XLIM) and likewise for
473
        string 1.
474
 
475
        Note that XLIM, YLIM are exclusive bounds.  All character
476
        numbers are origin-0.
477
 
478
        If MINIMAL is nonzero, find a minimal difference no matter how
479
        expensive it is.  */
480
 
481
static void compareseq PARAMS ((int, int, int, int, int));
482
 
483
static void
484
compareseq (xoff, xlim, yoff, ylim, minimal)
485
     int xoff;
486
     int xlim;
487
     int yoff;
488
     int ylim;
489
     int minimal;
490
{
491
  const UV *const xv = string[0].data;   /* Help the compiler.  */
492
  const UV *const yv = string[1].data;
493
 
494
  if (string[1].edit_count + string[0].edit_count > max_edits)
495
    return;
496
 
497
  /* Slide down the bottom initial diagonal. */
498
  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
499
    {
500
      ++xoff;
501
      ++yoff;
502
    }
503
 
504
  /* Slide up the top initial diagonal. */
505
  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
506
    {
507
      --xlim;
508
      --ylim;
509
    }
510
 
511
  /* Handle simple cases. */
512
  if (xoff == xlim)
513
    {
514
      while (yoff < ylim)
515
        {
516
          ++string[1].edit_count;
517
          ++yoff;
518
        }
519
    }
520
  else if (yoff == ylim)
521
    {
522
      while (xoff < xlim)
523
        {
524
          ++string[0].edit_count;
525
          ++xoff;
526
        }
527
    }
528
  else
529
    {
530
      int c;
531
      struct partition part;
532
 
533
      /* Find a point of correspondence in the middle of the strings.  */
534
      c = diag (xoff, xlim, yoff, ylim, minimal, &part);
535
      if (c == 1)
536
        {
537
#if 0
538
          /* This should be impossible, because it implies that one of
539
             the two subsequences is empty, and that case was handled
540
             above without calling `diag'.  Let's verify that this is
541
             true.  */
542
          abort ();
543
#else
544
          /* The two subsequences differ by a single insert or delete;
545
             record it and we are done.  */
546
          if (part.xmid - part.ymid < xoff - yoff)
547
            ++string[1].edit_count;
548
          else
549
            ++string[0].edit_count;
550
#endif
551
        }
552
      else
553
        {
554
          /* Use the partitions to split this problem into subproblems.  */
555
          compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
556
          compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
557
        }
558
    }
559
}
560
 
561
 
562
/* NAME
563
        fstrcmp - fuzzy string compare
564
 
565
   SYNOPSIS
566
        double fstrcmp(const ChaR *s1, int l1, const UV *s2, int l2, double);
567
 
568
   DESCRIPTION
569
        The fstrcmp function may be used to compare two string for
570
        similarity.  It is very useful in reducing "cascade" or
571
        "secondary" errors in compilers or other situations where
572
        symbol tables occur.
573
 
574
   RETURNS
575
        double; 0 if the strings are entirly dissimilar, 1 if the
576
        strings are identical, and a number in between if they are
577
        similar.  */
578
 
579
double
580
fstrcmp (const UV *string1, int length1,
581
         const UV *string2, int length2,
582
         double minimum)
583
{
584
  int i;
585
 
586
  size_t fdiag_len;
587
  static int *fdiag_buf;
588
  static size_t fdiag_max;
589
 
590
  /* set the info for each string.  */
591
  string[0].data = string1;
592
  string[0].data_length = length1;
593
  string[1].data = string2;
594
  string[1].data_length = length2;
595
 
596
  /* short-circuit obvious comparisons */
597
  if (string[0].data_length == 0 && string[1].data_length == 0)
598
    return 1.0;
599
  if (string[0].data_length == 0 || string[1].data_length == 0)
600
    return 0.0;
601
 
602
  /* Set TOO_EXPENSIVE to be approximate square root of input size,
603
     bounded below by 256.  */
604
  too_expensive = 1;
605
  for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
606
    too_expensive <<= 1;
607
  if (too_expensive < 256)
608
    too_expensive = 256;
609
 
610
  /* Because fstrcmp is typically called multiple times, while scanning
611
     symbol tables, etc, attempt to minimize the number of memory
612
     allocations performed.  Thus, we use a static buffer for the
613
     diagonal vectors, and never free them.  */
614
  fdiag_len = string[0].data_length + string[1].data_length + 3;
615
  if (fdiag_len > fdiag_max)
616
    {
617
      fdiag_max = fdiag_len;
618
      fdiag_buf = realloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
619
    }
620
  fdiag = fdiag_buf + string[1].data_length + 1;
621
  bdiag = fdiag + fdiag_len;
622
 
623
  max_edits = 1 + (string[0].data_length + string[1].data_length) * (1. - minimum);
624
 
625
  /* Now do the main comparison algorithm */
626
  string[0].edit_count = 0;
627
  string[1].edit_count = 0;
628
  compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
629
 
630
  /* The result is
631
        ((number of chars in common) / (average length of the strings)).
632
     This is admittedly biased towards finding that the strings are
633
     similar, however it does produce meaningful results.  */
634
  return ((double)
635
             (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
636
           / (string[0].data_length + string[1].data_length));
637
 
638
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.