1 |
700 |
jeremybenn |
/*
|
2 |
|
|
Redistribution and use in source and binary forms, with or without
|
3 |
|
|
modification, are permitted provided that the following conditions are met:
|
4 |
|
|
|
5 |
|
|
* Redistributions of source code must retain the above copyright
|
6 |
|
|
notice, this list of conditions and the following disclaimer.
|
7 |
|
|
|
8 |
|
|
* Redistributions in binary form must reproduce the above copyright
|
9 |
|
|
notice, this list of conditions and the following disclaimer in the
|
10 |
|
|
documentation and/or other materials provided with the distribution.
|
11 |
|
|
|
12 |
|
|
* Neither the name of "The Computer Language Benchmarks Game" nor the
|
13 |
|
|
name of "The Computer Language Shootout Benchmarks" nor the names of
|
14 |
|
|
its contributors may be used to endorse or promote products derived
|
15 |
|
|
from this software without specific prior written permission.
|
16 |
|
|
|
17 |
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
18 |
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
19 |
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
20 |
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
21 |
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
22 |
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
23 |
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
24 |
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
25 |
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
26 |
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
27 |
|
|
POSSIBILITY OF SUCH DAMAGE.
|
28 |
|
|
*/
|
29 |
|
|
|
30 |
|
|
/*
|
31 |
|
|
** The Computer Language Shootout
|
32 |
|
|
** http://shootout.alioth.debian.org/
|
33 |
|
|
** contributed by Mike Pall
|
34 |
|
|
**
|
35 |
|
|
** regex-dna benchmark using PCRE
|
36 |
|
|
**
|
37 |
|
|
** compile with:
|
38 |
|
|
** gcc -O3 -fomit-frame-pointer -o regexdna regexdna.c -lpcre
|
39 |
|
|
*/
|
40 |
|
|
|
41 |
|
|
#define __USE_STRING_INLINES
|
42 |
|
|
#include <stdio.h>
|
43 |
|
|
#include <string.h>
|
44 |
|
|
#include <stdlib.h>
|
45 |
|
|
#include <pcre.h>
|
46 |
|
|
|
47 |
|
|
typedef struct fbuf {
|
48 |
|
|
char *buf;
|
49 |
|
|
size_t size, len;
|
50 |
|
|
} fbuf_t;
|
51 |
|
|
|
52 |
|
|
static void fb_init(fbuf_t *b)
|
53 |
|
|
{
|
54 |
|
|
b->buf = NULL;
|
55 |
|
|
b->len = b->size = 0;
|
56 |
|
|
}
|
57 |
|
|
|
58 |
|
|
static char *fb_need(fbuf_t *b, size_t need)
|
59 |
|
|
{
|
60 |
|
|
need += b->len;
|
61 |
|
|
if (need > b->size) {
|
62 |
|
|
if (b->size == 0) b->size = need;
|
63 |
|
|
else while (need > b->size) b->size += b->size;
|
64 |
|
|
if (!(b->buf = realloc(b->buf, b->size))) exit(1);
|
65 |
|
|
}
|
66 |
|
|
return b->buf+b->len;
|
67 |
|
|
}
|
68 |
|
|
|
69 |
|
|
#define FB_MINREAD (3<<16)
|
70 |
|
|
|
71 |
|
|
/* Read all of a stdio stream into dst buffer. */
|
72 |
|
|
static size_t fb_readall(fbuf_t *dst, FILE *fp)
|
73 |
|
|
{
|
74 |
|
|
char *dp;
|
75 |
|
|
int n;
|
76 |
|
|
for (dp = fb_need(dst, FB_MINREAD);
|
77 |
|
|
(n = fread(dp, 1, dst->size-dst->len, fp)) > 0;
|
78 |
|
|
dp = fb_need(dst, FB_MINREAD)) dst->len += n;
|
79 |
|
|
if (ferror(fp)) exit(1);
|
80 |
|
|
return dst->len;
|
81 |
|
|
}
|
82 |
|
|
|
83 |
|
|
/* Substitute pattern p with replacement r, copying from src to dst buffer. */
|
84 |
|
|
static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r)
|
85 |
|
|
{
|
86 |
|
|
pcre *re;
|
87 |
|
|
pcre_extra *re_ex;
|
88 |
|
|
const char *re_e;
|
89 |
|
|
char *dp;
|
90 |
|
|
int re_eo, m[3], pos, rlen, clen;
|
91 |
|
|
if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
|
92 |
|
|
re_ex = pcre_study(re, 0, &re_e);
|
93 |
|
|
for (dst->len = 0, rlen = strlen(r), pos = 0;
|
94 |
|
|
pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
|
95 |
|
|
pos = m[1]) {
|
96 |
|
|
clen = m[0]-pos;
|
97 |
|
|
dp = fb_need(dst, clen+rlen);
|
98 |
|
|
dst->len += clen+rlen;
|
99 |
|
|
memcpy(dp, src->buf+pos, clen);
|
100 |
|
|
memcpy(dp+clen, r, rlen);
|
101 |
|
|
}
|
102 |
|
|
clen = src->len-pos;
|
103 |
|
|
dp = fb_need(dst, clen);
|
104 |
|
|
dst->len += clen;
|
105 |
|
|
memcpy(dp, src->buf+pos, clen);
|
106 |
|
|
return dst->len;
|
107 |
|
|
}
|
108 |
|
|
|
109 |
|
|
/* Count all matches with pattern p in src buffer. */
|
110 |
|
|
static int fb_countmatches(fbuf_t *src, const char *p)
|
111 |
|
|
{
|
112 |
|
|
pcre *re;
|
113 |
|
|
pcre_extra *re_ex;
|
114 |
|
|
const char *re_e;
|
115 |
|
|
int re_eo, m[3], pos, count;
|
116 |
|
|
if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
|
117 |
|
|
re_ex = pcre_study(re, 0, &re_e);
|
118 |
|
|
for (count = 0, pos = 0;
|
119 |
|
|
pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
|
120 |
|
|
pos = m[1]) count++;
|
121 |
|
|
return count;
|
122 |
|
|
}
|
123 |
|
|
|
124 |
|
|
static const char *variants[] = {
|
125 |
|
|
"agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]",
|
126 |
|
|
"a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct",
|
127 |
|
|
"agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct",
|
128 |
|
|
"agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct",
|
129 |
|
|
"agggtaa[cgt]|[acg]ttaccct", NULL
|
130 |
|
|
};
|
131 |
|
|
|
132 |
|
|
static const char *subst[] = {
|
133 |
|
|
"B", "(c|g|t)", "D", "(a|g|t)", "H", "(a|c|t)", "K", "(g|t)",
|
134 |
|
|
"M", "(a|c)", "N", "(a|c|g|t)", "R", "(a|g)", "S", "(c|g)",
|
135 |
|
|
"V", "(a|c|g)", "W", "(a|t)", "Y", "(c|t)", NULL
|
136 |
|
|
};
|
137 |
|
|
|
138 |
|
|
int main(int argc, char **argv)
|
139 |
|
|
{
|
140 |
|
|
fbuf_t seq[2];
|
141 |
|
|
const char **pp;
|
142 |
|
|
size_t ilen, clen, slen;
|
143 |
|
|
int flip;
|
144 |
|
|
fb_init(&seq[0]);
|
145 |
|
|
fb_init(&seq[1]);
|
146 |
|
|
ilen = fb_readall(&seq[0], stdin);
|
147 |
|
|
clen = fb_subst(&seq[1], &seq[0], ">.*|\n", "");
|
148 |
|
|
for (pp = variants; *pp; pp++)
|
149 |
|
|
printf("%s %d\n", *pp, fb_countmatches(&seq[1], *pp));
|
150 |
|
|
for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1-flip)
|
151 |
|
|
slen = fb_subst(&seq[1-flip], &seq[flip], *pp, pp[1]);
|
152 |
|
|
printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen);
|
153 |
|
|
return 0;
|
154 |
|
|
}
|