OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [zlib/] [contrib/] [asm586/] [match.S] - Blame information for rev 767

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 745 jeremybenn
/* match.s -- Pentium-optimized version of longest_match()
2
 * Written for zlib 1.1.2
3
 * Copyright (C) 1998 Brian Raiter 
4
 *
5
 * This is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License.
7
 */
8
 
9
#ifndef NO_UNDERLINE
10
#define match_init      _match_init
11
#define longest_match   _longest_match
12
#endif
13
 
14
#define MAX_MATCH       (258)
15
#define MIN_MATCH       (3)
16
#define MIN_LOOKAHEAD   (MAX_MATCH + MIN_MATCH + 1)
17
#define MAX_MATCH_8     ((MAX_MATCH + 7) & ~7)
18
 
19
/* stack frame offsets */
20
 
21
#define wmask                   0        /* local copy of s->wmask       */
22
#define window                  4       /* local copy of s->window      */
23
#define windowbestlen           8       /* s->window + bestlen          */
24
#define chainlenscanend         12      /* high word: current chain len */
25
                                        /* low word: last bytes sought  */
26
#define scanstart               16      /* first two bytes of string    */
27
#define scanalign               20      /* dword-misalignment of string */
28
#define nicematch               24      /* a good enough match size     */
29
#define bestlen                 28      /* size of best match so far    */
30
#define scan                    32      /* ptr to string wanting match  */
31
 
32
#define LocalVarsSize           (36)
33
/*      saved ebx               36 */
34
/*      saved edi               40 */
35
/*      saved esi               44 */
36
/*      saved ebp               48 */
37
/*      return address          52 */
38
#define deflatestate            56      /* the function arguments       */
39
#define curmatch                60
40
 
41
/* Offsets for fields in the deflate_state structure. These numbers
42
 * are calculated from the definition of deflate_state, with the
43
 * assumption that the compiler will dword-align the fields. (Thus,
44
 * changing the definition of deflate_state could easily cause this
45
 * program to crash horribly, without so much as a warning at
46
 * compile time. Sigh.)
47
 */
48
 
49
/* All the +zlib1222add offsets are due to the addition of fields
50
 *  in zlib in the deflate_state structure since the asm code was first written
51
 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
52
 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
53
 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
54
 */
55
 
56
#define zlib1222add             (8)
57
 
58
#define dsWSize                 (36+zlib1222add)
59
#define dsWMask                 (44+zlib1222add)
60
#define dsWindow                (48+zlib1222add)
61
#define dsPrev                  (56+zlib1222add)
62
#define dsMatchLen              (88+zlib1222add)
63
#define dsPrevMatch             (92+zlib1222add)
64
#define dsStrStart              (100+zlib1222add)
65
#define dsMatchStart            (104+zlib1222add)
66
#define dsLookahead             (108+zlib1222add)
67
#define dsPrevLen               (112+zlib1222add)
68
#define dsMaxChainLen           (116+zlib1222add)
69
#define dsGoodMatch             (132+zlib1222add)
70
#define dsNiceMatch             (136+zlib1222add)
71
 
72
 
73
.file "match.S"
74
 
75
.globl  match_init, longest_match
76
 
77
.text
78
 
79
/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
80
 
81
longest_match:
82
 
83
/* Save registers that the compiler may be using, and adjust %esp to    */
84
/* make room for our stack frame.                                       */
85
 
86
                pushl   %ebp
87
                pushl   %edi
88
                pushl   %esi
89
                pushl   %ebx
90
                subl    $LocalVarsSize, %esp
91
 
92
/* Retrieve the function arguments. %ecx will hold cur_match            */
93
/* throughout the entire function. %edx will hold the pointer to the    */
94
/* deflate_state structure during the function's setup (before          */
95
/* entering the main loop).                                             */
96
 
97
                movl    deflatestate(%esp), %edx
98
                movl    curmatch(%esp), %ecx
99
 
100
/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;      */
101
 
102
                movl    dsNiceMatch(%edx), %eax
103
                movl    dsLookahead(%edx), %ebx
104
                cmpl    %eax, %ebx
105
                jl      LookaheadLess
106
                movl    %eax, %ebx
107
LookaheadLess:  movl    %ebx, nicematch(%esp)
108
 
109
/* register Bytef *scan = s->window + s->strstart;                      */
110
 
111
                movl    dsWindow(%edx), %esi
112
                movl    %esi, window(%esp)
113
                movl    dsStrStart(%edx), %ebp
114
                lea     (%esi,%ebp), %edi
115
                movl    %edi, scan(%esp)
116
 
117
/* Determine how many bytes the scan ptr is off from being              */
118
/* dword-aligned.                                                       */
119
 
120
                movl    %edi, %eax
121
                negl    %eax
122
                andl    $3, %eax
123
                movl    %eax, scanalign(%esp)
124
 
125
/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ?                       */
126
/*     s->strstart - (IPos)MAX_DIST(s) : NIL;                           */
127
 
128
                movl    dsWSize(%edx), %eax
129
                subl    $MIN_LOOKAHEAD, %eax
130
                subl    %eax, %ebp
131
                jg      LimitPositive
132
                xorl    %ebp, %ebp
133
LimitPositive:
134
 
135
/* unsigned chain_length = s->max_chain_length;                         */
136
/* if (s->prev_length >= s->good_match) {                               */
137
/*     chain_length >>= 2;                                              */
138
/* }                                                                    */
139
 
140
                movl    dsPrevLen(%edx), %eax
141
                movl    dsGoodMatch(%edx), %ebx
142
                cmpl    %ebx, %eax
143
                movl    dsMaxChainLen(%edx), %ebx
144
                jl      LastMatchGood
145
                shrl    $2, %ebx
146
LastMatchGood:
147
 
148
/* chainlen is decremented once beforehand so that the function can     */
149
/* use the sign flag instead of the zero flag for the exit test.        */
150
/* It is then shifted into the high word, to make room for the scanend  */
151
/* scanend value, which it will always accompany.                       */
152
 
153
                decl    %ebx
154
                shll    $16, %ebx
155
 
156
/* int best_len = s->prev_length;                                       */
157
 
158
                movl    dsPrevLen(%edx), %eax
159
                movl    %eax, bestlen(%esp)
160
 
161
/* Store the sum of s->window + best_len in %esi locally, and in %esi.  */
162
 
163
                addl    %eax, %esi
164
                movl    %esi, windowbestlen(%esp)
165
 
166
/* register ush scan_start = *(ushf*)scan;                              */
167
/* register ush scan_end   = *(ushf*)(scan+best_len-1);                 */
168
 
169
                movw    (%edi), %bx
170
                movw    %bx, scanstart(%esp)
171
                movw    -1(%edi,%eax), %bx
172
                movl    %ebx, chainlenscanend(%esp)
173
 
174
/* Posf *prev = s->prev;                                                */
175
/* uInt wmask = s->w_mask;                                              */
176
 
177
                movl    dsPrev(%edx), %edi
178
                movl    dsWMask(%edx), %edx
179
                mov     %edx, wmask(%esp)
180
 
181
/* Jump into the main loop.                                             */
182
 
183
                jmp     LoopEntry
184
 
185
.balign 16
186
 
187
/* do {
188
 *     match = s->window + cur_match;
189
 *     if (*(ushf*)(match+best_len-1) != scan_end ||
190
 *         *(ushf*)match != scan_start) continue;
191
 *     [...]
192
 * } while ((cur_match = prev[cur_match & wmask]) > limit
193
 *          && --chain_length != 0);
194
 *
195
 * Here is the inner loop of the function. The function will spend the
196
 * majority of its time in this loop, and majority of that time will
197
 * be spent in the first ten instructions.
198
 *
199
 * Within this loop:
200
 * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend)
201
 * %ecx = curmatch
202
 * %edx = curmatch & wmask
203
 * %esi = windowbestlen - i.e., (window + bestlen)
204
 * %edi = prev
205
 * %ebp = limit
206
 *
207
 * Two optimization notes on the choice of instructions:
208
 *
209
 * The first instruction uses a 16-bit address, which costs an extra,
210
 * unpairable cycle. This is cheaper than doing a 32-bit access and
211
 * zeroing the high word, due to the 3-cycle misalignment penalty which
212
 * would occur half the time. This also turns out to be cheaper than
213
 * doing two separate 8-bit accesses, as the memory is so rarely in the
214
 * L1 cache.
215
 *
216
 * The window buffer, however, apparently spends a lot of time in the
217
 * cache, and so it is faster to retrieve the word at the end of the
218
 * match string with two 8-bit loads. The instructions that test the
219
 * word at the beginning of the match string, however, are executed
220
 * much less frequently, and there it was cheaper to use 16-bit
221
 * instructions, which avoided the necessity of saving off and
222
 * subsequently reloading one of the other registers.
223
 */
224
LookupLoop:
225
                                                        /* 1 U & V  */
226
                movw    (%edi,%edx,2), %cx              /* 2 U pipe */
227
                movl    wmask(%esp), %edx               /* 2 V pipe */
228
                cmpl    %ebp, %ecx                      /* 3 U pipe */
229
                jbe     LeaveNow                        /* 3 V pipe */
230
                subl    $0x00010000, %ebx               /* 4 U pipe */
231
                js      LeaveNow                        /* 4 V pipe */
232
LoopEntry:      movb    -1(%esi,%ecx), %al              /* 5 U pipe */
233
                andl    %ecx, %edx                      /* 5 V pipe */
234
                cmpb    %bl, %al                        /* 6 U pipe */
235
                jnz     LookupLoop                      /* 6 V pipe */
236
                movb    (%esi,%ecx), %ah
237
                cmpb    %bh, %ah
238
                jnz     LookupLoop
239
                movl    window(%esp), %eax
240
                movw    (%eax,%ecx), %ax
241
                cmpw    scanstart(%esp), %ax
242
                jnz     LookupLoop
243
 
244
/* Store the current value of chainlen.                                 */
245
 
246
                movl    %ebx, chainlenscanend(%esp)
247
 
248
/* Point %edi to the string under scrutiny, and %esi to the string we   */
249
/* are hoping to match it up with. In actuality, %esi and %edi are      */
250
/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is      */
251
/* initialized to -(MAX_MATCH_8 - scanalign).                           */
252
 
253
                movl    window(%esp), %esi
254
                movl    scan(%esp), %edi
255
                addl    %ecx, %esi
256
                movl    scanalign(%esp), %eax
257
                movl    $(-MAX_MATCH_8), %edx
258
                lea     MAX_MATCH_8(%edi,%eax), %edi
259
                lea     MAX_MATCH_8(%esi,%eax), %esi
260
 
261
/* Test the strings for equality, 8 bytes at a time. At the end,
262
 * adjust %edx so that it is offset to the exact byte that mismatched.
263
 *
264
 * We already know at this point that the first three bytes of the
265
 * strings match each other, and they can be safely passed over before
266
 * starting the compare loop. So what this code does is skip over 0-3
267
 * bytes, as much as necessary in order to dword-align the %edi
268
 * pointer. (%esi will still be misaligned three times out of four.)
269
 *
270
 * It should be confessed that this loop usually does not represent
271
 * much of the total running time. Replacing it with a more
272
 * straightforward "rep cmpsb" would not drastically degrade
273
 * performance.
274
 */
275
LoopCmps:
276
                movl    (%esi,%edx), %eax
277
                movl    (%edi,%edx), %ebx
278
                xorl    %ebx, %eax
279
                jnz     LeaveLoopCmps
280
                movl    4(%esi,%edx), %eax
281
                movl    4(%edi,%edx), %ebx
282
                xorl    %ebx, %eax
283
                jnz     LeaveLoopCmps4
284
                addl    $8, %edx
285
                jnz     LoopCmps
286
                jmp     LenMaximum
287
LeaveLoopCmps4: addl    $4, %edx
288
LeaveLoopCmps:  testl   $0x0000FFFF, %eax
289
                jnz     LenLower
290
                addl    $2, %edx
291
                shrl    $16, %eax
292
LenLower:       subb    $1, %al
293
                adcl    $0, %edx
294
 
295
/* Calculate the length of the match. If it is longer than MAX_MATCH,   */
296
/* then automatically accept it as the best possible match and leave.   */
297
 
298
                lea     (%edi,%edx), %eax
299
                movl    scan(%esp), %edi
300
                subl    %edi, %eax
301
                cmpl    $MAX_MATCH, %eax
302
                jge     LenMaximum
303
 
304
/* If the length of the match is not longer than the best match we      */
305
/* have so far, then forget it and return to the lookup loop.           */
306
 
307
                movl    deflatestate(%esp), %edx
308
                movl    bestlen(%esp), %ebx
309
                cmpl    %ebx, %eax
310
                jg      LongerMatch
311
                movl    chainlenscanend(%esp), %ebx
312
                movl    windowbestlen(%esp), %esi
313
                movl    dsPrev(%edx), %edi
314
                movl    wmask(%esp), %edx
315
                andl    %ecx, %edx
316
                jmp     LookupLoop
317
 
318
/*         s->match_start = cur_match;                                  */
319
/*         best_len = len;                                              */
320
/*         if (len >= nice_match) break;                                */
321
/*         scan_end = *(ushf*)(scan+best_len-1);                        */
322
 
323
LongerMatch:    movl    nicematch(%esp), %ebx
324
                movl    %eax, bestlen(%esp)
325
                movl    %ecx, dsMatchStart(%edx)
326
                cmpl    %ebx, %eax
327
                jge     LeaveNow
328
                movl    window(%esp), %esi
329
                addl    %eax, %esi
330
                movl    %esi, windowbestlen(%esp)
331
                movl    chainlenscanend(%esp), %ebx
332
                movw    -1(%edi,%eax), %bx
333
                movl    dsPrev(%edx), %edi
334
                movl    %ebx, chainlenscanend(%esp)
335
                movl    wmask(%esp), %edx
336
                andl    %ecx, %edx
337
                jmp     LookupLoop
338
 
339
/* Accept the current string, with the maximum possible length.         */
340
 
341
LenMaximum:     movl    deflatestate(%esp), %edx
342
                movl    $MAX_MATCH, bestlen(%esp)
343
                movl    %ecx, dsMatchStart(%edx)
344
 
345
/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len;         */
346
/* return s->lookahead;                                                 */
347
 
348
LeaveNow:
349
                movl    deflatestate(%esp), %edx
350
                movl    bestlen(%esp), %ebx
351
                movl    dsLookahead(%edx), %eax
352
                cmpl    %eax, %ebx
353
                jg      LookaheadRet
354
                movl    %ebx, %eax
355
LookaheadRet:
356
 
357
/* Restore the stack and return from whence we came.                    */
358
 
359
                addl    $LocalVarsSize, %esp
360
                popl    %ebx
361
                popl    %esi
362
                popl    %edi
363
                popl    %ebp
364
match_init:     ret

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.