1 |
745 |
jeremybenn |
|
2 |
|
|
; match.asm -- Pentium-Pro optimized version of longest_match()
|
3 |
|
|
;
|
4 |
|
|
; Updated for zlib 1.1.3 and converted to MASM 6.1x
|
5 |
|
|
; Copyright (C) 2000 Dan Higdon
|
6 |
|
|
; and Chuck Walbourn
|
7 |
|
|
; Corrections by Cosmin Truta
|
8 |
|
|
;
|
9 |
|
|
; This is free software; you can redistribute it and/or modify it
|
10 |
|
|
; under the terms of the GNU General Public License.
|
11 |
|
|
|
12 |
|
|
; Based on match.S
|
13 |
|
|
; Written for zlib 1.1.2
|
14 |
|
|
; Copyright (C) 1998 Brian Raiter
|
15 |
|
|
;
|
16 |
|
|
; Modified by Gilles Vollant (2005) for add gzhead and gzindex
|
17 |
|
|
|
18 |
|
|
.686P
|
19 |
|
|
.MODEL FLAT
|
20 |
|
|
|
21 |
|
|
;===========================================================================
|
22 |
|
|
; EQUATES
|
23 |
|
|
;===========================================================================
|
24 |
|
|
|
25 |
|
|
MAX_MATCH EQU 258
|
26 |
|
|
MIN_MATCH EQU 3
|
27 |
|
|
MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)
|
28 |
|
|
MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))
|
29 |
|
|
|
30 |
|
|
;===========================================================================
|
31 |
|
|
; STRUCTURES
|
32 |
|
|
;===========================================================================
|
33 |
|
|
|
34 |
|
|
; This STRUCT assumes a 4-byte alignment
|
35 |
|
|
|
36 |
|
|
DEFLATE_STATE STRUCT
|
37 |
|
|
ds_strm dd ?
|
38 |
|
|
ds_status dd ?
|
39 |
|
|
ds_pending_buf dd ?
|
40 |
|
|
ds_pending_buf_size dd ?
|
41 |
|
|
ds_pending_out dd ?
|
42 |
|
|
ds_pending dd ?
|
43 |
|
|
ds_wrap dd ?
|
44 |
|
|
; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)
|
45 |
|
|
ds_gzhead dd ?
|
46 |
|
|
ds_gzindex dd ?
|
47 |
|
|
ds_data_type db ?
|
48 |
|
|
ds_method db ?
|
49 |
|
|
db ? ; padding
|
50 |
|
|
db ? ; padding
|
51 |
|
|
ds_last_flush dd ?
|
52 |
|
|
ds_w_size dd ? ; used
|
53 |
|
|
ds_w_bits dd ?
|
54 |
|
|
ds_w_mask dd ? ; used
|
55 |
|
|
ds_window dd ? ; used
|
56 |
|
|
ds_window_size dd ?
|
57 |
|
|
ds_prev dd ? ; used
|
58 |
|
|
ds_head dd ?
|
59 |
|
|
ds_ins_h dd ?
|
60 |
|
|
ds_hash_size dd ?
|
61 |
|
|
ds_hash_bits dd ?
|
62 |
|
|
ds_hash_mask dd ?
|
63 |
|
|
ds_hash_shift dd ?
|
64 |
|
|
ds_block_start dd ?
|
65 |
|
|
ds_match_length dd ? ; used
|
66 |
|
|
ds_prev_match dd ? ; used
|
67 |
|
|
ds_match_available dd ?
|
68 |
|
|
ds_strstart dd ? ; used
|
69 |
|
|
ds_match_start dd ? ; used
|
70 |
|
|
ds_lookahead dd ? ; used
|
71 |
|
|
ds_prev_length dd ? ; used
|
72 |
|
|
ds_max_chain_length dd ? ; used
|
73 |
|
|
ds_max_laxy_match dd ?
|
74 |
|
|
ds_level dd ?
|
75 |
|
|
ds_strategy dd ?
|
76 |
|
|
ds_good_match dd ? ; used
|
77 |
|
|
ds_nice_match dd ? ; used
|
78 |
|
|
|
79 |
|
|
; Don't need anymore of the struct for match
|
80 |
|
|
DEFLATE_STATE ENDS
|
81 |
|
|
|
82 |
|
|
;===========================================================================
|
83 |
|
|
; CODE
|
84 |
|
|
;===========================================================================
|
85 |
|
|
_TEXT SEGMENT
|
86 |
|
|
|
87 |
|
|
;---------------------------------------------------------------------------
|
88 |
|
|
; match_init
|
89 |
|
|
;---------------------------------------------------------------------------
|
90 |
|
|
ALIGN 4
|
91 |
|
|
PUBLIC _match_init
|
92 |
|
|
_match_init PROC
|
93 |
|
|
; no initialization needed
|
94 |
|
|
ret
|
95 |
|
|
_match_init ENDP
|
96 |
|
|
|
97 |
|
|
;---------------------------------------------------------------------------
|
98 |
|
|
; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
|
99 |
|
|
;---------------------------------------------------------------------------
|
100 |
|
|
ALIGN 4
|
101 |
|
|
|
102 |
|
|
PUBLIC _longest_match
|
103 |
|
|
_longest_match PROC
|
104 |
|
|
|
105 |
|
|
; Since this code uses EBP for a scratch register, the stack frame must
|
106 |
|
|
; be manually constructed and referenced relative to the ESP register.
|
107 |
|
|
|
108 |
|
|
; Stack image
|
109 |
|
|
; Variables
|
110 |
|
|
chainlenwmask = 0 ; high word: current chain len
|
111 |
|
|
; low word: s->wmask
|
112 |
|
|
window = 4 ; local copy of s->window
|
113 |
|
|
windowbestlen = 8 ; s->window + bestlen
|
114 |
|
|
scanend = 12 ; last two bytes of string
|
115 |
|
|
scanstart = 16 ; first two bytes of string
|
116 |
|
|
scanalign = 20 ; dword-misalignment of string
|
117 |
|
|
nicematch = 24 ; a good enough match size
|
118 |
|
|
bestlen = 28 ; size of best match so far
|
119 |
|
|
scan = 32 ; ptr to string wanting match
|
120 |
|
|
varsize = 36 ; number of bytes (also offset to last saved register)
|
121 |
|
|
|
122 |
|
|
; Saved Registers (actually pushed into place)
|
123 |
|
|
ebx_save = 36
|
124 |
|
|
edi_save = 40
|
125 |
|
|
esi_save = 44
|
126 |
|
|
ebp_save = 48
|
127 |
|
|
|
128 |
|
|
; Parameters
|
129 |
|
|
retaddr = 52
|
130 |
|
|
deflatestate = 56
|
131 |
|
|
curmatch = 60
|
132 |
|
|
|
133 |
|
|
; Save registers that the compiler may be using
|
134 |
|
|
push ebp
|
135 |
|
|
push edi
|
136 |
|
|
push esi
|
137 |
|
|
push ebx
|
138 |
|
|
|
139 |
|
|
; Allocate local variable space
|
140 |
|
|
sub esp,varsize
|
141 |
|
|
|
142 |
|
|
; Retrieve the function arguments. ecx will hold cur_match
|
143 |
|
|
; throughout the entire function. edx will hold the pointer to the
|
144 |
|
|
; deflate_state structure during the function's setup (before
|
145 |
|
|
; entering the main loop).
|
146 |
|
|
|
147 |
|
|
mov edx, [esp+deflatestate]
|
148 |
|
|
ASSUME edx:PTR DEFLATE_STATE
|
149 |
|
|
|
150 |
|
|
mov ecx, [esp+curmatch]
|
151 |
|
|
|
152 |
|
|
; uInt wmask = s->w_mask;
|
153 |
|
|
; unsigned chain_length = s->max_chain_length;
|
154 |
|
|
; if (s->prev_length >= s->good_match) {
|
155 |
|
|
; chain_length >>= 2;
|
156 |
|
|
; }
|
157 |
|
|
|
158 |
|
|
mov eax, [edx].ds_prev_length
|
159 |
|
|
mov ebx, [edx].ds_good_match
|
160 |
|
|
cmp eax, ebx
|
161 |
|
|
mov eax, [edx].ds_w_mask
|
162 |
|
|
mov ebx, [edx].ds_max_chain_length
|
163 |
|
|
jl SHORT LastMatchGood
|
164 |
|
|
shr ebx, 2
|
165 |
|
|
LastMatchGood:
|
166 |
|
|
|
167 |
|
|
; chainlen is decremented once beforehand so that the function can
|
168 |
|
|
; use the sign flag instead of the zero flag for the exit test.
|
169 |
|
|
; It is then shifted into the high word, to make room for the wmask
|
170 |
|
|
; value, which it will always accompany.
|
171 |
|
|
|
172 |
|
|
dec ebx
|
173 |
|
|
shl ebx, 16
|
174 |
|
|
or ebx, eax
|
175 |
|
|
mov [esp+chainlenwmask], ebx
|
176 |
|
|
|
177 |
|
|
; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
|
178 |
|
|
|
179 |
|
|
mov eax, [edx].ds_nice_match
|
180 |
|
|
mov ebx, [edx].ds_lookahead
|
181 |
|
|
cmp ebx, eax
|
182 |
|
|
jl SHORT LookaheadLess
|
183 |
|
|
mov ebx, eax
|
184 |
|
|
LookaheadLess:
|
185 |
|
|
mov [esp+nicematch], ebx
|
186 |
|
|
|
187 |
|
|
;/* register Bytef *scan = s->window + s->strstart; */
|
188 |
|
|
|
189 |
|
|
mov esi, [edx].ds_window
|
190 |
|
|
mov [esp+window], esi
|
191 |
|
|
mov ebp, [edx].ds_strstart
|
192 |
|
|
lea edi, [esi+ebp]
|
193 |
|
|
mov [esp+scan],edi
|
194 |
|
|
|
195 |
|
|
;/* Determine how many bytes the scan ptr is off from being */
|
196 |
|
|
;/* dword-aligned. */
|
197 |
|
|
|
198 |
|
|
mov eax, edi
|
199 |
|
|
neg eax
|
200 |
|
|
and eax, 3
|
201 |
|
|
mov [esp+scanalign], eax
|
202 |
|
|
|
203 |
|
|
;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
|
204 |
|
|
;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
|
205 |
|
|
|
206 |
|
|
mov eax, [edx].ds_w_size
|
207 |
|
|
sub eax, MIN_LOOKAHEAD
|
208 |
|
|
sub ebp, eax
|
209 |
|
|
jg SHORT LimitPositive
|
210 |
|
|
xor ebp, ebp
|
211 |
|
|
LimitPositive:
|
212 |
|
|
|
213 |
|
|
;/* int best_len = s->prev_length; */
|
214 |
|
|
|
215 |
|
|
mov eax, [edx].ds_prev_length
|
216 |
|
|
mov [esp+bestlen], eax
|
217 |
|
|
|
218 |
|
|
;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
|
219 |
|
|
|
220 |
|
|
add esi, eax
|
221 |
|
|
mov [esp+windowbestlen], esi
|
222 |
|
|
|
223 |
|
|
;/* register ush scan_start = *(ushf*)scan; */
|
224 |
|
|
;/* register ush scan_end = *(ushf*)(scan+best_len-1); */
|
225 |
|
|
;/* Posf *prev = s->prev; */
|
226 |
|
|
|
227 |
|
|
movzx ebx, WORD PTR[edi]
|
228 |
|
|
mov [esp+scanstart], ebx
|
229 |
|
|
movzx ebx, WORD PTR[eax+edi-1]
|
230 |
|
|
mov [esp+scanend], ebx
|
231 |
|
|
mov edi, [edx].ds_prev
|
232 |
|
|
|
233 |
|
|
;/* Jump into the main loop. */
|
234 |
|
|
|
235 |
|
|
mov edx, [esp+chainlenwmask]
|
236 |
|
|
jmp SHORT LoopEntry
|
237 |
|
|
|
238 |
|
|
;/* do {
|
239 |
|
|
; * match = s->window + cur_match;
|
240 |
|
|
; * if (*(ushf*)(match+best_len-1) != scan_end ||
|
241 |
|
|
; * *(ushf*)match != scan_start) continue;
|
242 |
|
|
; * [...]
|
243 |
|
|
; * } while ((cur_match = prev[cur_match & wmask]) > limit
|
244 |
|
|
; * && --chain_length != 0);
|
245 |
|
|
; *
|
246 |
|
|
; * Here is the inner loop of the function. The function will spend the
|
247 |
|
|
; * majority of its time in this loop, and majority of that time will
|
248 |
|
|
; * be spent in the first ten instructions.
|
249 |
|
|
; *
|
250 |
|
|
; * Within this loop:
|
251 |
|
|
; * %ebx = scanend
|
252 |
|
|
; * %ecx = curmatch
|
253 |
|
|
; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
|
254 |
|
|
; * %esi = windowbestlen - i.e., (window + bestlen)
|
255 |
|
|
; * %edi = prev
|
256 |
|
|
; * %ebp = limit
|
257 |
|
|
; */
|
258 |
|
|
|
259 |
|
|
ALIGN 4
|
260 |
|
|
LookupLoop:
|
261 |
|
|
and ecx, edx
|
262 |
|
|
movzx ecx, WORD PTR[edi+ecx*2]
|
263 |
|
|
cmp ecx, ebp
|
264 |
|
|
jbe LeaveNow
|
265 |
|
|
sub edx, 000010000H
|
266 |
|
|
js LeaveNow
|
267 |
|
|
|
268 |
|
|
LoopEntry:
|
269 |
|
|
movzx eax, WORD PTR[esi+ecx-1]
|
270 |
|
|
cmp eax, ebx
|
271 |
|
|
jnz SHORT LookupLoop
|
272 |
|
|
|
273 |
|
|
mov eax, [esp+window]
|
274 |
|
|
movzx eax, WORD PTR[eax+ecx]
|
275 |
|
|
cmp eax, [esp+scanstart]
|
276 |
|
|
jnz SHORT LookupLoop
|
277 |
|
|
|
278 |
|
|
;/* Store the current value of chainlen. */
|
279 |
|
|
|
280 |
|
|
mov [esp+chainlenwmask], edx
|
281 |
|
|
|
282 |
|
|
;/* Point %edi to the string under scrutiny, and %esi to the string we */
|
283 |
|
|
;/* are hoping to match it up with. In actuality, %esi and %edi are */
|
284 |
|
|
;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
|
285 |
|
|
;/* initialized to -(MAX_MATCH_8 - scanalign). */
|
286 |
|
|
|
287 |
|
|
mov esi, [esp+window]
|
288 |
|
|
mov edi, [esp+scan]
|
289 |
|
|
add esi, ecx
|
290 |
|
|
mov eax, [esp+scanalign]
|
291 |
|
|
mov edx, -MAX_MATCH_8
|
292 |
|
|
lea edi, [edi+eax+MAX_MATCH_8]
|
293 |
|
|
lea esi, [esi+eax+MAX_MATCH_8]
|
294 |
|
|
|
295 |
|
|
;/* Test the strings for equality, 8 bytes at a time. At the end,
|
296 |
|
|
; * adjust %edx so that it is offset to the exact byte that mismatched.
|
297 |
|
|
; *
|
298 |
|
|
; * We already know at this point that the first three bytes of the
|
299 |
|
|
; * strings match each other, and they can be safely passed over before
|
300 |
|
|
; * starting the compare loop. So what this code does is skip over 0-3
|
301 |
|
|
; * bytes, as much as necessary in order to dword-align the %edi
|
302 |
|
|
; * pointer. (%esi will still be misaligned three times out of four.)
|
303 |
|
|
; *
|
304 |
|
|
; * It should be confessed that this loop usually does not represent
|
305 |
|
|
; * much of the total running time. Replacing it with a more
|
306 |
|
|
; * straightforward "rep cmpsb" would not drastically degrade
|
307 |
|
|
; * performance.
|
308 |
|
|
; */
|
309 |
|
|
|
310 |
|
|
LoopCmps:
|
311 |
|
|
mov eax, DWORD PTR[esi+edx]
|
312 |
|
|
xor eax, DWORD PTR[edi+edx]
|
313 |
|
|
jnz SHORT LeaveLoopCmps
|
314 |
|
|
|
315 |
|
|
mov eax, DWORD PTR[esi+edx+4]
|
316 |
|
|
xor eax, DWORD PTR[edi+edx+4]
|
317 |
|
|
jnz SHORT LeaveLoopCmps4
|
318 |
|
|
|
319 |
|
|
add edx, 8
|
320 |
|
|
jnz SHORT LoopCmps
|
321 |
|
|
jmp LenMaximum
|
322 |
|
|
ALIGN 4
|
323 |
|
|
|
324 |
|
|
LeaveLoopCmps4:
|
325 |
|
|
add edx, 4
|
326 |
|
|
|
327 |
|
|
LeaveLoopCmps:
|
328 |
|
|
test eax, 00000FFFFH
|
329 |
|
|
jnz SHORT LenLower
|
330 |
|
|
|
331 |
|
|
add edx, 2
|
332 |
|
|
shr eax, 16
|
333 |
|
|
|
334 |
|
|
LenLower:
|
335 |
|
|
sub al, 1
|
336 |
|
|
adc edx, 0
|
337 |
|
|
|
338 |
|
|
;/* Calculate the length of the match. If it is longer than MAX_MATCH, */
|
339 |
|
|
;/* then automatically accept it as the best possible match and leave. */
|
340 |
|
|
|
341 |
|
|
lea eax, [edi+edx]
|
342 |
|
|
mov edi, [esp+scan]
|
343 |
|
|
sub eax, edi
|
344 |
|
|
cmp eax, MAX_MATCH
|
345 |
|
|
jge SHORT LenMaximum
|
346 |
|
|
|
347 |
|
|
;/* If the length of the match is not longer than the best match we */
|
348 |
|
|
;/* have so far, then forget it and return to the lookup loop. */
|
349 |
|
|
|
350 |
|
|
mov edx, [esp+deflatestate]
|
351 |
|
|
mov ebx, [esp+bestlen]
|
352 |
|
|
cmp eax, ebx
|
353 |
|
|
jg SHORT LongerMatch
|
354 |
|
|
mov esi, [esp+windowbestlen]
|
355 |
|
|
mov edi, [edx].ds_prev
|
356 |
|
|
mov ebx, [esp+scanend]
|
357 |
|
|
mov edx, [esp+chainlenwmask]
|
358 |
|
|
jmp LookupLoop
|
359 |
|
|
ALIGN 4
|
360 |
|
|
|
361 |
|
|
;/* s->match_start = cur_match; */
|
362 |
|
|
;/* best_len = len; */
|
363 |
|
|
;/* if (len >= nice_match) break; */
|
364 |
|
|
;/* scan_end = *(ushf*)(scan+best_len-1); */
|
365 |
|
|
|
366 |
|
|
LongerMatch:
|
367 |
|
|
mov ebx, [esp+nicematch]
|
368 |
|
|
mov [esp+bestlen], eax
|
369 |
|
|
mov [edx].ds_match_start, ecx
|
370 |
|
|
cmp eax, ebx
|
371 |
|
|
jge SHORT LeaveNow
|
372 |
|
|
mov esi, [esp+window]
|
373 |
|
|
add esi, eax
|
374 |
|
|
mov [esp+windowbestlen], esi
|
375 |
|
|
movzx ebx, WORD PTR[edi+eax-1]
|
376 |
|
|
mov edi, [edx].ds_prev
|
377 |
|
|
mov [esp+scanend], ebx
|
378 |
|
|
mov edx, [esp+chainlenwmask]
|
379 |
|
|
jmp LookupLoop
|
380 |
|
|
ALIGN 4
|
381 |
|
|
|
382 |
|
|
;/* Accept the current string, with the maximum possible length. */
|
383 |
|
|
|
384 |
|
|
LenMaximum:
|
385 |
|
|
mov edx, [esp+deflatestate]
|
386 |
|
|
mov DWORD PTR[esp+bestlen], MAX_MATCH
|
387 |
|
|
mov [edx].ds_match_start, ecx
|
388 |
|
|
|
389 |
|
|
;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
|
390 |
|
|
;/* return s->lookahead; */
|
391 |
|
|
|
392 |
|
|
LeaveNow:
|
393 |
|
|
mov edx, [esp+deflatestate]
|
394 |
|
|
mov ebx, [esp+bestlen]
|
395 |
|
|
mov eax, [edx].ds_lookahead
|
396 |
|
|
cmp ebx, eax
|
397 |
|
|
jg SHORT LookaheadRet
|
398 |
|
|
mov eax, ebx
|
399 |
|
|
LookaheadRet:
|
400 |
|
|
|
401 |
|
|
; Restore the stack and return from whence we came.
|
402 |
|
|
|
403 |
|
|
add esp, varsize
|
404 |
|
|
pop ebx
|
405 |
|
|
pop esi
|
406 |
|
|
pop edi
|
407 |
|
|
pop ebp
|
408 |
|
|
ret
|
409 |
|
|
|
410 |
|
|
_longest_match ENDP
|
411 |
|
|
|
412 |
|
|
_TEXT ENDS
|
413 |
|
|
END
|