OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [zlib/] [contrib/] [masmx64/] [gvmat64.asm] - Blame information for rev 767

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 745 jeremybenn
;uInt longest_match_x64(
2
;    deflate_state *s,
3
;    IPos cur_match);                             /* current match */
4
 
5
; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86
6
; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
7
;
8
; File written by Gilles Vollant, by converting to assembly the longest_match
9
;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
10
;
11
;  and by taking inspiration on asm686 with masm, optimised assembly code
12
;        from Brian Raiter, written 1998
13
;
14
;         http://www.zlib.net
15
;         http://www.winimage.com/zLibDll
16
;         http://www.muppetlabs.com/~breadbox/software/assembly.html
17
;
18
; to compile this file for infozip Zip, I use option:
19
;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
20
;
21
; to compile this file for zLib, I use option:
22
;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
23
; Be carrefull to adapt zlib1222add below to your version of zLib
24
;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
25
;    value of zlib1222add later)
26
;
27
; This file compile with Microsoft Macro Assembler (x64) for AMD64
28
;
29
;   ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK
30
;
31
;   (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from
32
;      http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
33
;
34
 
35
 
36
;uInt longest_match(s, cur_match)
37
;    deflate_state *s;
38
;    IPos cur_match;                             /* current match */
39
.code
40
longest_match PROC
41
 
42
 
43
;LocalVarsSize   equ 88
44
 LocalVarsSize   equ 72
45
 
46
; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
47
; free register :  r14,r15
48
; register can be saved : rsp
49
 
50
 chainlenwmask   equ  rsp + 8 - LocalVarsSize    ; high word: current chain len
51
                                                 ; low word: s->wmask
52
;window          equ  rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
53
;windowbestlen   equ  rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
54
;scanstart       equ  rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
55
;scanend         equ  rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
56
;scanalign       equ  rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
57
;bestlen         equ  rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
58
;scan            equ  rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
59
IFDEF INFOZIP
60
ELSE
61
 nicematch       equ  (rsp + 16 - LocalVarsSize) ; a good enough match size
62
ENDIF
63
 
64
save_rdi        equ  rsp + 24 - LocalVarsSize
65
save_rsi        equ  rsp + 32 - LocalVarsSize
66
save_rbx        equ  rsp + 40 - LocalVarsSize
67
save_rbp        equ  rsp + 48 - LocalVarsSize
68
save_r12        equ  rsp + 56 - LocalVarsSize
69
save_r13        equ  rsp + 64 - LocalVarsSize
70
;save_r14        equ  rsp + 72 - LocalVarsSize
71
;save_r15        equ  rsp + 80 - LocalVarsSize
72
 
73
 
74
 
75
;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
76
;  in the deflate_state structure since the asm code was first written
77
;  (if you compile with zlib 1.0.4 or older, remove the +4).
78
;  Note : these value are good with a 8 bytes boundary pack structure
79
 
80
 
81
    MAX_MATCH           equ     258
82
    MIN_MATCH           equ     3
83
    MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
84
 
85
 
86
;;; Offsets for fields in the deflate_state structure. These numbers
87
;;; are calculated from the definition of deflate_state, with the
88
;;; assumption that the compiler will dword-align the fields. (Thus,
89
;;; changing the definition of deflate_state could easily cause this
90
;;; program to crash horribly, without so much as a warning at
91
;;; compile time. Sigh.)
92
 
93
;  all the +zlib1222add offsets are due to the addition of fields
94
;  in zlib in the deflate_state structure since the asm code was first written
95
;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
96
;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
97
;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
98
 
99
 
100
IFDEF INFOZIP
101
 
102
_DATA   SEGMENT
103
COMM    window_size:DWORD
104
; WMask ; 7fff
105
COMM    window:BYTE:010040H
106
COMM    prev:WORD:08000H
107
; MatchLen : unused
108
; PrevMatch : unused
109
COMM    strstart:DWORD
110
COMM    match_start:DWORD
111
; Lookahead : ignore
112
COMM    prev_length:DWORD ; PrevLen
113
COMM    max_chain_length:DWORD
114
COMM    good_match:DWORD
115
COMM    nice_match:DWORD
116
prev_ad equ OFFSET prev
117
window_ad equ OFFSET window
118
nicematch equ nice_match
119
_DATA ENDS
120
WMask equ 07fffh
121
 
122
ELSE
123
 
124
  IFNDEF zlib1222add
125
    zlib1222add equ 8
126
  ENDIF
127
dsWSize         equ 56+zlib1222add+(zlib1222add/2)
128
dsWMask         equ 64+zlib1222add+(zlib1222add/2)
129
dsWindow        equ 72+zlib1222add
130
dsPrev          equ 88+zlib1222add
131
dsMatchLen      equ 128+zlib1222add
132
dsPrevMatch     equ 132+zlib1222add
133
dsStrStart      equ 140+zlib1222add
134
dsMatchStart    equ 144+zlib1222add
135
dsLookahead     equ 148+zlib1222add
136
dsPrevLen       equ 152+zlib1222add
137
dsMaxChainLen   equ 156+zlib1222add
138
dsGoodMatch     equ 172+zlib1222add
139
dsNiceMatch     equ 176+zlib1222add
140
 
141
window_size     equ [ rcx + dsWSize]
142
WMask           equ [ rcx + dsWMask]
143
window_ad       equ [ rcx + dsWindow]
144
prev_ad         equ [ rcx + dsPrev]
145
strstart        equ [ rcx + dsStrStart]
146
match_start     equ [ rcx + dsMatchStart]
147
Lookahead       equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
148
prev_length     equ [ rcx + dsPrevLen]
149
max_chain_length equ [ rcx + dsMaxChainLen]
150
good_match      equ [ rcx + dsGoodMatch]
151
nice_match      equ [ rcx + dsNiceMatch]
152
ENDIF
153
 
154
; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
155
 
156
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
157
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
158
;
159
; All registers must be preserved across the call, except for
160
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
161
 
162
 
163
 
164
;;; Save registers that the compiler may be using, and adjust esp to
165
;;; make room for our stack frame.
166
 
167
 
168
;;; Retrieve the function arguments. r8d will hold cur_match
169
;;; throughout the entire function. edx will hold the pointer to the
170
;;; deflate_state structure during the function's setup (before
171
;;; entering the main loop.
172
 
173
; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
174
 
175
; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
176
 
177
        mov [save_rdi],rdi
178
        mov [save_rsi],rsi
179
        mov [save_rbx],rbx
180
        mov [save_rbp],rbp
181
IFDEF INFOZIP
182
        mov r8d,ecx
183
ELSE
184
        mov r8d,edx
185
ENDIF
186
        mov [save_r12],r12
187
        mov [save_r13],r13
188
;        mov [save_r14],r14
189
;        mov [save_r15],r15
190
 
191
 
192
;;; uInt wmask = s->w_mask;
193
;;; unsigned chain_length = s->max_chain_length;
194
;;; if (s->prev_length >= s->good_match) {
195
;;;     chain_length >>= 2;
196
;;; }
197
 
198
        mov edi, prev_length
199
        mov esi, good_match
200
        mov eax, WMask
201
        mov ebx, max_chain_length
202
        cmp edi, esi
203
        jl  LastMatchGood
204
        shr ebx, 2
205
LastMatchGood:
206
 
207
;;; chainlen is decremented once beforehand so that the function can
208
;;; use the sign flag instead of the zero flag for the exit test.
209
;;; It is then shifted into the high word, to make room for the wmask
210
;;; value, which it will always accompany.
211
 
212
        dec ebx
213
        shl ebx, 16
214
        or  ebx, eax
215
 
216
;;; on zlib only
217
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
218
 
219
IFDEF INFOZIP
220
        mov [chainlenwmask], ebx
221
; on infozip nice_match = [nice_match]
222
ELSE
223
        mov eax, nice_match
224
        mov [chainlenwmask], ebx
225
        mov r10d, Lookahead
226
        cmp r10d, eax
227
        cmovnl r10d, eax
228
        mov [nicematch],r10d
229
ENDIF
230
 
231
;;; register Bytef *scan = s->window + s->strstart;
232
        mov r10, window_ad
233
        mov ebp, strstart
234
        lea r13, [r10 + rbp]
235
 
236
;;; Determine how many bytes the scan ptr is off from being
237
;;; dword-aligned.
238
 
239
         mov r9,r13
240
         neg r13
241
         and r13,3
242
 
243
;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
244
;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
245
IFDEF INFOZIP
246
        mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
247
ELSE
248
        mov eax, window_size
249
        sub eax, MIN_LOOKAHEAD
250
ENDIF
251
        xor edi,edi
252
        sub ebp, eax
253
 
254
        mov r11d, prev_length
255
 
256
        cmovng ebp,edi
257
 
258
;;; int best_len = s->prev_length;
259
 
260
 
261
;;; Store the sum of s->window + best_len in esi locally, and in esi.
262
 
263
       lea  rsi,[r10+r11]
264
 
265
;;; register ush scan_start = *(ushf*)scan;
266
;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
267
;;; Posf *prev = s->prev;
268
 
269
        movzx r12d,word ptr [r9]
270
        movzx ebx, word ptr [r9 + r11 - 1]
271
 
272
        mov rdi, prev_ad
273
 
274
;;; Jump into the main loop.
275
 
276
        mov edx, [chainlenwmask]
277
 
278
        cmp bx,word ptr [rsi + r8 - 1]
279
        jz  LookupLoopIsZero
280
 
281
LookupLoop1:
282
        and r8d, edx
283
 
284
        movzx   r8d, word ptr [rdi + r8*2]
285
        cmp r8d, ebp
286
        jbe LeaveNow
287
        sub edx, 00010000h
288
        js  LeaveNow
289
 
290
LoopEntry1:
291
        cmp bx,word ptr [rsi + r8 - 1]
292
        jz  LookupLoopIsZero
293
 
294
LookupLoop2:
295
        and r8d, edx
296
 
297
        movzx   r8d, word ptr [rdi + r8*2]
298
        cmp r8d, ebp
299
        jbe LeaveNow
300
        sub edx, 00010000h
301
        js  LeaveNow
302
 
303
LoopEntry2:
304
        cmp bx,word ptr [rsi + r8 - 1]
305
        jz  LookupLoopIsZero
306
 
307
LookupLoop4:
308
        and r8d, edx
309
 
310
        movzx   r8d, word ptr [rdi + r8*2]
311
        cmp r8d, ebp
312
        jbe LeaveNow
313
        sub edx, 00010000h
314
        js  LeaveNow
315
 
316
LoopEntry4:
317
 
318
        cmp bx,word ptr [rsi + r8 - 1]
319
        jnz LookupLoop1
320
        jmp LookupLoopIsZero
321
 
322
 
323
;;; do {
324
;;;     match = s->window + cur_match;
325
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
326
;;;         *(ushf*)match != scan_start) continue;
327
;;;     [...]
328
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
329
;;;          && --chain_length != 0);
330
;;;
331
;;; Here is the inner loop of the function. The function will spend the
332
;;; majority of its time in this loop, and majority of that time will
333
;;; be spent in the first ten instructions.
334
;;;
335
;;; Within this loop:
336
;;; ebx = scanend
337
;;; r8d = curmatch
338
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
339
;;; esi = windowbestlen - i.e., (window + bestlen)
340
;;; edi = prev
341
;;; ebp = limit
342
 
343
LookupLoop:
344
        and r8d, edx
345
 
346
        movzx   r8d, word ptr [rdi + r8*2]
347
        cmp r8d, ebp
348
        jbe LeaveNow
349
        sub edx, 00010000h
350
        js  LeaveNow
351
 
352
LoopEntry:
353
 
354
        cmp bx,word ptr [rsi + r8 - 1]
355
        jnz LookupLoop1
356
LookupLoopIsZero:
357
        cmp     r12w, word ptr [r10 + r8]
358
        jnz LookupLoop1
359
 
360
 
361
;;; Store the current value of chainlen.
362
        mov [chainlenwmask], edx
363
 
364
;;; Point edi to the string under scrutiny, and esi to the string we
365
;;; are hoping to match it up with. In actuality, esi and edi are
366
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
367
;;; initialized to -(MAX_MATCH_8 - scanalign).
368
 
369
        lea rsi,[r8+r10]
370
        mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
371
        lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
372
        lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
373
 
374
        prefetcht1 [rsi+rdx]
375
        prefetcht1 [rdi+rdx]
376
 
377
 
378
;;; Test the strings for equality, 8 bytes at a time. At the end,
379
;;; adjust rdx so that it is offset to the exact byte that mismatched.
380
;;;
381
;;; We already know at this point that the first three bytes of the
382
;;; strings match each other, and they can be safely passed over before
383
;;; starting the compare loop. So what this code does is skip over 0-3
384
;;; bytes, as much as necessary in order to dword-align the edi
385
;;; pointer. (rsi will still be misaligned three times out of four.)
386
;;;
387
;;; It should be confessed that this loop usually does not represent
388
;;; much of the total running time. Replacing it with a more
389
;;; straightforward "rep cmpsb" would not drastically degrade
390
;;; performance.
391
 
392
 
393
LoopCmps:
394
        mov rax, [rsi + rdx]
395
        xor rax, [rdi + rdx]
396
        jnz LeaveLoopCmps
397
 
398
        mov rax, [rsi + rdx + 8]
399
        xor rax, [rdi + rdx + 8]
400
        jnz LeaveLoopCmps8
401
 
402
 
403
        mov rax, [rsi + rdx + 8+8]
404
        xor rax, [rdi + rdx + 8+8]
405
        jnz LeaveLoopCmps16
406
 
407
        add rdx,8+8+8
408
 
409
        jmp short LoopCmps
410
LeaveLoopCmps16: add rdx,8
411
LeaveLoopCmps8: add rdx,8
412
LeaveLoopCmps:
413
 
414
        test    eax, 0000FFFFh
415
        jnz LenLower
416
 
417
        test eax,0ffffffffh
418
 
419
        jnz LenLower32
420
 
421
        add rdx,4
422
        shr rax,32
423
        or ax,ax
424
        jnz LenLower
425
 
426
LenLower32:
427
        shr eax,16
428
        add rdx,2
429
LenLower:   sub al, 1
430
        adc rdx, 0
431
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
432
;;; then automatically accept it as the best possible match and leave.
433
 
434
        lea rax, [rdi + rdx]
435
        sub rax, r9
436
        cmp eax, MAX_MATCH
437
        jge LenMaximum
438
 
439
;;; If the length of the match is not longer than the best match we
440
;;; have so far, then forget it and return to the lookup loop.
441
;///////////////////////////////////
442
 
443
        cmp eax, r11d
444
        jg  LongerMatch
445
 
446
        lea rsi,[r10+r11]
447
 
448
        mov rdi, prev_ad
449
        mov edx, [chainlenwmask]
450
        jmp LookupLoop
451
 
452
;;;         s->match_start = cur_match;
453
;;;         best_len = len;
454
;;;         if (len >= nice_match) break;
455
;;;         scan_end = *(ushf*)(scan+best_len-1);
456
 
457
LongerMatch:
458
        mov r11d, eax
459
        mov match_start, r8d
460
        cmp eax, [nicematch]
461
        jge LeaveNow
462
 
463
        lea rsi,[r10+rax]
464
 
465
        movzx   ebx, word ptr [r9 + rax - 1]
466
        mov rdi, prev_ad
467
        mov edx, [chainlenwmask]
468
        jmp LookupLoop
469
 
470
;;; Accept the current string, with the maximum possible length.
471
 
472
LenMaximum:
473
        mov r11d,MAX_MATCH
474
        mov match_start, r8d
475
 
476
;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
477
;;; return s->lookahead;
478
 
479
LeaveNow:
480
IFDEF INFOZIP
481
        mov eax,r11d
482
ELSE
483
        mov eax, Lookahead
484
        cmp r11d, eax
485
        cmovng eax, r11d
486
ENDIF
487
 
488
;;; Restore the stack and return from whence we came.
489
 
490
 
491
        mov rsi,[save_rsi]
492
        mov rdi,[save_rdi]
493
        mov rbx,[save_rbx]
494
        mov rbp,[save_rbp]
495
        mov r12,[save_r12]
496
        mov r13,[save_r13]
497
;        mov r14,[save_r14]
498
;        mov r15,[save_r15]
499
 
500
 
501
        ret 0
502
; please don't remove this string !
503
; Your can freely use gvmat64 in any free or commercial app
504
; but it is far better don't remove the string in the binary!
505
    db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
506
longest_match   ENDP
507
 
508
match_init PROC
509
  ret 0
510
match_init ENDP
511
 
512
 
513
END

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.