URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [alpha/] [lib/] [ev6-copy_page.S] - Blame information for rev 1275

Go to most recent revision | Details | Compare with Previous | View Log


/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */
 
/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs.
 
   First Problem: STQ overflows.
   -----------------------------
 
        It would be nice if EV6 handled every resource overflow efficiently,
        but for some it doesn't.  Including store queue overflows.  It causes
        a trap and a restart of the pipe.
 
        To get around this we sometimes use (to borrow a term from a VSSAD
        researcher) "aeration".  The idea is to slow the rate at which the
        processor receives valid instructions by inserting nops in the fetch
        path.  In doing so, you can prevent the overflow and actually make
        the code run faster.  You can, of course, take advantage of the fact
        that the processor can fetch at most 4 aligned instructions per cycle.
 
        I inserted enough nops to force it to take 10 cycles to fetch the
        loop code.  In theory, EV6 should be able to execute this loop in
        9 cycles but I was not able to get it to run that fast -- the initial
        conditions were such that I could not reach this optimum rate on
        (chaotic) EV6.  I wrote the code such that everything would issue
        in order.
 
   Second Problem: Dcache index matches.
   -------------------------------------
 
        If you are going to use this routine on random aligned pages, there
        is a 25% chance that the pages will be at the same dcache indices.
        Without care, this results in many nasty memory traps.
 
        The solution is to schedule the prefetches to avoid the memory
        conflicts.  I schedule the wh64 prefetches farther ahead of the
        read prefetches to avoid this problem.
 
   Third Problem: Needs more prefetching.
   --------------------------------------
 
        In order to improve the code I added deeper prefetching to take the
        most advantage of EV6's bandwidth.
 
        I also prefetched the read stream. Note that adding the read prefetch
        forced me to add another cycle to the inner-most kernel - up to 11
        from the original 8 cycles per iteration.  We could improve performance
        further by unrolling the loop and doing multiple prefetches per cycle.
 
   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */
 
 
        .text
        .align 4
        .global copy_page
        .ent copy_page
/*
 * copy_page
 *
 * Copies 128 cache lines of 64 bytes each (8192 bytes in total) from the
 * address held in $17 to the address held in $16: 118 lines in the main
 * loop plus 10 lines in the cleanup loop.
 *
 * Register usage, as seen in the code below:
 *   $16      destination pointer, advanced by 64 per line copied
 *   $17      source pointer, advanced by 64 per line copied
 *   $18      loop counter (118 for the main loop, then 10 for cleanup)
 *   $19      write-hint pointer, kept 10 lines ahead of $16 in the main loop
 *   $0-$7    the eight quadwords of the line currently being copied
 *   $1, $2   additionally used as scratch write-hint addresses in the preamble
 *   $31      destination of the prefetch loads; the loaded value is never read
 *
 * The stack is not touched and no other registers are written.
 *
 * Instructions are laid out in groups of four, matching the "at most 4
 * aligned instructions per cycle" fetch behaviour described in the header
 * comment.  The nop/unop padding is deliberate ("aeration"), so do not
 * reorder or remove instructions without re-measuring.
 */
copy_page:
        .prologue 0
 
        /* Prefetch 5 read cachelines; write-hint 10 cache lines.
           The read prefetches cover source lines 0-4 (offsets 0..256) and
           the write hints cover destination lines 0-9.  The main-loop
           counter ($18) and the write-hint pointer ($19) are also
           initialised within this sequence.  */
        wh64    ($16)                   /* write-hint destination line 0 */
        ldl     $31,0($17)              /* prefetch source line 0 */
        ldl     $31,64($17)             /* prefetch source line 1 */
        lda     $1,1*64($16)
 
        wh64    ($1)                    /* write-hint destination line 1 */
        ldl     $31,128($17)            /* prefetch source line 2 */
        ldl     $31,192($17)            /* prefetch source line 3 */
        lda     $1,2*64($16)
 
        wh64    ($1)                    /* write-hint destination line 2 */
        ldl     $31,256($17)            /* prefetch source line 4 */
        lda     $18,118                 /* main-loop count: 118 lines */
        lda     $1,3*64($16)
 
        wh64    ($1)                    /* write-hint destination line 3 */
        nop
        lda     $1,4*64($16)
        lda     $2,5*64($16)
 
        wh64    ($1)                    /* write-hint destination line 4 */
        wh64    ($2)                    /* write-hint destination line 5 */
        lda     $1,6*64($16)
        lda     $2,7*64($16)
 
        wh64    ($1)                    /* write-hint destination line 6 */
        wh64    ($2)                    /* write-hint destination line 7 */
        lda     $1,8*64($16)
        lda     $2,9*64($16)
 
        wh64    ($1)                    /* write-hint destination line 8 */
        wh64    ($2)                    /* write-hint destination line 9 */
        lda     $19,10*64($16)          /* next line to write-hint: line 10 */
        nop
 
        /* Main prefetching/write-hinting loop.
           Each iteration copies one 64-byte line, prefetches the source
           line 5 ahead (offset 320 from $17) and write-hints the
           destination line 10 ahead (via $19).  It runs 118 times and is
           laid out as 11 groups of 4 instructions.  */
1:      ldq     $0,0($17)
        ldq     $1,8($17)
        unop
        unop
 
        unop
        unop
        ldq     $2,16($17)
        ldq     $3,24($17)
 
        ldq     $4,32($17)
        ldq     $5,40($17)
        unop
        unop
 
        unop
        unop
        ldq     $6,48($17)
        ldq     $7,56($17)
 
        ldl     $31,320($17)            /* prefetch source line 5 ahead */
        unop
        unop
        unop
 
        /* This gives the extra cycle of aeration above the minimum.  */
        unop
        unop
        unop
        unop
 
        wh64    ($19)                   /* write-hint destination line 10 ahead */
        unop
        unop
        unop
 
        stq     $0,0($16)
        subq    $18,1,$18               /* one fewer line left for this loop */
        stq     $1,8($16)
        unop
 
        unop
        stq     $2,16($16)
        addq    $17,64,$17              /* advance source; this line's loads are all above */
        stq     $3,24($16)
 
        stq     $4,32($16)
        stq     $5,40($16)
        addq    $19,64,$19              /* advance write-hint pointer */
        unop
 
        stq     $6,48($16)
        stq     $7,56($16)
        addq    $16,64,$16              /* advance destination after the last store */
        bne     $18, 1b                 /* repeat until 118 lines are copied */
 
        /* Prefetch the final 5 cache lines of the read stream.
           $17 now points at source line 118 and the loop has already
           prefetched up to line 122, so offsets 320..576 cover lines
           123-127.  $18 is reloaded with the cleanup-loop count.  */
        lda     $18,10                  /* cleanup-loop count: 10 lines */
        ldl     $31,320($17)
        ldl     $31,384($17)
        ldl     $31,448($17)
 
        ldl     $31,512($17)
        ldl     $31,576($17)
        nop
        nop
 
        /* Non-prefetching, non-write-hinting cleanup loop for the
           final 10 cache lines.  By this point all 128 source lines have
           been prefetched and all 128 destination lines write-hinted, so
           only the plain copy remains.  */
2:      ldq     $0,0($17)
        ldq     $1,8($17)
        ldq     $2,16($17)
        ldq     $3,24($17)
 
        ldq     $4,32($17)
        ldq     $5,40($17)
        ldq     $6,48($17)
        ldq     $7,56($17)
 
        stq     $0,0($16)
        subq    $18,1,$18               /* one fewer line left for this loop */
        stq     $1,8($16)
        addq    $17,64,$17              /* advance source; this line's loads are all above */
 
        stq     $2,16($16)
        stq     $3,24($16)
        stq     $4,32($16)
        stq     $5,40($16)
 
        stq     $6,48($16)
        stq     $7,56($16)
        addq    $16,64,$16              /* advance destination after the last store */
        bne     $18, 2b                 /* repeat until the last 10 lines are copied */
 
        /* NOTE(review): the nop/unop/nop after ret presumably pad the final
           group out to four instructions -- confirm.  */
        ret
        nop
        unop
        nop
 
        .end copy_page

Browse

Tools

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [alpha/] [lib/] [ev6-copy_page.S] - Blame information for rev 1275

Line No.	Rev	Author	Line
1	1275	phoenix	`/*`
2			`* arch/alpha/lib/ev6-copy_page.S`
3			`*`
4			`* Copy an entire page.`
5			`*/`
6
7			`/* The following comparison of this routine vs the normal copy_page.S`
8			`was written by an unnamed ev6 hardware designer and forwarded to me`
9			`via Steven Hobbs.`
10
11			`First Problem: STQ overflows.`
12			`-----------------------------`
13
14			`It would be nice if EV6 handled every resource overflow efficiently,`
15			`but for some it doesn't. Including store queue overflows. It causes`
16			`a trap and a restart of the pipe.`
17
18			`To get around this we sometimes use (to borrow a term from a VSSAD`
19			`researcher) "aeration". The idea is to slow the rate at which the`
20			`processor receives valid instructions by inserting nops in the fetch`
21			`path. In doing so, you can prevent the overflow and actually make`
22			`the code run faster. You can, of course, take advantage of the fact`
23			`that the processor can fetch at most 4 aligned instructions per cycle.`
24
25			`I inserted enough nops to force it to take 10 cycles to fetch the`
26			`loop code. In theory, EV6 should be able to execute this loop in`
27			`9 cycles but I was not able to get it to run that fast -- the initial`
28			`conditions were such that I could not reach this optimum rate on`
29			`(chaotic) EV6. I wrote the code such that everything would issue`
30			`in order.`
31
32			`Second Problem: Dcache index matches.`
33			`-------------------------------------`
34
35			`If you are going to use this routine on random aligned pages, there`
36			`is a 25% chance that the pages will be at the same dcache indices.`
37			`Without care, this results in many nasty memory traps.`
38
39			`The solution is to schedule the prefetches to avoid the memory`
40			`conflicts. I schedule the wh64 prefetches farther ahead of the`
41			`read prefetches to avoid this problem.`
42
43			`Third Problem: Needs more prefetching.`
44			`--------------------------------------`
45
46			`In order to improve the code I added deeper prefetching to take the`
47			`most advantage of EV6's bandwidth.`
48
49			`I also prefetched the read stream. Note that adding the read prefetch`
50			`forced me to add another cycle to the inner-most kernel - up to 11`
51			`from the original 8 cycles per iteration. We could improve performance`
52			`further by unrolling the loop and doing multiple prefetches per cycle.`
53
54			`I think that the code below will be very robust and fast code for the`
55			`purposes of copying aligned pages. It is slower when both source and`
56			`destination pages are in the dcache, but it is my guess that this is`
57			`less important than the dcache miss case. */`
58
59
60			`.text`
61			`.align 4`
62			`.global copy_page`
63			`.ent copy_page`
64			`copy_page:`
65			`.prologue 0`
66
67			`/* Prefetch 5 read cachelines; write-hint 10 cache lines. */`
68			`wh64 ($16)`
69			`ldl $31,0($17)`
70			`ldl $31,64($17)`
71			`lda $1,1*64($16)`
72
73			`wh64 ($1)`
74			`ldl $31,128($17)`
75			`ldl $31,192($17)`
76			`lda $1,2*64($16)`
77
78			`wh64 ($1)`
79			`ldl $31,256($17)`
80			`lda $18,118`
81			`lda $1,3*64($16)`
82
83			`wh64 ($1)`
84			`nop`
85			`lda $1,4*64($16)`
86			`lda $2,5*64($16)`
87
88			`wh64 ($1)`
89			`wh64 ($2)`
90			`lda $1,6*64($16)`
91			`lda $2,7*64($16)`
92
93			`wh64 ($1)`
94			`wh64 ($2)`
95			`lda $1,8*64($16)`
96			`lda $2,9*64($16)`
97
98			`wh64 ($1)`
99			`wh64 ($2)`
100			`lda $19,10*64($16)`
101			`nop`
102
103			`/* Main prefetching/write-hinting loop. */`
104			`1: ldq $0,0($17)`
105			`ldq $1,8($17)`
106			`unop`
107			`unop`
108
109			`unop`
110			`unop`
111			`ldq $2,16($17)`
112			`ldq $3,24($17)`
113
114			`ldq $4,32($17)`
115			`ldq $5,40($17)`
116			`unop`
117			`unop`
118
119			`unop`
120			`unop`
121			`ldq $6,48($17)`
122			`ldq $7,56($17)`
123
124			`ldl $31,320($17)`
125			`unop`
126			`unop`
127			`unop`
128
129			`/* This gives the extra cycle of aeration above the minimum. */`
130			`unop`
131			`unop`
132			`unop`
133			`unop`
134
135			`wh64 ($19)`
136			`unop`
137			`unop`
138			`unop`
139
140			`stq $0,0($16)`
141			`subq $18,1,$18`
142			`stq $1,8($16)`
143			`unop`
144
145			`unop`
146			`stq $2,16($16)`
147			`addq $17,64,$17`
148			`stq $3,24($16)`
149
150			`stq $4,32($16)`
151			`stq $5,40($16)`
152			`addq $19,64,$19`
153			`unop`
154
155			`stq $6,48($16)`
156			`stq $7,56($16)`
157			`addq $16,64,$16`
158			`bne $18, 1b`
159
160			`/* Prefetch the final 5 cache lines of the read stream. */`
161			`lda $18,10`
162			`ldl $31,320($17)`
163			`ldl $31,384($17)`
164			`ldl $31,448($17)`
165
166			`ldl $31,512($17)`
167			`ldl $31,576($17)`
168			`nop`
169			`nop`
170
171			`/* Non-prefetching, non-write-hinting cleanup loop for the`
172			`final 10 cache lines. */`
173			`2: ldq $0,0($17)`
174			`ldq $1,8($17)`
175			`ldq $2,16($17)`
176			`ldq $3,24($17)`
177
178			`ldq $4,32($17)`
179			`ldq $5,40($17)`
180			`ldq $6,48($17)`
181			`ldq $7,56($17)`
182
183			`stq $0,0($16)`
184			`subq $18,1,$18`
185			`stq $1,8($16)`
186			`addq $17,64,$17`
187
188			`stq $2,16($16)`
189			`stq $3,24($16)`
190			`stq $4,32($16)`
191			`stq $5,40($16)`
192
193			`stq $6,48($16)`
194			`stq $7,56($16)`
195			`addq $16,64,$16`
196			`bne $18, 2b`
197
198			`ret`
199			`nop`
200			`unop`
201			`nop`
202
203			`.end copy_page`