OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [alpha/] [lib/] [ev6-copy_page.S] - Blame information for rev 1275

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs.

   First Problem: STQ overflows.
   -----------------------------

        It would be nice if EV6 handled every resource overflow efficiently,
        but for some it doesn't.  Including store queue overflows.  It causes
        a trap and a restart of the pipe.

        To get around this we sometimes use (to borrow a term from a VSSAD
        researcher) "aeration".  The idea is to slow the rate at which the
        processor receives valid instructions by inserting nops in the fetch
        path.  In doing so, you can prevent the overflow and actually make
        the code run faster.  You can, of course, take advantage of the fact
        that the processor can fetch at most 4 aligned instructions per cycle.

        I inserted enough nops to force it to take 10 cycles to fetch the
        loop code.  In theory, EV6 should be able to execute this loop in
        9 cycles but I was not able to get it to run that fast -- the initial
        conditions were such that I could not reach this optimum rate on
        (chaotic) EV6.  I wrote the code such that everything would issue
        in order.

   Second Problem: Dcache index matches.
   -------------------------------------

        If you are going to use this routine on random aligned pages, there
        is a 25% chance that the pages will be at the same dcache indices.
        Without care, this results in many nasty memory traps.

        The solution is to schedule the prefetches to avoid the memory
        conflicts.  I schedule the wh64 prefetches farther ahead of the
        read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

        In order to improve the code I added deeper prefetching to take the
        most advantage of EV6's bandwidth.

        I also prefetched the read stream. Note that adding the read prefetch
        forced me to add another cycle to the inner-most kernel - up to 11
        from the original 8 cycles per iteration.  We could improve performance
        further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */


/*
 * copy_page -- copy 128 cache lines of 64 bytes (8192 bytes in total).
 *
 * Register usage, as established by the code below:
 *   $16      destination pointer (target of every stq), advanced by 64
 *            per copied line
 *   $17      source pointer (base of every ldq), advanced by 64 per
 *            copied line
 *   $18      remaining-line counter: 118 for the main loop, then 10 for
 *            the cleanup loop (118 + 10 = 128 lines)
 *   $19      write-hint pointer, kept 10 lines (640 bytes) ahead of $16
 *   $0-$7    the eight quadwords of the line currently being copied
 *   $1, $2   additionally used as scratch addresses for the initial
 *            wh64 write hints, before the main loop starts
 *
 * An ldl into $31 discards the loaded value; it is used here purely as
 * a read prefetch.  Instructions are laid out in groups of four, and the
 * nop/unop padding is deliberate ("aeration", see the discussion at the
 * top of this file).  Do not remove or reorder instructions without
 * re-measuring on EV6 hardware.
 */
        .text
        .align 4
        .global copy_page
        .ent copy_page
copy_page:
        .prologue 0

        /* Prefetch 5 read cache lines (source lines 0-4) and write-hint
           10 cache lines (destination lines 0-9).  */
        wh64    ($16)
        ldl     $31,0($17)
        ldl     $31,64($17)
        lda     $1,1*64($16)

        wh64    ($1)
        ldl     $31,128($17)
        ldl     $31,192($17)
        lda     $1,2*64($16)

        wh64    ($1)
        ldl     $31,256($17)
        lda     $18,118                 /* main-loop iteration count */
        lda     $1,3*64($16)

        wh64    ($1)
        nop
        lda     $1,4*64($16)
        lda     $2,5*64($16)

        wh64    ($1)
        wh64    ($2)
        lda     $1,6*64($16)
        lda     $2,7*64($16)

        wh64    ($1)
        wh64    ($2)
        lda     $1,8*64($16)
        lda     $2,9*64($16)

        wh64    ($1)
        wh64    ($2)
        lda     $19,10*64($16)          /* next line to write-hint */
        nop

        /* Main prefetching/write-hinting loop: copies one 64-byte line
           per iteration, 118 iterations.  */
1:      ldq     $0,0($17)
        ldq     $1,8($17)
        unop
        unop

        unop
        unop
        ldq     $2,16($17)
        ldq     $3,24($17)

        ldq     $4,32($17)
        ldq     $5,40($17)
        unop
        unop

        unop
        unop
        ldq     $6,48($17)
        ldq     $7,56($17)

        ldl     $31,320($17)            /* read-prefetch 5 lines ahead */
        unop
        unop
        unop

        /* This gives the extra cycle of aeration above the minimum.  */
        unop
        unop
        unop
        unop

        wh64    ($19)                   /* write-hint 10 lines ahead */
        unop
        unop
        unop

        stq     $0,0($16)
        subq    $18,1,$18
        stq     $1,8($16)
        unop

        unop
        stq     $2,16($16)
        addq    $17,64,$17
        stq     $3,24($16)

        stq     $4,32($16)
        stq     $5,40($16)
        addq    $19,64,$19
        unop

        stq     $6,48($16)
        stq     $7,56($16)
        addq    $16,64,$16
        bne     $18, 1b

        /* Prefetch the final 5 cache lines of the read stream.  $17 now
           points at line 118, so offsets 320..576 address lines 123-127.  */
        lda     $18,10                  /* cleanup-loop iteration count */
        ldl     $31,320($17)
        ldl     $31,384($17)
        ldl     $31,448($17)

        ldl     $31,512($17)
        ldl     $31,576($17)
        nop
        nop

        /* Non-prefetching, non-write-hinting cleanup loop for the
           final 10 cache lines.  */
2:      ldq     $0,0($17)
        ldq     $1,8($17)
        ldq     $2,16($17)
        ldq     $3,24($17)

        ldq     $4,32($17)
        ldq     $5,40($17)
        ldq     $6,48($17)
        ldq     $7,56($17)

        stq     $0,0($16)
        subq    $18,1,$18
        stq     $1,8($16)
        addq    $17,64,$17

        stq     $2,16($16)
        stq     $3,24($16)
        stq     $4,32($16)
        stq     $5,40($16)

        stq     $6,48($16)
        stq     $7,56($16)
        addq    $16,64,$16
        bne     $18, 2b

        ret
        nop
        unop
        nop

        .end copy_page

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.