OpenCores
URL https://opencores.org/ocsvn/or1k_soc_on_altera_embedded_dev_kit/or1k_soc_on_altera_embedded_dev_kit/trunk

Subversion Repositories or1k_soc_on_altera_embedded_dev_kit

[/] [or1k_soc_on_altera_embedded_dev_kit/] [trunk/] [linux-2.6/] [linux-2.6.24/] [arch/] [alpha/] [lib/] [ev6-memcpy.S] - Blame information for rev 17

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 3 xianfeng
/*
2
 * arch/alpha/lib/ev6-memcpy.S
3
 * 21264 version by Rick Gorton 
4
 *
5
 * Reasonably optimized memcpy() routine for the Alpha 21264
6
 *
7
 *      - memory accessed as aligned quadwords only
8
 *      - large same-alignment copies use a 64-byte unrolled loop with wh64 write hints
9
 *
10
 * Much of the information about 21264 scheduling/coding comes from:
11
 *      Compiler Writer's Guide for the Alpha 21264
12
 *      abbreviated as 'CWG' in other comments here
13
 *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
14
 * Scheduling notation:
15
 *      E       - either cluster
16
 *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
17
 *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
18
 *
19
 * Temp usage notes:
20
 *      $1-$7           - scratch
21
 */
22
 
23
        .set noreorder                  # ask the assembler not to reorder the hand-scheduled code
24
        .set noat                       # ask the assembler not to use its $at temporary
25
 
/*
 * memcpy - copy $18 bytes from the buffer at $17 to the buffer at $16.
 *
 * In:    $16 = dest, $17 = src, $18 = byte count.  The count is tested
 *        with signed branches, so a count <= 0 copies nothing.
 * Out:   $0  = original dest.
 * Uses:  $1-$7 as scratch; $16, $17 and $18 are modified.
 *        Returns through $26; no stack frame is set up.
 *
 * Two paths, chosen by comparing the low 3 bits of src and dest:
 *
 *   same alignment mod 8:
 *      $head_align        bytes until dest (and so src) is 0mod8
 *      $single_head_quad  quads until dest is 0mod64 (count >= 128 only)
 *      $unroll_body       64 bytes per trip, with wh64 one block ahead
 *      $move_a_quad       remaining whole quads
 *      $tail_bytes        remaining bytes
 *
 *   different alignment mod 8 ($misaligned):
 *      $aligndest         bytes until dest is 0mod8
 *      $mis_quad          ldq_u pairs merged with extql/extqh, aligned stq
 *      $misalign_byte     remaining bytes
 *
 * Instructions are laid out in groups of four, padded with nops, and
 * each carries a scheduling tag (E/U/L) as described at the top of the
 * file.
 */
26
        .align  4
27
        .globl memcpy
28
        .ent memcpy
29
memcpy:
30
        .frame $30,0,$26,0              # frame size 0, return address in $26
31
        .prologue 0
32
 
33
        mov     $16, $0                 # E : copy dest to return
34
        ble     $18, $nomoredata        # U : done with the copy?
35
        xor     $16, $17, $1            # E : are source and dest alignments the same?
36
        and     $1, 7, $1               # E : are they the same mod 8?
37
 
38
        bne     $1, $misaligned         # U : Nope - gotta do this the slow way
39
        /* source and dest are same mod 8 address */
40
        and     $16, 7, $1              # E : Are both 0mod8?
41
        beq     $1, $both_0mod8         # U : Yes
42
        nop                             # E :
43
 
44
        /*
45
         * source and dest are same misalignment.  move a byte at a time
46
         * until a 0mod8 alignment for both is reached.
47
         * At least one byte more to move
48
         */
49
 
50
$head_align:
51
        ldbu    $1, 0($17)              # L : grab a byte
52
        subq    $18, 1, $18             # E : count--
53
        addq    $17, 1, $17             # E : src++
54
        stb     $1, 0($16)              # L :
55
        addq    $16, 1, $16             # E : dest++
56
        and     $16, 7, $1              # E : Are we at 0mod8 yet?
57
        ble     $18, $nomoredata        # U : done with the copy?
58
        bne     $1, $head_align         # U :
59
 
        /*
         * Both pointers are now 0mod8.  Copies of 127 bytes or fewer skip
         * the unrolled loop entirely and go straight to the quad loop.
         */
60
$both_0mod8:
61
        cmple   $18, 127, $1            # E : Can we unroll the loop?
62
        bne     $1, $no_unroll          # U :
63
        and     $16, 63, $1             # E : get mod64 alignment
64
        beq     $1, $do_unroll          # U : no single quads to fiddle
65
 
        /* Copy one quad at a time until dest is 0mod64. */
66
$single_head_quad:
67
        ldq     $1, 0($17)              # L : get 8 bytes
68
        subq    $18, 8, $18             # E : count -= 8
69
        addq    $17, 8, $17             # E : src += 8
70
        nop                             # E :
71
 
72
        stq     $1, 0($16)              # L : store
73
        addq    $16, 8, $16             # E : dest += 8
74
        and     $16, 63, $1             # E : get mod64 alignment
75
        bne     $1, $single_head_quad   # U : still not fully aligned
76
 
        /*
         * dest is 0mod64.  The count is re-tested because the quads copied
         * in $single_head_quad may have taken it below 128.  $7 is the wh64
         * address and starts one 64-byte block ahead of dest.
         */
77
$do_unroll:
78
        addq    $16, 64, $7             # E : Initial (+1 trip) wh64 address
79
        cmple   $18, 127, $1            # E : Can we go through the unrolled loop?
80
        bne     $1, $tail_quads         # U : Nope
81
        nop                             # E :
82
 
        /*
         * One trip copies 64 bytes as two 32-byte halves.  $2 = count - 192
         * is computed from the count at the top of the trip; when it is
         * negative the wh64 address for the next trip is replaced with the
         * fallback in $1 (this trip's dest + 64) instead of the advanced $7.
         */
83
$unroll_body:
84
        wh64    ($7)                    # L1 : memory subsystem hint: 64 bytes at
85
                                        # ($7) are about to be over-written
86
        ldq     $6, 0($17)              # L0 : bytes 0..7
87
        nop                             # E :
88
        nop                             # E :
89
 
90
        ldq     $4, 8($17)              # L : bytes 8..15
91
        ldq     $5, 16($17)             # L : bytes 16..23
92
        addq    $7, 64, $7              # E : Update next wh64 address
93
        nop                             # E :
94
 
95
        ldq     $3, 24($17)             # L : bytes 24..31
96
        addq    $16, 64, $1             # E : fallback value for wh64
97
        nop                             # E :
98
        nop                             # E :
99
 
100
        addq    $17, 32, $17            # E : src += 32 bytes
101
        stq     $6, 0($16)              # L : bytes 0..7
102
        nop                             # E :
103
        nop                             # E :
104
 
105
        stq     $4, 8($16)              # L : bytes 8..15
106
        stq     $5, 16($16)             # L : bytes 16..23
107
        subq    $18, 192, $2            # E : At least two more trips to go?
108
        nop                             # E :
109
 
110
        stq     $3, 24($16)             # L : bytes 24..31
111
        addq    $16, 32, $16            # E : dest += 32 bytes
112
        nop                             # E :
113
        nop                             # E :
114
 
        /* Second 32-byte half; src and dest were already advanced by 32. */
115
        ldq     $6, 0($17)              # L : bytes 0..7
116
        ldq     $4, 8($17)              # L : bytes 8..15
117
        cmovlt  $2, $1, $7              # E : Latency 2, extra map slot - Use
118
                                        # fallback wh64 address if < 2 more trips
119
        nop                             # E :
120
 
121
        ldq     $5, 16($17)             # L : bytes 16..23
122
        ldq     $3, 24($17)             # L : bytes 24..31
123
        addq    $16, 32, $16            # E : dest += 32
124
        subq    $18, 64, $18            # E : count -= 64
125
 
        /* dest was bumped before these stores, hence the negative offsets. */
126
        addq    $17, 32, $17            # E : src += 32
127
        stq     $6, -32($16)            # L : bytes 0..7
128
        stq     $4, -24($16)            # L : bytes 8..15
129
        cmple   $18, 63, $1             # E : At least one more trip?
130
 
131
        stq     $5, -16($16)            # L : bytes 16..23
132
        stq     $3, -8($16)             # L : bytes 24..31
133
        nop                             # E :
134
        beq     $1, $unroll_body        # U : count >= 64, go around again
135
 
        /*
         * Fewer than 64 bytes remain after the unrolled loop, or the copy
         * was too small to unroll.  The count is biased by -8 for the
         * duration of the quad loop so the loop test is a sign check.
         */
136
$tail_quads:
137
$no_unroll:
138
        .align 4
139
        subq    $18, 8, $18             # E : At least a quad left?
140
        blt     $18, $less_than_8       # U : Nope
141
        nop                             # E :
142
        nop                             # E :
143
 
144
$move_a_quad:
145
        ldq     $1, 0($17)              # L : fetch 8
146
        subq    $18, 8, $18             # E : count -= 8
147
        addq    $17, 8, $17             # E : src += 8
148
        nop                             # E :
149
 
150
        stq     $1, 0($16)              # L : store 8
151
        addq    $16, 8, $16             # E : dest += 8
152
        bge     $18, $move_a_quad       # U :
153
        nop                             # E :
154
 
        /* Undo the -8 bias; what is left (0..7) is copied a byte at a time. */
155
$less_than_8:
156
        .align 4
157
        addq    $18, 8, $18             # E : add back for trailing bytes
158
        ble     $18, $nomoredata        # U : All-done
159
        nop                             # E :
160
        nop                             # E :
161
 
162
        /* Trailing bytes */
163
$tail_bytes:
164
        subq    $18, 1, $18             # E : count--
165
        ldbu    $1, 0($17)              # L : fetch a byte
166
        addq    $17, 1, $17             # E : src++
167
        nop                             # E :
168
 
169
        stb     $1, 0($16)              # L : store a byte
170
        addq    $16, 1, $16             # E : dest++
171
        bgt     $18, $tail_bytes        # U : more to be done?
172
        nop                             # E :
173
 
174
        /* branching to exit takes 3 extra cycles, so replicate exit here */
175
        ret     $31, ($26), 1           # L0 :
176
        nop                             # E :
177
        nop                             # E :
178
        nop                             # E :
179
 
        /*
         * Source and dest differ mod 8.  Nothing has been stored yet, so $0
         * still equals dest.  $4 takes over as the running dest pointer
         * because $16 is reused as a data register in $mis_quad.
         */
180
$misaligned:
181
        mov     $0, $4                  # E : dest temp
182
        and     $0, 7, $1               # E : dest alignment mod8
183
        beq     $1, $dest_0mod8         # U : life doesnt totally suck
184
        nop
185
 
        /* Copy a byte at a time until dest is 0mod8 or the count runs out. */
186
$aligndest:
187
        ble     $18, $nomoredata        # U :
188
        ldbu    $1, 0($17)              # L : fetch a byte
189
        subq    $18, 1, $18             # E : count--
190
        addq    $17, 1, $17             # E : src++
191
 
192
        stb     $1, 0($4)               # L : store it
193
        addq    $4, 1, $4               # E : dest++
194
        and     $4, 7, $1               # E : dest 0mod8 yet?
195
        bne     $1, $aligndest          # U : go until we are aligned.
196
 
197
        /* Source has unknown alignment, but dest is known to be 0mod8 */
198
$dest_0mod8:
199
        subq    $18, 8, $18             # E : At least a quad left?
200
        blt     $18, $misalign_tail     # U : Nope
201
        ldq_u   $3, 0($17)              # L : seed (rotating load) of 8 bytes
202
        nop                             # E :
203
 
        /*
         * $3 holds the quad loaded for the current src position and $16
         * receives the following one.  extql/extqh shift each by the byte
         * offset in the low bits of $17; the two pieces are ORed into one
         * quad that is stored to the aligned dest.  $16 then becomes the
         * next trip's $3.
         */
204
$mis_quad:
205
        ldq_u   $16, 8($17)             # L : Fetch next 8
206
        extql   $3, $17, $3             # U : masking
207
        extqh   $16, $17, $1            # U : masking
208
        bis     $3, $1, $1              # E : merged bytes to store
209
 
210
        subq    $18, 8, $18             # E : count -= 8
211
        addq    $17, 8, $17             # E : src += 8
212
        stq     $1, 0($4)               # L : store 8 (aligned)
213
        mov     $16, $3                 # E : "rotate" source data
214
 
215
        addq    $4, 8, $4               # E : dest += 8
216
        bge     $18, $mis_quad          # U : More quads to move
217
        nop
218
        nop
219
 
        /* Undo the -8 bias; copy the last 0..7 bytes one at a time. */
220
$misalign_tail:
221
        addq    $18, 8, $18             # E : account for tail stuff
222
        ble     $18, $nomoredata        # U :
223
        nop
224
        nop
225
 
226
$misalign_byte:
227
        ldbu    $1, 0($17)              # L : fetch 1
228
        subq    $18, 1, $18             # E : count--
229
        addq    $17, 1, $17             # E : src++
230
        nop                             # E :
231
 
232
        stb     $1, 0($4)               # L : store
233
        addq    $4, 1, $4               # E : dest++
234
        bgt     $18, $misalign_byte     # U : more to go?
235
        nop
236
 
237
 
        /* Common exit; $0 was set to the original dest on entry. */
238
$nomoredata:
239
        ret     $31, ($26), 1           # L0 :
240
        nop                             # E :
241
        nop                             # E :
242
        nop                             # E :
243
 
244
        .end memcpy
245
 
246
/*
 * For backwards module compatibility.
 * Defines __memcpy as a second global symbol with the same value as
 * memcpy (presumably for modules that still reference the older name).
 */
247
__memcpy = memcpy
248
.globl __memcpy

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.