linux-2.6.24/arch/mips/lib/memcpy.S  (or1k_soc_on_altera_embedded_dev_kit, trunk, rev 3)

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
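
/*
 * Caller's view, roughly (a hedged sketch of how the result is consumed by
 * the copy_{to,from}_user wrappers built on this routine, not this file's
 * own calling convention):
 *
 *     if (copy_from_user(kbuf, ubuf, len))      returns #bytes NOT copied
 *             return -EFAULT;                   partial copy => fail
 */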

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
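
/*
 * EXC() emits the access at a local label 9: and records a __ex_table
 * entry pairing that instruction's address with the given fixup handler,
 * so a fault during the access jumps to the handler instead of oopsing.
 */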

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we share a code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
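/*
 * Concretely (per the kernel's asm/regdef.h): under the n64 ABI $12-$15
 * are named t0-t3 and $8-$11 are a4-a7, whereas o32 names $8-$15 t0-t7.
 * The redefinitions below therefore pin t0-t7 to $8-$15 so this shared
 * code uses the o32 names on both ABIs.
 */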
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif
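
/*
 * LDFIRST/LDREST (and STFIRST/STREST) are the unaligned-access pair:
 * LDFIRST picks up the bytes from the given, possibly misaligned, address
 * through the end of its aligned word, and LDREST fills in the remainder,
 * so together they assemble one full register from an unaligned source.
 * Which of lwl/lwr (ldl/ldr) plays which role depends on endianness, hence
 * the two definitions above.  SHIFT_DISCARD shifts the bytes beyond the
 * requested length out of a loaded word before the partial store in the
 * aligned tail code.
 */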

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
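
/*
 * FIRST(n)/REST(n) give the offsets of the first and last byte of the n-th
 * register-sized unit, i.e. the two addresses handed to the LDFIRST/LDREST
 * (STFIRST/STREST) pair; UNIT(n) is the aligned offset used by plain
 * LOAD/STORE.  ADDRMASK selects the sub-word offset bits of an address.
 */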

        .text
        .set    noreorder
        .set    noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
#define rem t8

        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
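        /*
         * Dispatch: branch to copy_bytes_checklen when len < NBYTES, to
         * dst_unaligned when dst is not NBYTES-aligned, and to
         * src_unaligned_dst_aligned when only src is misaligned; otherwise
         * fall through into both_aligned.
         */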
        PREF(   0, 0(src) )
        PREF(   1, 0(dst) )
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        PREF(   0, 1*32(src) )
        PREF(   1, 1*32(dst) )
        bnez    t2, copy_bytes_checklen
         and    t0, src, ADDRMASK
        PREF(   0, 2*32(src) )
        PREF(   1, 2*32(dst) )
        bnez    t1, dst_unaligned
         nop
        bnez    t0, src_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
both_aligned:
         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
        beqz    t0, cleanup_both_aligned # len < 8*NBYTES
         and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
        PREF(   0, 3*32(src) )
        PREF(   1, 3*32(dst) )
        .align  4
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
        ADD     src, src, 8*NBYTES
        ADD     dst, dst, 8*NBYTES
EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
        PREF(   0, 8*32(src) )
        PREF(   1, 8*32(dst) )
        bne     len, rem, 1b
         nop

        /*
         * len == rem == the number of bytes left to copy < 8*NBYTES
         */
cleanup_both_aligned:
        beqz    len, done
         sltu   t0, len, 4*NBYTES
        bnez    t0, less_than_4units
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        beqz    len, done
         ADD    dst, dst, 4*NBYTES
less_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LOAD    t0, 0(src),             l_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        bne     rem, len, 1b
         ADD    dst, dst, NBYTES

        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
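        /*
         * Worked example (little-endian, NBYTES == 4, 3 bytes left):
         * rem = 24 bits to keep, bits = 8 bits to discard.  SHIFT_DISCARD
         * (sllv here) pushes the three wanted bytes into the upper end of
         * t0, and STREST (swl) at dst+len-1 then writes exactly those three
         * bytes without touching the byte just past the destination.
         */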
#define bits t2
        beqz    len, done
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
EXC(    STREST  t0, -1(t1),             s_exc)
        jr      ra
         move   len, zero
dst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; t1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
        beq     len, t2, done
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, both_aligned
         ADD    src, src, t2

src_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        PREF(   0, 3*32(src) )
        beqz    t0, cleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
        PREF(   1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
        bne     len, rem, 1b
         ADD    dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        bne     len, rem, 1b
         ADD    dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
         nop
copy_bytes:
        /* 0 < len < NBYTES  */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(     sb     t0, N(dst), s_exc_p1)
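
/*
 * Each COPY_BYTE(N) copies byte N and branches to done as soon as the
 * count hits zero.  Only NBYTES-2 expansions are needed: the final byte,
 * at offset NBYTES-2, is copied inline below with its store placed in the
 * jr ra delay slot.
 */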

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
         nop
        END(memcpy)

l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        bne     src, t0, 1b
         ADD    dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         *   See (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in dst
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
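        /*
         * src is no longer needed at this point, so it is reused below as
         * the down-counter for the zero-fill loop.
         */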
        beqz    len, done
         SUB    src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
         SUB    src, src, 1
        jr      ra
         nop


#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
         ADD    len, len, n*NBYTES
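
/*
 * s_exc_pNu: a STORE in one of the unrolled loops faulted with N units
 * (including the faulting one) not yet written.  len was already
 * decremented for the whole block, so add those N*NBYTES back before
 * returning, reporting them as uncopied.
 */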

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
         ADD    len, len, 1
s_exc:
        jr      ra
         nop
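
/*
 * memmove: if [src, src+len) and [dst, dst+len) do not overlap, the test
 * below tail-calls __memcpy; otherwise execution falls through to
 * __rmemcpy, which copies forward when dst is at or below src and
 * backward (from the top end) when dst is above src, one byte at a time.
 */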

        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0                      # dst + len <= src -> memcpy
        sltu    t1, a0, t1                      # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
         move   v0, a0                          /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
        sltu    t0, a1, a0
        beqz    t0, r_end_bytes_up              # src >= dst
         nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
         SUB    a0, a0, 0x1

r_out:
        jr      ra
         move   a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
         ADD    a0, a0, 0x1

        jr      ra
         move   a2, zero
        END(__rmemcpy)
