/*
 * arch/alpha/lib/ev6-strncpy_from_user.S
 * 21264 version contributed by Rick Gorton
 *
 * Just like strncpy except in the return value:
 *
 * -EFAULT       if an exception occurs before the terminator is copied.
 * N             if the buffer filled.
 *
 * Otherwise the length of the string is returned.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *      Compiler Writer's Guide for the Alpha 21264
 *      abbreviated as 'CWG' in other comments here
 *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *      E       - either cluster
 *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * A bunch of instructions got moved and temp registers were changed
 * to aid in scheduling.  Control flow was also re-arranged to eliminate
 * branches, and to provide longer code sequences to enable better scheduling.
 * A total rewrite (using byte load/stores for start & tail sequences)
 * is desirable, but very difficult to do without a from-scratch rewrite.
 * Save that for the future.
 */

#include <asm/errno.h>
#include <asm/regdef.h>

/* Allow an exception for an insn; exit if we get one.  */
#define EX(x,y...)                      \
        99: x,##y;                      \
        .section __ex_table,"a";        \
        .long 99b - .;                  \
        lda $31, $exception-99b($0);    \
        .previous
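
/* Each EX() use adds an entry to __ex_table: the .long records where
   the protected access lives, and the fake lda encodes the fixup -
   resume at $exception, with the error code (-EFAULT) deposited in
   v0 ($0) by the fault handler - which is how a bad user pointer
   becomes the -EFAULT return described above.  */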

        .set noat
        .set noreorder
        .text

        .globl __strncpy_from_user
        .ent __strncpy_from_user
        .frame $30, 0, $26
        .prologue 0

        .align 4
__strncpy_from_user:
        and     a0, 7, t3       # E : find dest misalignment
        beq     a2, $zerolength # U :

        /* Are source and destination co-aligned?  */
        mov     a0, v0          # E : save the string start
        xor     a0, a1, t4      # E :
        EX( ldq_u t1, 0(a1) )   # L : Latency=3 load first quadword
        ldq_u   t0, 0(a0)       # L : load first (partial) aligned dest quadword

        addq    a2, t3, a2      # E : bias count by dest misalignment
        subq    a2, 1, a3       # E :
        addq    zero, 1, t10    # E :
        and     t4, 7, t4       # E : misalignment between the two

        and     a3, 7, t6       # E : number of tail bytes
        sll     t10, t6, t10    # E : t10 = bitmask of last count byte
        bne     t4, $unaligned  # U :
        lda     t2, -1          # E : build a mask against false zero

        /*
         * We are co-aligned; take care of a partial first word.
         * On entry to this basic block:
         * t0 == the first destination word for masking back in
         * t1 == the first source word.
         */

        srl     a3, 3, a2       # E : a2 = loop counter = (count - 1)/8
        addq    a1, 8, a1       # E :
        mskqh   t2, a1, t2      # U :   detection in the src word
        nop

        /* Create the 1st output word and detect 0's in the 1st input word.  */
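        /* Zero detection idiom: cmpbge zero, x, y does eight unsigned
           byte compares in parallel and sets bit i of y iff byte i of
           x is 0x00; e.g. x = 0x00000000656e6f64 ("done" plus nulls)
           gives y = 0xf0.  A non-zero mask therefore pinpoints the
           terminator within a quadword.  ornot below ORs the
           complement of the t2 mask built above into the first source
           word, so the garbage bytes before the string start read as
           0xff and can never look like a terminator.  */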
        mskqh   t1, a1, t3      # U :
        mskql   t0, a1, t0      # U : assemble the first output word
        ornot   t1, t2, t2      # E :
        nop

        cmpbge  zero, t2, t8    # E : bits set iff null found
        or      t0, t3, t0      # E :
        beq     a2, $a_eoc      # U :
        bne     t8, $a_eos      # U : 2nd branch in a quad.  Bad.

        /* On entry to this basic block:
         * t0 == a source quad not containing a null.
         * a0 - current aligned destination address
         * a1 - current aligned source address
         * a2 - count of quadwords to move.
         * NOTE: Loop improvement - unrolling this is going to be
         *      a huge win, since we're going to stall otherwise.
         *      Fix this later.  For _really_ large copies, look
         *      at using wh64 on a look-ahead basis.  See the code
         *      in clear_user.S and copy_user.S.
         * Presumably, since (a0) and (a1) do not overlap (by C definition)
         * Lots of nops here:
         *      - Separate loads from stores
         *      - Keep it to 1 branch/quadpack so the branch predictor
         *        can train.
         */
$a_loop:
        stq_u   t0, 0(a0)       # L :
        addq    a0, 8, a0       # E :
        nop
        subq    a2, 1, a2       # E :

        EX( ldq_u t0, 0(a1) )   # L :
        addq    a1, 8, a1       # E :
        cmpbge  zero, t0, t8    # E : Stall 2 cycles on t0
        beq     a2, $a_eoc      # U :

        beq     t8, $a_loop     # U :
        nop
        nop
        nop

        /* Take care of the final (partial) word store.  At this point
         * the end-of-count bit is set in t8 iff it applies.
         *
         * On entry to this basic block we have:
         * t0 == the source word containing the null
         * t8 == the cmpbge mask that found it.
         */
$a_eos:
        negq    t8, t12         # E : find low bit set
        and     t8, t12, t12    # E :

        /* We're doing a partial word store and so need to combine
           our source and original destination words.  */
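        /* negq/and isolate the lowest set bit of t8, so t12 marks the
           byte position of the last byte to store (the null, or the
           final counted byte).  t8 = t12 | (t12 - 1) then covers every
           byte up to and including it: zapnot keeps those bytes from
           the source word, zap keeps the rest from the original
           destination word, and the or below merges the two.  */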
        ldq_u   t1, 0(a0)       # L :
        subq    t12, 1, t6      # E :

        or      t12, t6, t8     # E :
        zapnot  t0, t8, t0      # U : clear src bytes > null
        zap     t1, t8, t1      # U : clear dst bytes <= null
        or      t0, t1, t0      # E :

        stq_u   t0, 0(a0)       # L :
        br      $finish_up      # L0 :
        nop
        nop

        /* Add the end-of-count bit to the eos detection bitmask.  */
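        /* t10 was built at entry as a single bit marking the byte
           slot, within the final aligned destination word, that the
           count allows us to fill.  Folding it into the cmpbge mask
           lets the end-of-count case reuse the $a_eos store path as
           if a terminator had been found there.  */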
        .align 4
$a_eoc:
        or      t10, t8, t8
        br      $a_eos
        nop
        nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

        .align 4
$u_head:
        /* We know just enough now to be able to assemble the first
           full source word.  We can still find a zero at the end of it
           that prevents us from outputting the whole thing.

           On entry to this basic block:
           t0 == the first dest word, unmasked
           t1 == the shifted low bits of the first source word
           t6 == bytemask that is -1 in dest word bytes */

        EX( ldq_u t2, 8(a1) )   # L : load second src word
        addq    a1, 8, a1       # E :
        mskql   t0, a0, t0      # U : mask trailing garbage in dst
        extqh   t2, a1, t4      # U :

        or      t1, t4, t1      # E : first aligned src word complete
        mskqh   t1, a0, t1      # U : mask leading garbage in src
        or      t0, t1, t0      # E : first output word complete
        or      t0, t6, t6      # E : mask original data for zero test

        cmpbge  zero, t6, t8    # E :
        beq     a2, $u_eocfin   # U :
        bne     t8, $u_final    # U : bad news - 2nd branch in a quad
        lda     t6, -1          # E : mask out the bits we have

        mskql   t6, a1, t6      # U :   already seen
        stq_u   t0, 0(a0)       # L : store first output word
        or      t6, t2, t2      # E :
        cmpbge  zero, t2, t8    # E : find nulls in second partial

        addq    a0, 8, a0               # E :
        subq    a2, 1, a2               # E :
        bne     t8, $u_late_head_exit   # U :
        nop

        /* Finally, we've got all the stupid leading edge cases taken care
           of and we can set up to enter the main loop.  */

        extql   t2, a1, t1      # U : position hi-bits of lo word
        EX( ldq_u t2, 8(a1) )   # L : read next high-order source word
        addq    a1, 8, a1       # E :
        cmpbge  zero, t2, t8    # E :

        beq     a2, $u_eoc      # U :
        bne     t8, $u_eos      # U :
        nop
        nop

        /* Unaligned copy main loop.  In order to avoid reading too much,
           the loop is structured to detect zeros in aligned source words.
           This has, unfortunately, effectively pulled half of a loop
           iteration out into the head and half into the tail, but it does
           prevent nastiness from accumulating in the very thing we want
           to run as fast as possible.

           On entry to this basic block:
           t1 == the shifted high-order bits from the previous source word
           t2 == the unshifted current source word

           We further know that t2 does not contain a null terminator.  */

        /*
         * Extra nops here:
         *      separate load quads from store quads
         *      only one branch/quad to permit predictor training
         */
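        /*
         * Each iteration pairs extqh (the bytes of the new source word
         * that complete the current destination word) with extql (the
         * bytes carried forward for the next one), so two successive
         * unaligned source quadwords are reassembled into one aligned
         * store.
         */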
        .align 4
$u_loop:
        extqh   t2, a1, t0      # U : extract high bits for current word
        addq    a1, 8, a1       # E :
        extql   t2, a1, t3      # U : extract low bits for next time
        addq    a0, 8, a0       # E :

        or      t0, t1, t0      # E : current dst word now complete
        EX( ldq_u t2, 0(a1) )   # L : load high word for next time
        subq    a2, 1, a2       # E :
        nop

        stq_u   t0, -8(a0)      # L : save the current word
        mov     t3, t1          # E :
        cmpbge  zero, t2, t8    # E : test new word for eos
        beq     a2, $u_eoc      # U :

        beq     t8, $u_loop     # U :
        nop
        nop
        nop

        /* We've found a zero somewhere in the source word we just read.
           If it resides in the lower half, we have one (probably partial)
           word to write out, and if it resides in the upper half, we
           have one full and one partial word left to write out.

           On entry to this basic block:
           t1 == the shifted high-order bits from the previous source word
           t2 == the unshifted current source word.  */
        .align 4
$u_eos:
        extqh   t2, a1, t0      # U :
        or      t0, t1, t0      # E : first (partial) source word complete
        cmpbge  zero, t0, t8    # E : is the null in this first bit?
        nop

        bne     t8, $u_final    # U :
        stq_u   t0, 0(a0)       # L : the null was in the high-order bits
        addq    a0, 8, a0       # E :
        subq    a2, 1, a2       # E :

        .align 4
$u_late_head_exit:
        extql   t2, a1, t0      # U :
        cmpbge  zero, t0, t8    # E :
        or      t8, t10, t6     # E :
        cmoveq  a2, t6, t8      # E :

        /* Take care of a final (probably partial) result word.
           On entry to this basic block:
           t0 == assembled source word
           t8 == cmpbge mask that found the null.  */
        .align 4
$u_final:
        negq    t8, t6          # E : isolate low bit set
        and     t6, t8, t12     # E :
        ldq_u   t1, 0(a0)       # L :
        subq    t12, 1, t6      # E :

        or      t6, t12, t8     # E :
        zapnot  t0, t8, t0      # U : kill source bytes > null
        zap     t1, t8, t1      # U : kill dest bytes <= null
        or      t0, t1, t0      # E :

        stq_u   t0, 0(a0)       # E :
        br      $finish_up      # U :
        nop
        nop

        .align 4
$u_eoc:                         # end-of-count
        extqh   t2, a1, t0      # U :
        or      t0, t1, t0      # E :
        cmpbge  zero, t0, t8    # E :
        nop

        .align 4
$u_eocfin:                      # end-of-count, final word
        or      t10, t8, t8     # E :
        br      $u_final        # U :
        nop
        nop

        /* Unaligned copy entry point.  */
        .align 4
$unaligned:

        srl     a3, 3, a2       # U : a2 = loop counter = (count - 1)/8
        and     a0, 7, t4       # E : find dest misalignment
        and     a1, 7, t5       # E : find src misalignment
        mov     zero, t0        # E :

        /* Conditionally load the first destination word and a bytemask
           with 0xff indicating that the destination byte is sacrosanct.  */

        mov     zero, t6        # E :
        beq     t4, 1f          # U :
        ldq_u   t0, 0(a0)       # L :
        lda     t6, -1          # E :

        mskql   t6, a0, t6      # E :
        nop
        nop
        nop

        .align 4
1:
        subq    a1, t4, a1      # E : sub dest misalignment from src addr
        /* If source misalignment is larger than dest misalignment, we need
           extra startup checks to avoid SEGV.  */
        cmplt   t4, t5, t12     # E :
        extql   t1, a1, t1      # U : shift src into place
        lda     t2, -1          # E : for creating masks later

        beq     t12, $u_head    # U :
        mskqh   t2, t5, t2      # U : begin src byte validity mask
        cmpbge  zero, t1, t8    # E : is there a zero?
        nop

        extql   t2, a1, t2      # U :
        or      t8, t10, t5     # E : test for end-of-count too
        cmpbge  zero, t2, t3    # E :
        cmoveq  a2, t5, t8      # E : Latency=2, extra map slot

        nop                     # E : goes with cmov
        andnot  t8, t3, t8      # E :
        beq     t8, $u_head     # U :
        nop

        /* At this point we've found a zero in the first partial word of
           the source.  We need to isolate the valid source data and mask
           it into the original destination data.  (Incidentally, we know
           that we'll need at least one byte of that original dest word.) */

        ldq_u   t0, 0(a0)       # L :
        negq    t8, t6          # E : build bitmask of bytes <= zero
        mskqh   t1, t4, t1      # U :
        and     t6, t8, t12     # E :

        subq    t12, 1, t6      # E :
        or      t6, t12, t8     # E :
        zapnot  t2, t8, t2      # U : prepare source word; mirror changes
        zapnot  t1, t8, t1      # U : to source validity mask

        andnot  t0, t2, t0      # E : zero place for source to reside
        or      t0, t1, t0      # E : and put it there
        stq_u   t0, 0(a0)       # L :
        nop

        .align 4
$finish_up:
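        /* t12 still holds the single-bit byte mask of the last byte
           written.  Testing it against 0xf0 / 0xcc / 0xaa recovers
           bits 2, 1 and 0 of its byte index (as 4, 2 and 1 via
           cmovne); adding those to the aligned address from bic a0, 7
           gives that byte's address, and subtracting the saved start
           (v0) yields the length.  t4 adds one more when the buffer
           filled without a terminating null.  */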
        zapnot  t0, t12, t4     # U : was last byte written null?
        and     t12, 0xf0, t3   # E : binary search for the address of the
        cmovne  t4, 1, t4       # E : Latency=2, extra map slot
        nop                     # E : with cmovne

        and     t12, 0xcc, t2   # E : last byte written
        and     t12, 0xaa, t1   # E :
        cmovne  t3, 4, t3       # E : Latency=2, extra map slot
        nop                     # E : with cmovne

        bic     a0, 7, t0
        cmovne  t2, 2, t2       # E : Latency=2, extra map slot
        nop                     # E : with cmovne
        nop

        cmovne  t1, 1, t1       # E : Latency=2, extra map slot
        nop                     # E : with cmovne
        addq    t0, t3, t0      # E :
        addq    t1, t2, t1      # E :

        addq    t0, t1, t0      # E :
        addq    t0, t4, t0      # add one if we filled the buffer
        subq    t0, v0, v0      # find string length
        ret                     # L0 :

        .align 4
$zerolength:
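        /* a2 was zero on entry: store nothing and return 0.  Faults in
           EX()-protected loads arrive at $exception below with v0
           already holding -EFAULT, so both tails share the ret.  */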
        nop
        nop
        nop
        clr     v0

$exception:
        nop
        nop
        nop
        ret

        .end __strncpy_from_user
