URL https://opencores.org/ocsvn/or1k_soc_on_altera_embedded_dev_kit/or1k_soc_on_altera_embedded_dev_kit/trunk

Subversion Repositories or1k_soc_on_altera_embedded_dev_kit

[/] [or1k_soc_on_altera_embedded_dev_kit/] [trunk/] [linux-2.6/] [linux-2.6.24/] [arch/] [alpha/] [lib/] [ev6-clear_user.S] - Blame information for rev 17

Go to most recent revision | Details | Compare with Previous | View Log


/*
 * arch/alpha/lib/ev6-clear_user.S
 * 21264 version contributed by Rick Gorton 
 *
 * Zero user space, handling exceptions as we go.
 *
 * We have to make sure that $0 is always up-to-date and contains the
 * right "bytes left to zero" value (and that it is updated only _after_
 * a successful store).  There is also some rather minor exception setup
 * stuff.
 *
 * NOTE! This is not directly C-callable, because the calling semantics
 * are different:
 *
 * Inputs:
 *      length in $0
 *      destination address in $6
 *      exception pointer in $7
 *      return address in $28 (exceptions expect it there)
 *
 * Outputs:
 *      bytes left to zero in $0
 *
 * Clobbers:
 *      $1,$2,$3,$4,$5,$6
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *      Compiler Writer's Guide for the Alpha 21264
 *      abbreviated as 'CWG' in other comments here
 *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *      E       - either cluster
 *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
 * From perusing the source code context where this routine is called, it is
 * a fair assumption that significant fractions of entire pages are zeroed, so
 * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
 * ASSUMPTION:
 *      The believed purpose of only updating $0 after a store is that a signal
 *      may come along during the execution of this chunk of code, and we don't
 *      want to leave a hole (and we also want to avoid repeating lots of work)
 */
 
/*
 * Allow an exception for an insn; exit if we get one.
 *
 * EX(insn) emits `insn' at the local label 99 and then records one entry
 * in the __ex_table section for it.  The entry consists of:
 *   - a .long holding the offset of the instruction relative to the
 *     entry itself (99b - .)
 *   - an `lda $31, disp($31)' word whose displacement is the distance
 *     from the instruction to the $exception label ($exception - 99b)
 * .previous then switches back to the section that was active before.
 * The variadic y... lets instructions whose operands contain commas be
 * passed through the macro unchanged.
 * NOTE(review): presumably the kernel fault handler decodes this entry
 * and resumes execution at $exception - confirm against the exception
 * table lookup code, which is not part of this file.
 */
#define EX(x,y...)                      \
        99: x,##y;                      \
        .section __ex_table,"a";        \
        .long 99b - .;                  \
        lda $31, $exception-99b($31);   \
        .previous
 
        .set noat
        .set noreorder
        .align 4
 
/*
 * __do_clear_user: store zeros to the $0 bytes starting at address $6.
 *
 * Register roles, as used by the code below:
 *      $0  - bytes still to be zeroed; only ever decremented after the
 *            store(s) it accounts for, and left in place as the result
 *      $1  - quadwords (8-byte units) still to be stored
 *      $2  - scratch: $6 & 0x3f while setting up the 64-byte alignment
 *      $3  - negative bias counter while aligning to 64 bytes, then the
 *            wh64 target address inside the big loop
 *      $4  - head misalignment ($6 & 7), later the "$1 - 16" loop test
 *      $5  - scratch: merged head quadword, wh64 look-ahead test
 *      $6  - current destination pointer
 *      $28 - return address (used by the final ret)
 *
 * Every load and store of the destination is wrapped in EX() (the wh64
 * hint is not).  The $exception label shares the normal return path, so
 * whatever value $0 holds when control reaches it is what the caller
 * gets back.
 *
 * Flow: unaligned head -> quadwords up to a 64-byte boundary ->
 * unrolled 64-byte blocks with wh64 -> remaining quadwords ->
 * remaining bytes.
 *
 * The instructions are laid out in blank-line-separated groups of four,
 * padded with nops, matching the slotting annotations in the comments.
 * NOTE(review): presumably these are the 21264 fetch groups - keep the
 * grouping intact when editing.
 */
        .globl __do_clear_user
        .ent __do_clear_user
        .frame  $30, 0, $28
        .prologue 0
 
                                # Pipeline info : Slotting & Comments
__do_clear_user:
        and     $6, 7, $4       # .. E  .. ..   : find dest head misalignment
        beq     $0, $zerolength # U  .. .. ..   :  U L U L
 
        addq    $0, $4, $1      # .. .. .. E    : bias counter
        and     $1, 7, $2       # .. .. E  ..   : number of misaligned bytes in tail
# Note - $2 is overwritten at $headalign before it is ever read, so this
# is a moot computation and we can rewrite this later...
        srl     $1, 3, $1       # .. E  .. ..   : number of quadwords to clear
        beq     $4, $headalign  # U  .. .. ..   : U L U L
 
/*
 * Head is not aligned.  Write (8 - $4) bytes to head of destination
 * This means $6 is known to be misaligned
 * If the biased quadword count $1 is zero, the whole request lies inside
 * this one quadword, so it is handed to the byte loop instead.
 */
        EX( ldq_u $5, 0($6) )   # .. .. .. L    : load dst word to mask back in
        beq     $1, $onebyte    # .. .. U  ..   : sub-word store?
        mskql   $5, $6, $5      # .. U  .. ..   : take care of misaligned head
        addq    $6, 8, $6       # E  .. .. ..   : L U U L
 
        EX( stq_u $5, -8($6) )  # .. .. .. L    :
        subq    $1, 1, $1       # .. .. E  ..   :
        addq    $0, $4, $0      # .. E  .. ..   : bytes left -= 8 - misalignment
        subq    $0, 8, $0       # E  .. .. ..   : U L U L
 
        .align  4
/*
 * (The .align directive ought to be a moot point)
 * values upon initial entry to the loop
 * $1 is number of quadwords to clear (zero is a valid value)
 * $2 is number of trailing bytes (0..7) ($2 never used...)
 * $6 is known to be aligned 0mod8
 */
$headalign:
        subq    $1, 16, $4      # .. .. .. E    : If < 16, we can not use the huge loop
        and     $6, 0x3f, $2    # .. .. E  ..   : Forward work for huge loop
        subq    $2, 0x40, $3    # .. E  .. ..   : bias counter (huge loop)
        blt     $4, $trailquad  # U  .. .. ..   : U L U L
 
/*
 * We know that we're going to do at least 16 quads, which means we are
 * going to be able to use the large block clear loop at least once.
 * Figure out how many quads we need to clear before we are 0mod64 aligned
 * so we can use the wh64 instruction.
 * $3 = ($6 & 0x3f) - 0x40 is the negated byte distance to the next
 * 64-byte boundary; the loop below adds 8 per quad until it reaches zero.
 */
 
        nop                     # .. .. .. E
        nop                     # .. .. E  ..
        nop                     # .. E  .. ..
        beq     $3, $bigalign   # U  .. .. ..   : U L U L : Aligned 0mod64
 
$alignmod64:
        EX( stq_u $31, 0($6) )  # .. .. .. L
        addq    $3, 8, $3       # .. .. E  ..
        subq    $0, 8, $0       # .. E  .. ..
        nop                     # E  .. .. ..   : U L U L
 
        nop                     # .. .. .. E
        subq    $1, 1, $1       # .. .. E  ..
        addq    $6, 8, $6       # .. E  .. ..
        blt     $3, $alignmod64 # U  .. .. ..   : U L U L
 
$bigalign:
/*
 * $0 is the number of bytes left
 * $1 is the number of quads left
 * $6 is aligned 0mod64
 * we know that we'll be taking a minimum of one trip through
 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 * We are _not_ going to update $0 after every single store.  That
 * would be silly, because there will be cross-cluster dependencies
 * no matter how the code is scheduled.  By doing it in slightly
 * staggered fashion, we can still do this loop in 5 fetches
 * The worst case will be doing two extra quads in some future execution,
 * in the event of an interrupted clear.
 * Assumes the wh64 needs to be for 2 trips through the loop in the future
 * The wh64 is issued for the starting destination address for trip +2
 * through the loop, and if there are less than two trips left, the target
 * address will be for the current trip.
 *
 * Mechanics: each trip computes $3 = $6 + 128 from its own base address,
 * and that value is consumed by the wh64 at the top of the following
 * trip.  The cmovlt replaces it with the current trip's $6 when fewer
 * than 192 bytes remained at the start of the trip.
 * $4 = $1 - 16 is taken at the top of the trip, so the loop repeats only
 * while at least 8 quads will still remain after this trip's 8 stores.
 */
        nop                     # E :
        nop                     # E :
        nop                     # E :
        bis     $6,$6,$3        # E : U L U L : Initial wh64 address is dest
        /* This might actually help for the current trip... */
 
$do_wh64:
        wh64    ($3)            # .. .. .. L1   : memory subsystem hint
        subq    $1, 16, $4      # .. .. E  ..   : Forward calculation - repeat the loop?
        EX( stq_u $31, 0($6) )  # .. L  .. ..
        subq    $0, 8, $0       # E  .. .. ..   : U L U L
 
        addq    $6, 128, $3     # E : Target address of wh64
        EX( stq_u $31, 8($6) )  # L :
        EX( stq_u $31, 16($6) ) # L :
        subq    $0, 16, $0      # E : U L L U
 
        nop                     # E :
        EX( stq_u $31, 24($6) ) # L :
        EX( stq_u $31, 32($6) ) # L :
        subq    $0, 168, $5     # E : U L L U : two trips through the loop left?
        /* 168 = 192 - 24, since we've already completed some stores */
 
        subq    $0, 16, $0      # E :
        EX( stq_u $31, 40($6) ) # L :
        EX( stq_u $31, 48($6) ) # L :
        cmovlt  $5, $6, $3      # E : U L L U : Latency 2, extra mapping cycle
 
        subq    $1, 8, $1       # E :
        subq    $0, 16, $0      # E :
        EX( stq_u $31, 56($6) ) # L :
        nop                     # E : U L U L
 
        nop                     # E :
        subq    $0, 8, $0       # E :
        addq    $6, 64, $6      # E :
        bge     $4, $do_wh64    # U : U L U L
 
$trailquad:
        # fewer than 16 quadwords left to store, plus any trailing bytes
        # $1 is the number of quadwords left to go.
        #
        nop                     # .. .. .. E
        nop                     # .. .. E  ..
        nop                     # .. E  .. ..
        beq     $1, $trailbytes # U  .. .. ..   : U L U L : Only 0..7 bytes to go
 
$onequad:
        EX( stq_u $31, 0($6) )  # .. .. .. L
        subq    $1, 1, $1       # .. .. E  ..
        subq    $0, 8, $0       # .. E  .. ..
        nop                     # E  .. .. ..   : U L U L
 
        nop                     # .. .. .. E
        nop                     # .. .. E  ..
        addq    $6, 8, $6       # .. E  .. ..
        bgt     $1, $onequad    # U  .. .. ..   : U L U L
 
        # All whole quadwords are done; $0 now holds the 0..7 tail bytes.
$trailbytes:
        nop                     # .. .. .. E
        nop                     # .. .. E  ..
        nop                     # .. E  .. ..
        beq     $0, $zerolength # U  .. .. ..   : U L U L
 
        # $0 contains the number of bytes left to zero (1..7 here)
        # so we will use $0 as the loop counter
        # We know for a fact that $0 > 0 due to previous context
        # (also entered directly from the unaligned-head path when the
        # whole request fits inside a single quadword)
$onebyte:
        EX( stb $31, 0($6) )    # .. .. .. L
        subq    $0, 1, $0       # .. .. E  ..   :
        addq    $6, 1, $6       # .. E  .. ..   :
        bgt     $0, $onebyte    # U  .. .. ..   : U L U L
 
        # Common exit: normal completion and the EX() exception target
        # both return here with the residual byte count still in $0.
$zerolength:
$exception:                     # Destination for exception recovery(?)
        nop                     # .. .. .. E    :
        nop                     # .. .. E  ..   :
        nop                     # .. E  .. ..   :
        ret     $31, ($28), 1   # L0 .. .. ..   : L U L U
        .end __do_clear_user
 

Browse

Tools

Subversion Repositories or1k_soc_on_altera_embedded_dev_kit

[/] [or1k_soc_on_altera_embedded_dev_kit/] [trunk/] [linux-2.6/] [linux-2.6.24/] [arch/] [alpha/] [lib/] [ev6-clear_user.S] - Blame information for rev 17

Line No.	Rev	Author	Line
1	3	xianfeng	`/*`
2			`* arch/alpha/lib/ev6-clear_user.S`
3			`* 21264 version contributed by Rick Gorton`
4			`*`
5			`* Zero user space, handling exceptions as we go.`
6			`*`
7			`* We have to make sure that $0 is always up-to-date and contains the`
8			`* right "bytes left to zero" value (and that it is updated only _after_`
9			`* a successful copy). There is also some rather minor exception setup`
10			`* stuff.`
11			`*`
12			`* NOTE! This is not directly C-callable, because the calling semantics`
13			`* are different:`
14			`*`
15			`* Inputs:`
16			`* length in $0`
17			`* destination address in $6`
18			`* exception pointer in $7`
19			`* return address in $28 (exceptions expect it there)`
20			`*`
21			`* Outputs:`
22			`* bytes left to copy in $0`
23			`*`
24			`* Clobbers:`
25			`* $1,$2,$3,$4,$5,$6`
26			`*`
27			`* Much of the information about 21264 scheduling/coding comes from:`
28			`* Compiler Writer's Guide for the Alpha 21264`
29			`* abbreviated as 'CWG' in other comments here`
30			`* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html`
31			`* Scheduling notation:`
32			`* E - either cluster`
33			`* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1`
34			`* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1`
35			`* Try not to change the actual algorithm if possible for consistency.`
36			`* Determining actual stalls (other than slotting) doesn't appear to be easy to do.`
37			`* From perusing the source code context where this routine is called, it is`
38			`* a fair assumption that significant fractions of entire pages are zeroed, so`
39			`* it's going to be worth the effort to hand-unroll a big loop, and use wh64.`
40			`* ASSUMPTION:`
41			`* The believed purpose of only updating $0 after a store is that a signal`
42			`* may come along during the execution of this chunk of code, and we don't`
43			`* want to leave a hole (and we also want to avoid repeating lots of work)`
44			`*/`
45
46			`/* Allow an exception for an insn; exit if we get one. */`
47			`#define EX(x,y...) \`
48			`99: x,##y; \`
49			`.section __ex_table,"a"; \`
50			`.long 99b - .; \`
51			`lda $31, $exception-99b($31); \`
52			`.previous`
53
54			`.set noat`
55			`.set noreorder`
56			`.align 4`
57
58			`.globl __do_clear_user`
59			`.ent __do_clear_user`
60			`.frame $30, 0, $28`
61			`.prologue 0`
62
63			`# Pipeline info : Slotting & Comments`
64			`__do_clear_user:`
65			`and $6, 7, $4 # .. E .. .. : find dest head misalignment`
66			`beq $0, $zerolength # U .. .. .. : U L U L`
67
68			`addq $0, $4, $1 # .. .. .. E : bias counter`
69			`and $1, 7, $2 # .. .. E .. : number of misaligned bytes in tail`
70			`# Note - we never actually use $2, so this is a moot computation`
71			`# and we can rewrite this later...`
72			`srl $1, 3, $1 # .. E .. .. : number of quadwords to clear`
73			`beq $4, $headalign # U .. .. .. : U L U L`
74
75			`/*`
76			`* Head is not aligned. Write (8 - $4) bytes to head of destination`
77			`* This means $6 is known to be misaligned`
78			`*/`
79			`EX( ldq_u $5, 0($6) ) # .. .. .. L : load dst word to mask back in`
80			`beq $1, $onebyte # .. .. U .. : sub-word store?`
81			`mskql $5, $6, $5 # .. U .. .. : take care of misaligned head`
82			`addq $6, 8, $6 # E .. .. .. : L U U L`
83
84			`EX( stq_u $5, -8($6) ) # .. .. .. L :`
85			`subq $1, 1, $1 # .. .. E .. :`
86			`addq $0, $4, $0 # .. E .. .. : bytes left -= 8 - misalignment`
87			`subq $0, 8, $0 # E .. .. .. : U L U L`
88
89			`.align 4`
90			`/*`
91			`* (The .align directive ought to be a moot point)`
92			`* values upon initial entry to the loop`
93			`* $1 is number of quadwords to clear (zero is a valid value)`
94			`* $2 is number of trailing bytes (0..7) ($2 never used...)`
95			`* $6 is known to be aligned 0mod8`
96			`*/`
97			`$headalign:`
98			`subq $1, 16, $4 # .. .. .. E : If < 16, we can not use the huge loop`
99			`and $6, 0x3f, $2 # .. .. E .. : Forward work for huge loop`
100			`subq $2, 0x40, $3 # .. E .. .. : bias counter (huge loop)`
101			`blt $4, $trailquad # U .. .. .. : U L U L`
102
103			`/*`
104			`* We know that we're going to do at least 16 quads, which means we are`
105			`* going to be able to use the large block clear loop at least once.`
106			`* Figure out how many quads we need to clear before we are 0mod64 aligned`
107			`* so we can use the wh64 instruction.`
108			`*/`
109
110			`nop # .. .. .. E`
111			`nop # .. .. E ..`
112			`nop # .. E .. ..`
113			`beq $3, $bigalign # U .. .. .. : U L U L : Aligned 0mod64`
114
115			`$alignmod64:`
116			`EX( stq_u $31, 0($6) ) # .. .. .. L`
117			`addq $3, 8, $3 # .. .. E ..`
118			`subq $0, 8, $0 # .. E .. ..`
119			`nop # E .. .. .. : U L U L`
120
121			`nop # .. .. .. E`
122			`subq $1, 1, $1 # .. .. E ..`
123			`addq $6, 8, $6 # .. E .. ..`
124			`blt $3, $alignmod64 # U .. .. .. : U L U L`
125
126			`$bigalign:`
127			`/*`
128			`* $0 is the number of bytes left`
129			`* $1 is the number of quads left`
130			`* $6 is aligned 0mod64`
131			`* we know that we'll be taking a minimum of one trip through`
132			`* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle`
133			`* We are _not_ going to update $0 after every single store. That`
134			`* would be silly, because there will be cross-cluster dependencies`
135			`* no matter how the code is scheduled. By doing it in slightly`
136			`* staggered fashion, we can still do this loop in 5 fetches`
137			`* The worse case will be doing two extra quads in some future execution,`
138			`* in the event of an interrupted clear.`
139			`* Assumes the wh64 needs to be for 2 trips through the loop in the future`
140			`* The wh64 is issued on for the starting destination address for trip +2`
141			`* through the loop, and if there are less than two trips left, the target`
142			`* address will be for the current trip.`
143			`*/`
144			`nop # E :`
145			`nop # E :`
146			`nop # E :`
147			`bis $6,$6,$3 # E : U L U L : Initial wh64 address is dest`
148			`/* This might actually help for the current trip... */`
149
150			`$do_wh64:`
151			`wh64 ($3) # .. .. .. L1 : memory subsystem hint`
152			`subq $1, 16, $4 # .. .. E .. : Forward calculation - repeat the loop?`
153			`EX( stq_u $31, 0($6) ) # .. L .. ..`
154			`subq $0, 8, $0 # E .. .. .. : U L U L`
155
156			`addq $6, 128, $3 # E : Target address of wh64`
157			`EX( stq_u $31, 8($6) ) # L :`
158			`EX( stq_u $31, 16($6) ) # L :`
159			`subq $0, 16, $0 # E : U L L U`
160
161			`nop # E :`
162			`EX( stq_u $31, 24($6) ) # L :`
163			`EX( stq_u $31, 32($6) ) # L :`
164			`subq $0, 168, $5 # E : U L L U : two trips through the loop left?`
165			`/* 168 = 192 - 24, since we've already completed some stores */`
166
167			`subq $0, 16, $0 # E :`
168			`EX( stq_u $31, 40($6) ) # L :`
169			`EX( stq_u $31, 48($6) ) # L :`
170			`cmovlt $5, $6, $3 # E : U L L U : Latency 2, extra mapping cycle`
171
172			`subq $1, 8, $1 # E :`
173			`subq $0, 16, $0 # E :`
174			`EX( stq_u $31, 56($6) ) # L :`
175			`nop # E : U L U L`
176
177			`nop # E :`
178			`subq $0, 8, $0 # E :`
179			`addq $6, 64, $6 # E :`
180			`bge $4, $do_wh64 # U : U L U L`
181
182			`$trailquad:`
183			`# zero to 16 quadwords left to store, plus any trailing bytes`
184			`# $1 is the number of quadwords left to go.`
185			`#`
186			`nop # .. .. .. E`
187			`nop # .. .. E ..`
188			`nop # .. E .. ..`
189			`beq $1, $trailbytes # U .. .. .. : U L U L : Only 0..7 bytes to go`
190
191			`$onequad:`
192			`EX( stq_u $31, 0($6) ) # .. .. .. L`
193			`subq $1, 1, $1 # .. .. E ..`
194			`subq $0, 8, $0 # .. E .. ..`
195			`nop # E .. .. .. : U L U L`
196
197			`nop # .. .. .. E`
198			`nop # .. .. E ..`
199			`addq $6, 8, $6 # .. E .. ..`
200			`bgt $1, $onequad # U .. .. .. : U L U L`
201
202			`# We have an unknown number of bytes left to go.`
203			`$trailbytes:`
204			`nop # .. .. .. E`
205			`nop # .. .. E ..`
206			`nop # .. E .. ..`
207			`beq $0, $zerolength # U .. .. .. : U L U L`
208
209			`# $0 contains the number of bytes left to copy (0..31)`
210			`# so we will use $0 as the loop counter`
211			`# We know for a fact that $0 > 0 zero due to previous context`
212			`$onebyte:`
213			`EX( stb $31, 0($6) ) # .. .. .. L`
214			`subq $0, 1, $0 # .. .. E .. :`
215			`addq $6, 1, $6 # .. E .. .. :`
216			`bgt $0, $onebyte # U .. .. .. : U L U L`
217
218			`$zerolength:`
219			`$exception: # Destination for exception recovery(?)`
220			`nop # .. .. .. E :`
221			`nop # .. .. E .. :`
222			`nop # .. E .. .. :`
223			`ret $31, ($28), 1 # L0 .. .. .. : L U L U`
224			`.end __do_clear_user`
225