1 |
1275 |
phoenix |
/*
|
2 |
|
|
* Copyright (C) 1996 David S. Miller (dm@engr.sgi.com)
|
3 |
|
|
* Copyright (C) 1997, 2001 Ralf Baechle (ralf@gnu.org)
|
4 |
|
|
* Copyright (C) 2000 SiByte, Inc.
|
5 |
|
|
* Copyright (C) 2002, 2003 Broadcom Corporation
|
6 |
|
|
*
|
7 |
|
|
* Written by Justin Carlson of SiByte, Inc.
|
8 |
|
|
* and Kip Walker of Broadcom Corp.
|
9 |
|
|
*
|
10 |
|
|
*
|
11 |
|
|
* This program is free software; you can redistribute it and/or
|
12 |
|
|
* modify it under the terms of the GNU General Public License
|
13 |
|
|
* as published by the Free Software Foundation; either version 2
|
14 |
|
|
* of the License, or (at your option) any later version.
|
15 |
|
|
*
|
16 |
|
|
* This program is distributed in the hope that it will be useful,
|
17 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19 |
|
|
* GNU General Public License for more details.
|
20 |
|
|
*
|
21 |
|
|
* You should have received a copy of the GNU General Public License
|
22 |
|
|
* along with this program; if not, write to the Free Software
|
23 |
|
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
24 |
|
|
*/
|
25 |
|
|
|
26 |
|
|
#include <linux/config.h>
|
27 |
|
|
#include <linux/sched.h>
|
28 |
|
|
#include <linux/smp.h>
|
29 |
|
|
|
30 |
|
|
#include <asm/io.h>
|
31 |
|
|
#include <asm/sibyte/sb1250.h>
|
32 |
|
|
#include <asm/sibyte/sb1250_regs.h>
|
33 |
|
|
#include <asm/sibyte/sb1250_dma.h>
|
34 |
|
|
#include <asm/sibyte/64bit.h>
|
35 |
|
|
|
36 |
|
|
#ifdef CONFIG_SB1_PASS_1_WORKAROUNDS
|
37 |
|
|
#define SB1_PREF_LOAD_STREAMED_HINT "0"
|
38 |
|
|
#define SB1_PREF_STORE_STREAMED_HINT "1"
|
39 |
|
|
#else
|
40 |
|
|
#define SB1_PREF_LOAD_STREAMED_HINT "4"
|
41 |
|
|
#define SB1_PREF_STORE_STREAMED_HINT "5"
|
42 |
|
|
#endif
|
43 |
|
|
|
44 |
|
|
#ifdef CONFIG_SIBYTE_DMA_PAGEOPS
static inline void clear_page_cpu(void *page)
#else
void clear_page(void *page)
#endif
{
	/*
	 * Zero one page with the CPU, one 32-byte SB1 cacheline (4 x sd)
	 * per loop iteration.  When the data-mover page ops are configured
	 * this is the fallback path (clear_page_cpu); otherwise it IS
	 * clear_page.
	 *
	 * JDCXXX - This should be bottlenecked by the write buffer, but these
	 * things tend to be mildly unpredictable...should check this on the
	 * performance model
	 *
	 * We prefetch 4 lines ahead. We're also "cheating" slightly here...
	 * since we know we're on an SB1, we force the assembler to take
	 * 64-bit operands to speed things up
	 *
	 * Operand map: %0 = cursor (starts at 'page', written back),
	 * %2 = PAGE_SIZE-32 immediate; $1 (at) holds the last-line address,
	 * so the loop ends after the line at 'page + PAGE_SIZE - 32'.
	 */
	__asm__ __volatile__(
	".set push \n"
	".set noreorder \n"
	".set noat \n"			/* we use $1 ($at) as the end pointer */
	".set mips4 \n"			/* allow 64-bit ops (daddiu/sd) */
	" daddiu $1, %0, %2 \n"  /* Calculate the end of the page to clear */
#ifdef CONFIG_CPU_HAS_PREFETCH
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 0(%0) \n"  /* Prefetch the first 4 lines */
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 32(%0) \n"
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 64(%0) \n"
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 96(%0) \n"
#endif
	"1: sd $0, 0(%0) \n"  /* Throw out a cacheline of 0's */
	" sd $0, 8(%0) \n"
	" sd $0, 16(%0) \n"
	" sd $0, 24(%0) \n"
#ifdef CONFIG_CPU_HAS_PREFETCH
	" pref " SB1_PREF_STORE_STREAMED_HINT ",128(%0) \n" /* Prefetch 4 lines ahead */
#endif
	" bne $1, %0, 1b \n"
	" daddiu %0, %0, 32\n" /* Next cacheline (This instruction better be short piped!) */
	".set pop \n"
	: "=r" (page)
	: "0" (page), "I" (PAGE_SIZE-32)
	: "memory");

}
|
86 |
|
|
|
87 |
|
|
#ifdef CONFIG_SIBYTE_DMA_PAGEOPS
static inline void copy_page_cpu(void *to, void *from)
#else
void copy_page(void *to, void *from)
#endif
{
	/*
	 * Copy one page with the CPU, one 32-byte cacheline (8 x lw/sw)
	 * per loop iteration.  When the data-mover page ops are configured
	 * this is the fallback path (copy_page_cpu); otherwise it IS
	 * copy_page.
	 *
	 * This should be optimized in assembly...can't use ld/sd, though,
	 * because the top 32 bits could be nuked if we took an interrupt
	 * during the routine. And this is not a good place to be cli()'ing
	 *
	 * The pref's used here are using "streaming" hints, which cause the
	 * copied data to be kicked out of the cache sooner. A page copy often
	 * ends up copying a lot more data than is commonly used, so this seems
	 * to make sense in terms of reducing cache pollution, but I've no real
	 * performance data to back this up
	 *
	 * NOTE on operands: the matching constraints bind %0 to 'from'
	 * (source, read via lw) and %1 to 'to' (destination, written via
	 * sw) -- i.e. the "=r"(to)/"=r"(from) output names are swapped
	 * relative to the registers, but both are dead after the asm so
	 * only the clobber effect matters.  %4 = PAGE_SIZE-32; $1 ($at)
	 * holds the address of the last source line.
	 */

	__asm__ __volatile__(
	".set push \n"
	".set noreorder \n"
	".set noat \n"			/* $1 ($at) is the end-of-page sentinel */
	".set mips4 \n"			/* allow daddiu on pointers */
	" daddiu $1, %0, %4 \n"  /* Calculate the end of the page to copy */
#ifdef CONFIG_CPU_HAS_PREFETCH
	" pref " SB1_PREF_LOAD_STREAMED_HINT ", 0(%0) \n"  /* Prefetch the first 3 lines */
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 0(%1) \n"
	" pref " SB1_PREF_LOAD_STREAMED_HINT ", 32(%0) \n"
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 32(%1) \n"
	" pref " SB1_PREF_LOAD_STREAMED_HINT ", 64(%0) \n"
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 64(%1) \n"
#endif
	"1: lw $2, 0(%0) \n"  /* Block copy a cacheline */
	" lw $3, 4(%0) \n"
	" lw $4, 8(%0) \n"
	" lw $5, 12(%0) \n"
	" lw $6, 16(%0) \n"
	" lw $7, 20(%0) \n"
	" lw $8, 24(%0) \n"
	" lw $9, 28(%0) \n"
#ifdef CONFIG_CPU_HAS_PREFETCH
	" pref " SB1_PREF_LOAD_STREAMED_HINT ", 96(%0) \n"  /* Prefetch ahead */
	" pref " SB1_PREF_STORE_STREAMED_HINT ", 96(%1) \n"
#endif
	" sw $2, 0(%1) \n"
	" sw $3, 4(%1) \n"
	" sw $4, 8(%1) \n"
	" sw $5, 12(%1) \n"
	" sw $6, 16(%1) \n"
	" sw $7, 20(%1) \n"
	" sw $8, 24(%1) \n"
	" sw $9, 28(%1) \n"
	" daddiu %1, %1, 32 \n" /* Next cacheline */
	" nop \n" /* Force next add to short pipe */
	" nop \n" /* Force next add to short pipe */
	" bne $1, %0, 1b \n"
	" daddiu %0, %0, 32 \n" /* Next cacheline */
	".set pop \n"
	: "=r" (to), "=r" (from)
	: "0" (from), "1" (to), "I" (PAGE_SIZE-32)
	: "$2","$3","$4","$5","$6","$7","$8","$9","memory");
}
|
149 |
|
|
|
150 |
|
|
|
151 |
|
|
#ifdef CONFIG_SIBYTE_DMA_PAGEOPS
|
152 |
|
|
|
153 |
|
|
/*
 * Pad descriptors to cacheline, since each is exclusively owned by a
 * particular CPU.
 */
typedef struct dmadscr_s {
	uint64_t dscr_a;	/* data mover descriptor word A (addr + flags) */
	uint64_t dscr_b;	/* data mover descriptor word B (src/length) */
	uint64_t pad_a;		/* pad the struct out to one 32-byte cacheline */
	uint64_t pad_b;
} dmadscr_t;

/* One single-entry descriptor ring per CPU, cacheline-aligned so CPUs
 * never share a line (avoids false sharing / exclusive-ownership races). */
static dmadscr_t page_descr[NR_CPUS] __attribute__((aligned(SMP_CACHE_BYTES)));
|
165 |
|
|
|
166 |
|
|
void sb1_dma_init(void)
|
167 |
|
|
{
|
168 |
|
|
int cpu = smp_processor_id();
|
169 |
|
|
uint64_t base_val = PHYSADDR(&page_descr[cpu]) | V_DM_DSCR_BASE_RINGSZ(1);
|
170 |
|
|
|
171 |
|
|
out64(base_val,
|
172 |
|
|
IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_BASE));
|
173 |
|
|
out64(base_val | M_DM_DSCR_BASE_RESET,
|
174 |
|
|
IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_BASE));
|
175 |
|
|
out64(base_val | M_DM_DSCR_BASE_ENABL,
|
176 |
|
|
IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_BASE));
|
177 |
|
|
}
|
178 |
|
|
|
179 |
|
|
void clear_page(void *page)
|
180 |
|
|
{
|
181 |
|
|
int cpu = smp_processor_id();
|
182 |
|
|
|
183 |
|
|
/* if the page is above Kseg0, use old way */
|
184 |
|
|
if (KSEGX(page) != K0BASE)
|
185 |
|
|
return clear_page_cpu(page);
|
186 |
|
|
|
187 |
|
|
page_descr[cpu].dscr_a = PHYSADDR(page) | M_DM_DSCRA_ZERO_MEM | M_DM_DSCRA_L2C_DEST | M_DM_DSCRA_INTERRUPT;
|
188 |
|
|
page_descr[cpu].dscr_b = V_DM_DSCRB_SRC_LENGTH(PAGE_SIZE);
|
189 |
|
|
out64(1, IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_COUNT));
|
190 |
|
|
|
191 |
|
|
/*
|
192 |
|
|
* Don't really want to do it this way, but there's no
|
193 |
|
|
* reliable way to delay completion detection.
|
194 |
|
|
*/
|
195 |
|
|
while (!(in64(IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_BASE_DEBUG)) & M_DM_DSCR_BASE_INTERRUPT))
|
196 |
|
|
;
|
197 |
|
|
in64(IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_BASE));
|
198 |
|
|
}
|
199 |
|
|
|
200 |
|
|
void copy_page(void *to, void *from)
|
201 |
|
|
{
|
202 |
|
|
unsigned long from_phys = PHYSADDR(from);
|
203 |
|
|
unsigned long to_phys = PHYSADDR(to);
|
204 |
|
|
int cpu = smp_processor_id();
|
205 |
|
|
|
206 |
|
|
/* if either page is above Kseg0, use old way */
|
207 |
|
|
if ((KSEGX(to) != K0BASE) || (KSEGX(from) != K0BASE))
|
208 |
|
|
return copy_page_cpu(to, from);
|
209 |
|
|
|
210 |
|
|
page_descr[cpu].dscr_a = PHYSADDR(to_phys) | M_DM_DSCRA_L2C_DEST | M_DM_DSCRA_INTERRUPT;
|
211 |
|
|
page_descr[cpu].dscr_b = PHYSADDR(from_phys) | V_DM_DSCRB_SRC_LENGTH(PAGE_SIZE);
|
212 |
|
|
out64(1, IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_COUNT));
|
213 |
|
|
|
214 |
|
|
/*
|
215 |
|
|
* Don't really want to do it this way, but there's no
|
216 |
|
|
* reliable way to delay completion detection.
|
217 |
|
|
*/
|
218 |
|
|
while (!(in64(IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_BASE_DEBUG)) & M_DM_DSCR_BASE_INTERRUPT))
|
219 |
|
|
;
|
220 |
|
|
in64(IO_SPACE_BASE + A_DM_REGISTER(cpu, R_DM_DSCR_BASE));
|
221 |
|
|
}
|
222 |
|
|
|
223 |
|
|
#endif /* CONFIG_SIBYTE_DMA_PAGEOPS */
|