/* Macros for atomic functionality for tile.
   Copyright (C) 2011, 2012
   Free Software Foundation, Inc.
   Contributed by Walter Lee (walt@tilera.com)

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */


/* Provides macros for common atomic functionality.  */

#ifndef _ATOMIC_H_
#define _ATOMIC_H_

#ifdef __tilegx__
/* Atomic instruction macros

   The macros provided by atomic.h simplify access to the TILE-Gx
   architecture's atomic instructions.  The architecture provides a
   variety of atomic instructions, including "exchange", "compare and
   exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and
   "fetch and ADD if greater than or equal to zero".

   No barrier or fence semantics are implied by any of the atomic
   instructions for manipulating memory; you must specify the barriers
   that you wish explicitly, using the provided macros.

   Any integral 32- or 64-bit value can be used as the argument
   to these macros, such as "int", "long long", "unsigned long", etc.
   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
   The "exchange" and "compare and exchange" macros may also take
   pointer values.  We use the pseudo-type "VAL" in the documentation
   to indicate the use of an appropriate type.  */
#else
/* Atomic instruction macros

   The macros provided by atomic.h simplify access to the Tile
   architecture's atomic instructions.  Since the architecture
   supports test-and-set as its only in-silicon atomic operation, many
   of the operations provided by this header are implemented as
   fast-path calls to Linux emulation routines.

   Using the kernel for atomic operations allows userspace to take
   advantage of the kernel's existing atomic-integer support (managed
   by a distributed array of locks).  The kernel provides proper
   ordering among simultaneous atomic operations on different cores,
   and guarantees a process cannot be context-switched part way
   through an atomic operation.  By virtue of sharing the kernel
   atomic implementation, the userspace atomic operations
   are compatible with the atomic methods provided by the kernel's
   futex() syscall API.  Note that these operations never cause Linux
   kernel scheduling, and are in fact invisible to the kernel; they
   simply act as regular function calls but with an elevated privilege
   level.  Note that the kernel's distributed lock array is hashed by
   using only VA bits from the atomic value's address (to avoid the
   performance hit of page table locking and multiple page-table
   lookups to get the PA) and only the VA bits that are below page
   granularity (to properly lock simultaneous accesses to the same
   page mapped at different VAs).  As a result, simultaneous atomic
   operations on values whose addresses are at the same offset on a
   page will contend in the kernel for the same lock array element.

   No barrier or fence semantics are implied by any of the atomic
   instructions for manipulating memory; you must specify the barriers
   that you wish explicitly, using the provided macros.

   Any integral 32- or 64-bit value can be used as the argument
   to these macros, such as "int", "long long", "unsigned long", etc.
   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
   The "exchange" and "compare and exchange" macros may also take
   pointer values.  We use the pseudo-type "VAL" in the documentation
   to indicate the use of an appropriate type.

   The 32-bit routines are implemented using a single kernel fast
   syscall, as is the 64-bit compare-and-exchange.  The other 64-bit
   routines are implemented by looping over the 64-bit
   compare-and-exchange routine, so may be less efficient.  */
#endif

#include <stdint.h>
#include <features.h>
#ifdef __tilegx__
#include <arch/spr_def.h>
#else
#include <asm/unistd.h>
#endif

/* 32-bit integer compare-and-exchange.  */
static __inline __attribute__ ((always_inline))
  int atomic_val_compare_and_exchange_4 (volatile int *mem,
                                         int oldval, int newval)
{
#ifdef __tilegx__
  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
  return __insn_cmpexch4 (mem, newval);
#else
  int result;
  __asm__ __volatile__ ("swint1":"=R00" (result),
                        "=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem),
                        "R01" (oldval), "R02" (newval), "m" (*mem):"r20",
                        "r21", "r22", "r23", "r24", "r25", "r26", "r27",
                        "r28", "r29", "memory");
  return result;
#endif
}

/* 64-bit integer compare-and-exchange.  */
static __inline __attribute__ ((always_inline))
  int64_t atomic_val_compare_and_exchange_8 (volatile int64_t * mem,
                                             int64_t oldval,
                                             int64_t newval)
{
#ifdef __tilegx__
  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
  return __insn_cmpexch (mem, newval);
#else
  unsigned int result_lo, result_hi;
  unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32;
  unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32;
  __asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi),
                        "=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem),
                        "R02" (oldval_lo), "R03" (oldval_hi),
                        "R04" (newval_lo), "R05" (newval_hi),
                        "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
                        "r26", "r27", "r28", "r29", "memory");
  return ((uint64_t) result_hi) << 32 | result_lo;
#endif
}

/* This non-existent symbol is called for sizes other than "4" and "8",
   indicating a bug in the caller.  */
extern int __atomic_error_bad_argument_size (void)
  __attribute__ ((warning ("sizeof atomic argument not 4 or 8")));


#define atomic_val_compare_and_exchange(mem, o, n)                 \
  ({                                                               \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                    \
      ((sizeof(*(mem)) == 8) ?                                     \
       atomic_val_compare_and_exchange_8(                          \
         (volatile int64_t*)(mem), (__typeof((o)-(o)))(o),         \
         (__typeof((n)-(n)))(n)) :                                 \
       (sizeof(*(mem)) == 4) ?                                     \
       atomic_val_compare_and_exchange_4(                          \
         (volatile int*)(mem), (__typeof((o)-(o)))(o),             \
         (__typeof((n)-(n)))(n)) :                                 \
       __atomic_error_bad_argument_size());                        \
  })

#define atomic_bool_compare_and_exchange(mem, o, n)                \
  ({                                                               \
    __typeof(o) __o = (o);                                         \
    __builtin_expect(                                              \
      __o == atomic_val_compare_and_exchange((mem), __o, (n)), 1); \
  })
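
/* For example, the value- and boolean-returning forms can be combined
   into the usual compare-and-exchange retry loop.  The sketch below is
   purely illustrative ("example_counter" and "example_bounded_add" are
   hypothetical names, and atomic_access_once() is defined later in this
   header):

     static volatile int example_counter;

     static __inline int
     example_bounded_add (int incr, int limit)
     {
       int oldval, newval;
       do
         {
           oldval = atomic_access_once (example_counter);
           newval = oldval + incr;
           if (newval > limit)
             newval = limit;
         }
       while (!atomic_bool_compare_and_exchange (&example_counter,
                                                 oldval, newval));
       return oldval;
     }
*/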

/* Loop with compare_and_exchange until we guess the correct value.
   Normally "expr" will be an expression using __old and __value.  */
#define __atomic_update_cmpxchg(mem, value, expr)                    \
  ({                                                                 \
    __typeof(value) __value = (value);                               \
    __typeof(*(mem)) *__mem = (mem), __old = *__mem, __guess;        \
    do {                                                             \
      __guess = __old;                                               \
      __old = atomic_val_compare_and_exchange(__mem, __old, (expr)); \
    } while (__builtin_expect(__old != __guess, 0));                 \
    __old;                                                           \
  })
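
/* For instance, an atomic "store the maximum" operation could be built
   on this helper; the "atomic_max" macro below is only an illustration
   and is not defined by this header:

     #define atomic_max(mem, value)                          \
       __atomic_update_cmpxchg((mem), (value),               \
                               (__old > __value ? __old : __value))
*/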

#ifdef __tilegx__

/* Generic atomic op with 8- or 4-byte variant.
   The _mask, _addend, and _expr arguments are ignored on tilegx.  */
#define __atomic_update(mem, value, op, _mask, _addend, _expr)        \
  ({                                                                  \
    ((__typeof(*(mem)))                                               \
     ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op(   \
        (void *)(mem), (int64_t)(__typeof((value)-(value)))(value)) : \
      (sizeof(*(mem)) == 4) ? (int)__insn_##op##4(                    \
        (void *)(mem), (int32_t)(__typeof((value)-(value)))(value)) : \
      __atomic_error_bad_argument_size()));                           \
  })

#else

/* This uses TILEPro's fast syscall support to atomically compute:

   int old = *ptr;
   *ptr = (old & mask) + addend;
   return old;

   This primitive can be used for atomic exchange, add, or, and.
   Only 32-bit support is provided.  */
static __inline __attribute__ ((always_inline))
  int
__atomic_update_4 (volatile int *mem, int mask, int addend)
{
  int result;
  __asm__ __volatile__ ("swint1":"=R00" (result),
                        "=m" (*mem):"R10" (__NR_FAST_atomic_update),
                        "R00" (mem), "R01" (mask), "R02" (addend),
                        "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
                        "r26", "r27", "r28", "r29", "memory");
  return result;
}

/* Generic atomic op with 8- or 4-byte variant.
   The _op argument is ignored on tilepro.  */
#define __atomic_update(mem, value, _op, mask, addend, expr)                    \
  ({                                                                            \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                                 \
      ((sizeof(*(mem)) == 8) ?                                                  \
       __atomic_update_cmpxchg((mem), (value), (expr)) :                        \
       (sizeof(*(mem)) == 4) ?                                                  \
       __atomic_update_4((volatile int*)(mem), (__typeof((mask)-(mask)))(mask), \
                         (__typeof((addend)-(addend)))(addend)) :               \
       __atomic_error_bad_argument_size());                                     \
  })

#endif /* __tilegx__ */

#define atomic_exchange(mem, newvalue) \
  __atomic_update(mem, newvalue, exch, 0, newvalue, __value)

#define atomic_add(mem, value) \
  __atomic_update(mem, value, fetchadd, -1, value, __old + __value)

#define atomic_sub(mem, value) atomic_add((mem), -(value))

#define atomic_increment(mem) atomic_add((mem), 1)

#define atomic_decrement(mem) atomic_add((mem), -1)

#define atomic_and(mem, mask) \
  __atomic_update(mem, mask, fetchand, mask, 0, __old & __value)

#define atomic_or(mem, mask) \
  __atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value)

#define atomic_bit_set(mem, bit)                              \
  ({                                                          \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);   \
    __mask & atomic_or((mem), __mask);                        \
  })

#define atomic_bit_clear(mem, bit)                            \
  ({                                                          \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);   \
    __mask & atomic_and((mem), ~__mask);                      \
  })
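
/* Each of the update macros above evaluates to the value that was in
   memory immediately before the update (the "fetch" part of fetch-and-add
   and friends), while atomic_bit_set() and atomic_bit_clear() evaluate to
   nonzero if the bit was previously set.  For example, this hypothetical
   helper (not part of the header) reports whether it performed the very
   first increment of a counter:

     static volatile int example_counter;

     static __inline int
     example_first_increment (void)
     {
       return atomic_add (&example_counter, 1) == 0;
     }
*/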

#ifdef __tilegx__
/* Atomically store a new value to memory.
   Note that you can freely use types of any size here, unlike the
   other atomic routines, which require 32- or 64-bit types.
   This accessor is provided for compatibility with TILEPro, which
   required an explicit atomic operation for stores that needed
   to be atomic with respect to other atomic methods in this header.  */
#define atomic_write(mem, value) ((void) (*(mem) = (value)))
#else
#define atomic_write(mem, value)                                            \
  do {                                                                      \
    __typeof(mem) __aw_mem = (mem);                                         \
    __typeof(value) __aw_val = (value);                                     \
    unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \
    __aw_intval = (__typeof((value) - (value)))__aw_val;                    \
    switch (sizeof(*__aw_mem)) {                                            \
    case 8:                                                                 \
      __atomic_update_cmpxchg(__aw_mem, __aw_val, __value);                 \
      break;                                                                \
    case 4:                                                                 \
      __atomic_update_4((int *)__aw_mem, 0, __aw_intval);                   \
      break;                                                                \
    case 2:                                                                 \
      __aw_off = 8 * ((long)__aw_mem & 0x2);                                \
      __aw_mask = 0xffffU << __aw_off;                                      \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2);                 \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;                   \
      __atomic_update_cmpxchg(__aw_mem32, __aw_val32,                       \
                              (__old & ~__aw_mask) | __value);              \
      break;                                                                \
    case 1:                                                                 \
      __aw_off = 8 * ((long)__aw_mem & 0x3);                                \
      __aw_mask = 0xffU << __aw_off;                                        \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3);                 \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;                   \
      __atomic_update_cmpxchg(__aw_mem32, __aw_val32,                       \
                              (__old & ~__aw_mask) | __value);              \
      break;                                                                \
    }                                                                       \
  } while (0)
#endif

/* Compiler barrier.

   This macro prevents loads or stores from being moved by the compiler
   across the macro.  Any loaded value that was loaded before this
   macro must then be reloaded by the compiler.  */
#define atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory")

/* Full memory barrier.

   This macro has the semantics of atomic_compiler_barrier(), but also
   ensures that previous stores are visible to other cores, and that
   all previous loaded values have been placed into their target
   register on this core.  */
#define atomic_full_barrier() __insn_mf()

/* Read memory barrier.

   Ensure that all reads by this processor that occurred prior to the
   read memory barrier have completed, and that no reads that occur
   after the read memory barrier on this processor are initiated
   before the barrier.

   On current TILE chips a read barrier is implemented as a full barrier,
   but this may not be true in later versions of the architecture.

   See also atomic_acquire_barrier() for the appropriate idiom to use
   to ensure no reads are lifted above an atomic lock instruction.  */
#define atomic_read_barrier() atomic_full_barrier()

/* Write memory barrier.

   Ensure that all writes by this processor that occurred prior to the
   write memory barrier have completed, and that no writes that occur
   after the write memory barrier on this processor are initiated
   before the barrier.

   On current TILE chips a write barrier is implemented as a full barrier,
   but this may not be true in later versions of the architecture.

   See also atomic_release_barrier() for the appropriate idiom to use
   to ensure all writes are complete prior to an atomic unlock instruction.  */
#define atomic_write_barrier() atomic_full_barrier()

/* Lock acquisition barrier.

   Ensure that no load operations that follow this macro in the
   program can issue prior to the barrier.  Without such a barrier,
   the compiler can reorder them to issue earlier, or the hardware can
   issue them speculatively.  The latter is not currently done in the
   Tile microarchitecture, but using this operation improves
   portability to future implementations.

   This operation is intended to be used as part of the "acquire"
   path for locking, that is, when entering a critical section.
   This should be done after the atomic operation that actually
   acquires the lock, and in conjunction with a "control dependency"
   that checks the atomic operation result to see if the lock was
   in fact acquired.  See the atomic_read_barrier() macro
   for a heavier-weight barrier to use in certain unusual constructs,
   or atomic_acquire_barrier_value() if no control dependency exists.  */
#define atomic_acquire_barrier() atomic_compiler_barrier()

/* Lock release barrier.

   Ensure that no store operations that precede this macro in the
   program complete subsequent to the barrier.  Without such a
   barrier, the compiler can reorder stores to issue later, or stores
   can be still outstanding in the memory network.

   This operation is intended to be used as part of the "release" path
   for locking, that is, when leaving a critical section.  This should
   be done before the operation (such as a store of zero) that
   actually releases the lock.  */
#define atomic_release_barrier() atomic_write_barrier()
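
/* Taken together, these barriers support the locking idiom described
   above.  A minimal sketch, assuming a hypothetical lock word
   "example_lock" in which zero means "unlocked" and nonzero means
   "locked" (these helpers are not part of the header):

     static volatile int example_lock;

     static __inline void
     example_acquire (void)
     {
       while (atomic_exchange (&example_lock, 1) != 0)
         ;
       atomic_acquire_barrier ();
     }

     static __inline void
     example_release (void)
     {
       atomic_release_barrier ();
       atomic_write (&example_lock, 0);
     }

   The acquire barrier comes after the atomic operation that takes the
   lock (whose result the while loop checks), and the release barrier
   comes before the store that gives the lock up.  */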

/* Barrier until the read of a particular value is complete.

   This is occasionally useful when constructing certain locking
   scenarios.  For example, you might write a routine that issues an
   atomic instruction to enter a critical section, then reads one or
   more values within the critical section without checking to see if
   the critical section was in fact acquired, and only later checks
   the atomic instruction result to see if the lock was acquired.  If
   so, the routine could properly release the lock and know that the
   values that were read were valid.

   In this scenario, it is required to wait for the result of the
   atomic instruction, even if the value itself is not checked.  This
   guarantees that if the atomic instruction succeeded in taking the lock,
   the lock was held before any reads in the critical section issued.  */
#define atomic_acquire_barrier_value(val) \
  __asm__ __volatile__("move %0, %0" :: "r"(val))

/* Access the given variable in memory exactly once.

   In some contexts, an algorithm may need to force access to memory,
   since otherwise the compiler may think it can optimize away a
   memory load or store; for example, in a loop when polling memory to
   see if another cpu has updated it yet.  Generally this is only
   required for certain very carefully hand-tuned algorithms; using it
   unnecessarily may result in performance losses.

   A related use of this macro is to ensure that the compiler does not
   rematerialize the value of "x" by reloading it from memory
   unexpectedly; the "volatile" marking will prevent the compiler from
   being able to rematerialize.  This is helpful if an algorithm needs
   to read a variable without locking, but needs it to have the same
   value if it ends up being used several times within the algorithm.

   Note that multiple uses of this macro are guaranteed to be ordered,
   i.e., the compiler will not reorder stores or loads that are wrapped
   in atomic_access_once().  */
#define atomic_access_once(x) (*(volatile __typeof(x) *)&(x))
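
/* A typical use is a polling loop that waits for another cpu to publish
   a flag; a purely illustrative sketch ("example_ready" and
   "example_wait_until_ready" are hypothetical names):

     extern int example_ready;

     static __inline void
     example_wait_until_ready (void)
     {
       while (atomic_access_once (example_ready) == 0)
         ;
       atomic_read_barrier ();
     }

   The read barrier after the loop keeps later loads of the published
   data from being initiated before the flag is observed.  */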

#endif /* !_ATOMIC_H_ */