~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/x86/include/asm/xor_32.h

Version: ~ [ linux-5.5-rc7 ] ~ [ linux-5.4.13 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.97 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.166 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.210 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.210 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.81 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-3.9.11 ] ~ [ linux-3.8.13 ] ~ [ linux-3.7.10 ] ~ [ linux-3.6.11 ] ~ [ linux-3.5.7 ] ~ [ linux-3.4.113 ] ~ [ linux-3.3.8 ] ~ [ linux-3.2.102 ] ~ [ linux-3.1.10 ] ~ [ linux-3.0.101 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 #ifndef _ASM_X86_XOR_32_H
  2 #define _ASM_X86_XOR_32_H
  3 
  4 /*
  5  * Optimized RAID-5 checksumming functions for MMX and SSE.
  6  *
  7  * This program is free software; you can redistribute it and/or modify
  8  * it under the terms of the GNU General Public License as published by
  9  * the Free Software Foundation; either version 2, or (at your option)
 10  * any later version.
 11  *
 12  * You should have received a copy of the GNU General Public License
 13  * (for example /usr/src/linux/COPYING); if not, write to the Free
 14  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 15  */
 16 
 17 /*
 18  * High-speed RAID5 checksumming functions utilizing MMX instructions.
 19  * Copyright (C) 1998 Ingo Molnar.
 20  */
 21 
 /*
  * Assembly-fragment builders for the MMX routines below.  Each macro
  * expands to one instruction string; x is a quadword index (scaled by 8
  * to form the byte displacement) and y is the MMX register number.
  *   LD(x, y)  - movq from asm operand %1 into %mmY
  *   ST(x, y)  - movq from %mmY back to asm operand %1
  *   XOn(x, y) - pxor the quadword at asm operand %(n+1) into %mmY
  */
 22 #define LD(x, y)        "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
 23 #define ST(x, y)        "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
 24 #define XO1(x, y)       "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
 25 #define XO2(x, y)       "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
 26 #define XO3(x, y)       "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
 27 #define XO4(x, y)       "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
 28 
 29 #include <asm/i387.h>
 30 
 /*
  * xor_pII_mmx_2 - XOR the buffer at p2 into the buffer at p1 using MMX.
  *
  * bytes: byte count; it is shifted right by 7 to get the number of
  *        128-byte loop iterations, so a remainder below 128 is ignored.
  * p1:    first source and destination (loaded, XORed, stored back).
  * p2:    second source (only read by the asm).
  *
  * Each BLOCK(i) loads four quadwords from p1 into mm0-mm3, pxor's in the
  * matching quadwords from p2 and stores the results back to p1.  Four
  * BLOCKs cover 16 quadwords (128 bytes) per iteration.
  *
  * The counter is tested only after the loop body (decl/jnz), so the body
  * always runs at least once; presumably callers guarantee bytes >= 128
  * (TODO confirm against callers).
  */
 31 static void
 32 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 33 {
 34         unsigned long lines = bytes >> 7;
 35 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 36         kernel_fpu_begin();
 37 
 38         asm volatile(
 39 #undef BLOCK
 40 #define BLOCK(i)                                \
 41         LD(i, 0)                                \
 42                 LD(i + 1, 1)                    \
 43                         LD(i + 2, 2)            \
 44                                 LD(i + 3, 3)    \
 45         XO1(i, 0)                               \
 46         ST(i, 0)                                \
 47                 XO1(i+1, 1)                     \
 48                 ST(i+1, 1)                      \
 49                         XO1(i + 2, 2)           \
 50                         ST(i + 2, 2)            \
 51                                 XO1(i + 3, 3)   \
 52                                 ST(i + 3, 3)
 53 
 54         " .align 32                     ;\n"
 55         " 1:                            ;\n"
 56 
 57         BLOCK(0)
 58         BLOCK(4)
 59         BLOCK(8)
 60         BLOCK(12)
 61 
            /* Step both buffers to the next 128-byte chunk, then loop until lines reaches 0. */
 62         "       addl $128, %1         ;\n"
 63         "       addl $128, %2         ;\n"
 64         "       decl %0               ;\n"
 65         "       jnz 1b                ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
 66         : "+r" (lines),
 67           "+r" (p1), "+r" (p2)
 68         :
 69         : "memory");
 70 
 71         kernel_fpu_end();
 72 }
 73 
 /*
  * xor_pII_mmx_3 - XOR the buffers at p2 and p3 into the buffer at p1
  * using MMX.
  *
  * bytes: byte count; bytes >> 7 is the number of 128-byte iterations,
  *        so a remainder below 128 is ignored.
  * p1:    first source and destination.
  * p2:    second source (only read by the asm).
  * p3:    third source (only read by the asm).
  *
  * Each BLOCK(i) loads four quadwords from p1 into mm0-mm3, pxor's in the
  * matching quadwords of p2 and then p3, and stores the results to p1.
  * Four BLOCKs process 128 bytes per iteration.
  *
  * The loop counter is tested after the body (decl/jnz), so the body
  * executes at least once; presumably callers pass bytes >= 128
  * (TODO confirm against callers).
  */
 74 static void
 75 xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 76               unsigned long *p3)
 77 {
 78         unsigned long lines = bytes >> 7;
 79 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 80         kernel_fpu_begin();
 81 
 82         asm volatile(
 83 #undef BLOCK
 84 #define BLOCK(i)                                \
 85         LD(i, 0)                                \
 86                 LD(i + 1, 1)                    \
 87                         LD(i + 2, 2)            \
 88                                 LD(i + 3, 3)    \
 89         XO1(i, 0)                               \
 90                 XO1(i + 1, 1)                   \
 91                         XO1(i + 2, 2)           \
 92                                 XO1(i + 3, 3)   \
 93         XO2(i, 0)                               \
 94         ST(i, 0)                                \
 95                 XO2(i + 1, 1)                   \
 96                 ST(i + 1, 1)                    \
 97                         XO2(i + 2, 2)           \
 98                         ST(i + 2, 2)            \
 99                                 XO2(i + 3, 3)   \
100                                 ST(i + 3, 3)
101 
102         " .align 32                     ;\n"
103         " 1:                            ;\n"
104 
105         BLOCK(0)
106         BLOCK(4)
107         BLOCK(8)
108         BLOCK(12)
109 
            /* Step all three buffers to the next 128-byte chunk, then loop until lines reaches 0. */
110         "       addl $128, %1         ;\n"
111         "       addl $128, %2         ;\n"
112         "       addl $128, %3         ;\n"
113         "       decl %0               ;\n"
114         "       jnz 1b                ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
115         : "+r" (lines),
116           "+r" (p1), "+r" (p2), "+r" (p3)
117         :
118         : "memory");
119 
120         kernel_fpu_end();
121 }
122 
/*
 * xor_pII_mmx_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * using MMX.
 *
 * bytes:  byte count; bytes >> 7 is the number of 128-byte iterations,
 *         so a remainder below 128 is ignored.
 * p1:     first source and destination.
 * p2..p4: additional sources (only read by the asm).
 *
 * Each BLOCK(i) loads four quadwords from p1 into mm0-mm3, pxor's in the
 * matching quadwords of p2, p3 and p4 in turn, and stores the results to
 * p1.  Four BLOCKs process 128 bytes per iteration.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 128
 * (TODO confirm against callers).
 */
123 static void
124 xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
125               unsigned long *p3, unsigned long *p4)
126 {
127         unsigned long lines = bytes >> 7;
128 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
129         kernel_fpu_begin();
130 
131         asm volatile(
132 #undef BLOCK
133 #define BLOCK(i)                                \
134         LD(i, 0)                                \
135                 LD(i + 1, 1)                    \
136                         LD(i + 2, 2)            \
137                                 LD(i + 3, 3)    \
138         XO1(i, 0)                               \
139                 XO1(i + 1, 1)                   \
140                         XO1(i + 2, 2)           \
141                                 XO1(i + 3, 3)   \
142         XO2(i, 0)                               \
143                 XO2(i + 1, 1)                   \
144                         XO2(i + 2, 2)           \
145                                 XO2(i + 3, 3)   \
146         XO3(i, 0)                               \
147         ST(i, 0)                                \
148                 XO3(i + 1, 1)                   \
149                 ST(i + 1, 1)                    \
150                         XO3(i + 2, 2)           \
151                         ST(i + 2, 2)            \
152                                 XO3(i + 3, 3)   \
153                                 ST(i + 3, 3)
154 
155         " .align 32                     ;\n"
156         " 1:                            ;\n"
157 
158         BLOCK(0)
159         BLOCK(4)
160         BLOCK(8)
161         BLOCK(12)
162 
            /* Step all four buffers to the next 128-byte chunk, then loop until lines reaches 0. */
163         "       addl $128, %1         ;\n"
164         "       addl $128, %2         ;\n"
165         "       addl $128, %3         ;\n"
166         "       addl $128, %4         ;\n"
167         "       decl %0               ;\n"
168         "       jnz 1b                ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
169         : "+r" (lines),
170           "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
171         :
172         : "memory");
173 
174         kernel_fpu_end();
175 }
176 
177 
/*
 * xor_pII_mmx_5 - XOR the buffers at p2..p5 into the buffer at p1 using
 * MMX.
 *
 * bytes:  byte count; bytes >> 7 is the number of 128-byte iterations,
 *         so a remainder below 128 is ignored.
 * p1:     first source and destination.
 * p2..p5: additional sources (only read through by the asm).
 *
 * Each BLOCK(i) loads four quadwords from p1 into mm0-mm3, pxor's in the
 * matching quadwords of p2, p3, p4 and p5 in turn, and stores the results
 * to p1.  Four BLOCKs process 128 bytes per iteration.
 *
 * Unlike the 2..4-source variants, p4 and p5 are passed as plain "r"
 * inputs even though the asm increments them (addl on %4 and %5); the
 * two empty asm statements around the main one exist to compensate, as
 * the in-body comments explain.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 128
 * (TODO confirm against callers).
 */
178 static void
179 xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
180               unsigned long *p3, unsigned long *p4, unsigned long *p5)
181 {
182         unsigned long lines = bytes >> 7;
183 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
184         kernel_fpu_begin();
185 
186         /* Make sure GCC forgets anything it knows about p4 or p5,
187            such that it won't pass to the asm volatile below a
188            register that is shared with any other variable.  That's
189            because we modify p4 and p5 there, but we can't mark them
190            as read/write, otherwise we'd overflow the 10-asm-operands
191            limit of GCC < 3.1.  */
192         asm("" : "+r" (p4), "+r" (p5));
193 
194         asm volatile(
195 #undef BLOCK
196 #define BLOCK(i)                                \
197         LD(i, 0)                                \
198                 LD(i + 1, 1)                    \
199                         LD(i + 2, 2)            \
200                                 LD(i + 3, 3)    \
201         XO1(i, 0)                               \
202                 XO1(i + 1, 1)                   \
203                         XO1(i + 2, 2)           \
204                                 XO1(i + 3, 3)   \
205         XO2(i, 0)                               \
206                 XO2(i + 1, 1)                   \
207                         XO2(i + 2, 2)           \
208                                 XO2(i + 3, 3)   \
209         XO3(i, 0)                               \
210                 XO3(i + 1, 1)                   \
211                         XO3(i + 2, 2)           \
212                                 XO3(i + 3, 3)   \
213         XO4(i, 0)                               \
214         ST(i, 0)                                \
215                 XO4(i + 1, 1)                   \
216                 ST(i + 1, 1)                    \
217                         XO4(i + 2, 2)           \
218                         ST(i + 2, 2)            \
219                                 XO4(i + 3, 3)   \
220                                 ST(i + 3, 3)
221 
222         " .align 32                     ;\n"
223         " 1:                            ;\n"
224 
225         BLOCK(0)
226         BLOCK(4)
227         BLOCK(8)
228         BLOCK(12)
229 
            /* Step all five buffers to the next 128-byte chunk, then loop until lines reaches 0. */
230         "       addl $128, %1         ;\n"
231         "       addl $128, %2         ;\n"
232         "       addl $128, %3         ;\n"
233         "       addl $128, %4         ;\n"
234         "       addl $128, %5         ;\n"
235         "       decl %0               ;\n"
236         "       jnz 1b                ;\n"
            /* lines and p1..p3 are read-write; p4 and p5 are declared input-only (see above). */
237         : "+r" (lines),
238           "+r" (p1), "+r" (p2), "+r" (p3)
239         : "r" (p4), "r" (p5)
240         : "memory");
241 
242         /* p4 and p5 were modified, and now the variables are dead.
243            Clobber them just to be sure nobody does something stupid
244            like assuming they have some legal value.  */
245         asm("" : "=r" (p4), "=r" (p5));
246 
247         kernel_fpu_end();
248 }
249 
/*
 * Retire the MMX fragment macros used by the xor_pII_mmx_* routines.
 * LD, ST and XO1..XO4 are defined again further down with SSE
 * instructions, so the MMX versions must not linger.
 */
250 #undef LD
251 #undef XO1
252 #undef XO2
253 #undef XO3
254 #undef XO4
255 #undef ST
256 #undef BLOCK
257 
/*
 * xor_p5_mmx_2 - XOR the buffer at p2 into the buffer at p1 using MMX,
 * with a hand-interleaved instruction order.
 *
 * bytes: byte count; bytes >> 6 is the number of 64-byte iterations, so
 *        a remainder below 64 is ignored.
 * p1:    first source and destination.
 * p2:    second source (only read by the asm).
 *
 * Each iteration handles eight quadwords (offsets 0..56) through
 * mm0-mm7: load from p1, pxor with p2, store back to p1.  Loads, XORs
 * and stores of neighbouring quadwords are interleaved rather than
 * grouped, so the statement order is part of the implementation.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 64
 * (TODO confirm against callers).
 */
258 static void
259 xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
260 {
261         unsigned long lines = bytes >> 6;
262 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
263         kernel_fpu_begin();
264 
265         asm volatile(
            /* NOTE(review): the 3/4/5-source p5 variants write ".align 32,0x90"; this one omits the fill byte. */
266         " .align 32                  ;\n"
267         " 1:                         ;\n"
268         "       movq   (%1), %%mm0   ;\n"
269         "       movq  8(%1), %%mm1   ;\n"
270         "       pxor   (%2), %%mm0   ;\n"
271         "       movq 16(%1), %%mm2   ;\n"
272         "       movq %%mm0,   (%1)   ;\n"
273         "       pxor  8(%2), %%mm1   ;\n"
274         "       movq 24(%1), %%mm3   ;\n"
275         "       movq %%mm1,  8(%1)   ;\n"
276         "       pxor 16(%2), %%mm2   ;\n"
277         "       movq 32(%1), %%mm4   ;\n"
278         "       movq %%mm2, 16(%1)   ;\n"
279         "       pxor 24(%2), %%mm3   ;\n"
280         "       movq 40(%1), %%mm5   ;\n"
281         "       movq %%mm3, 24(%1)   ;\n"
282         "       pxor 32(%2), %%mm4   ;\n"
283         "       movq 48(%1), %%mm6   ;\n"
284         "       movq %%mm4, 32(%1)   ;\n"
285         "       pxor 40(%2), %%mm5   ;\n"
286         "       movq 56(%1), %%mm7   ;\n"
287         "       movq %%mm5, 40(%1)   ;\n"
288         "       pxor 48(%2), %%mm6   ;\n"
289         "       pxor 56(%2), %%mm7   ;\n"
290         "       movq %%mm6, 48(%1)   ;\n"
291         "       movq %%mm7, 56(%1)   ;\n"
292 
            /* Step both buffers to the next 64-byte chunk, then loop until lines reaches 0. */
293         "       addl $64, %1         ;\n"
294         "       addl $64, %2         ;\n"
295         "       decl %0              ;\n"
296         "       jnz 1b               ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
297         : "+r" (lines),
298           "+r" (p1), "+r" (p2)
299         :
300         : "memory");
301 
302         kernel_fpu_end();
303 }
304 
/*
 * xor_p5_mmx_3 - XOR the buffers at p2 and p3 into the buffer at p1
 * using MMX, with a hand-interleaved instruction order.
 *
 * bytes: byte count; bytes >> 6 is the number of 64-byte iterations, so
 *        a remainder below 64 is ignored.
 * p1:    first source and destination.
 * p2:    second source (only read by the asm).
 * p3:    third source (only read by the asm).
 *
 * Each iteration handles eight quadwords (offsets 0..56) through
 * mm0-mm7: load from p1, pxor with p2 and p3, store back to p1.  The
 * operations on different quadwords are interleaved, so the statement
 * order is part of the implementation.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 64
 * (TODO confirm against callers).
 */
305 static void
306 xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
307              unsigned long *p3)
308 {
309         unsigned long lines = bytes >> 6;
310 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
311         kernel_fpu_begin();
312 
313         asm volatile(
314         " .align 32,0x90             ;\n"
315         " 1:                         ;\n"
316         "       movq   (%1), %%mm0   ;\n"
317         "       movq  8(%1), %%mm1   ;\n"
318         "       pxor   (%2), %%mm0   ;\n"
319         "       movq 16(%1), %%mm2   ;\n"
320         "       pxor  8(%2), %%mm1   ;\n"
321         "       pxor   (%3), %%mm0   ;\n"
322         "       pxor 16(%2), %%mm2   ;\n"
323         "       movq %%mm0,   (%1)   ;\n"
324         "       pxor  8(%3), %%mm1   ;\n"
325         "       pxor 16(%3), %%mm2   ;\n"
326         "       movq 24(%1), %%mm3   ;\n"
327         "       movq %%mm1,  8(%1)   ;\n"
328         "       movq 32(%1), %%mm4   ;\n"
329         "       movq 40(%1), %%mm5   ;\n"
330         "       pxor 24(%2), %%mm3   ;\n"
331         "       movq %%mm2, 16(%1)   ;\n"
332         "       pxor 32(%2), %%mm4   ;\n"
333         "       pxor 24(%3), %%mm3   ;\n"
334         "       pxor 40(%2), %%mm5   ;\n"
335         "       movq %%mm3, 24(%1)   ;\n"
336         "       pxor 32(%3), %%mm4   ;\n"
337         "       pxor 40(%3), %%mm5   ;\n"
338         "       movq 48(%1), %%mm6   ;\n"
339         "       movq %%mm4, 32(%1)   ;\n"
340         "       movq 56(%1), %%mm7   ;\n"
341         "       pxor 48(%2), %%mm6   ;\n"
342         "       movq %%mm5, 40(%1)   ;\n"
343         "       pxor 56(%2), %%mm7   ;\n"
344         "       pxor 48(%3), %%mm6   ;\n"
345         "       pxor 56(%3), %%mm7   ;\n"
346         "       movq %%mm6, 48(%1)   ;\n"
347         "       movq %%mm7, 56(%1)   ;\n"
348 
            /* Step all three buffers to the next 64-byte chunk, then loop until lines reaches 0. */
349         "       addl $64, %1         ;\n"
350         "       addl $64, %2         ;\n"
351         "       addl $64, %3         ;\n"
352         "       decl %0              ;\n"
353         "       jnz 1b               ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
354         : "+r" (lines),
355           "+r" (p1), "+r" (p2), "+r" (p3)
356         :
357         : "memory" );
358 
359         kernel_fpu_end();
360 }
361 
/*
 * xor_p5_mmx_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * using MMX, with a hand-interleaved instruction order.
 *
 * bytes:  byte count; bytes >> 6 is the number of 64-byte iterations, so
 *         a remainder below 64 is ignored.
 * p1:     first source and destination.
 * p2..p4: additional sources (only read by the asm).
 *
 * Each iteration handles eight quadwords (offsets 0..56) through
 * mm0-mm7: load from p1, pxor with p2, p3 and p4, store back to p1.  The
 * operations on different quadwords are interleaved, so the statement
 * order is part of the implementation.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 64
 * (TODO confirm against callers).
 */
362 static void
363 xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
364              unsigned long *p3, unsigned long *p4)
365 {
366         unsigned long lines = bytes >> 6;
367 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
368         kernel_fpu_begin();
369 
370         asm volatile(
371         " .align 32,0x90             ;\n"
372         " 1:                         ;\n"
373         "       movq   (%1), %%mm0   ;\n"
374         "       movq  8(%1), %%mm1   ;\n"
375         "       pxor   (%2), %%mm0   ;\n"
376         "       movq 16(%1), %%mm2   ;\n"
377         "       pxor  8(%2), %%mm1   ;\n"
378         "       pxor   (%3), %%mm0   ;\n"
379         "       pxor 16(%2), %%mm2   ;\n"
380         "       pxor  8(%3), %%mm1   ;\n"
381         "       pxor   (%4), %%mm0   ;\n"
382         "       movq 24(%1), %%mm3   ;\n"
383         "       pxor 16(%3), %%mm2   ;\n"
384         "       pxor  8(%4), %%mm1   ;\n"
385         "       movq %%mm0,   (%1)   ;\n"
386         "       movq 32(%1), %%mm4   ;\n"
387         "       pxor 24(%2), %%mm3   ;\n"
388         "       pxor 16(%4), %%mm2   ;\n"
389         "       movq %%mm1,  8(%1)   ;\n"
390         "       movq 40(%1), %%mm5   ;\n"
391         "       pxor 32(%2), %%mm4   ;\n"
392         "       pxor 24(%3), %%mm3   ;\n"
393         "       movq %%mm2, 16(%1)   ;\n"
394         "       pxor 40(%2), %%mm5   ;\n"
395         "       pxor 32(%3), %%mm4   ;\n"
396         "       pxor 24(%4), %%mm3   ;\n"
397         "       movq %%mm3, 24(%1)   ;\n"
398         "       movq 56(%1), %%mm7   ;\n"
399         "       movq 48(%1), %%mm6   ;\n"
400         "       pxor 40(%3), %%mm5   ;\n"
401         "       pxor 32(%4), %%mm4   ;\n"
402         "       pxor 48(%2), %%mm6   ;\n"
403         "       movq %%mm4, 32(%1)   ;\n"
404         "       pxor 56(%2), %%mm7   ;\n"
405         "       pxor 40(%4), %%mm5   ;\n"
406         "       pxor 48(%3), %%mm6   ;\n"
407         "       pxor 56(%3), %%mm7   ;\n"
408         "       movq %%mm5, 40(%1)   ;\n"
409         "       pxor 48(%4), %%mm6   ;\n"
410         "       pxor 56(%4), %%mm7   ;\n"
411         "       movq %%mm6, 48(%1)   ;\n"
412         "       movq %%mm7, 56(%1)   ;\n"
413 
            /* Step all four buffers to the next 64-byte chunk, then loop until lines reaches 0. */
414         "       addl $64, %1         ;\n"
415         "       addl $64, %2         ;\n"
416         "       addl $64, %3         ;\n"
417         "       addl $64, %4         ;\n"
418         "       decl %0              ;\n"
419         "       jnz 1b               ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
420         : "+r" (lines),
421           "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
422         :
423         : "memory");
424 
425         kernel_fpu_end();
426 }
427 
/*
 * xor_p5_mmx_5 - XOR the buffers at p2..p5 into the buffer at p1 using
 * MMX, with a hand-interleaved instruction order.
 *
 * bytes:  byte count; bytes >> 6 is the number of 64-byte iterations, so
 *         a remainder below 64 is ignored.
 * p1:     first source and destination.
 * p2..p5: additional sources (only read through by the asm).
 *
 * Each iteration handles eight quadwords (offsets 0..56) through
 * mm0-mm7: load from p1, pxor with p2, p3, p4 and p5, store back to p1.
 * The operations on different quadwords are interleaved, so the
 * statement order is part of the implementation.
 *
 * p4 and p5 are passed as plain "r" inputs even though the asm
 * increments them (addl on %4 and %5); the two empty asm statements
 * around the main one exist to compensate, as the in-body comments
 * explain.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 64
 * (TODO confirm against callers).
 */
428 static void
429 xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
430              unsigned long *p3, unsigned long *p4, unsigned long *p5)
431 {
432         unsigned long lines = bytes >> 6;
433 
            /* The MMX register use below is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
434         kernel_fpu_begin();
435 
436         /* Make sure GCC forgets anything it knows about p4 or p5,
437            such that it won't pass to the asm volatile below a
438            register that is shared with any other variable.  That's
439            because we modify p4 and p5 there, but we can't mark them
440            as read/write, otherwise we'd overflow the 10-asm-operands
441            limit of GCC < 3.1.  */
442         asm("" : "+r" (p4), "+r" (p5));
443 
444         asm volatile(
445         " .align 32,0x90             ;\n"
446         " 1:                         ;\n"
447         "       movq   (%1), %%mm0   ;\n"
448         "       movq  8(%1), %%mm1   ;\n"
449         "       pxor   (%2), %%mm0   ;\n"
450         "       pxor  8(%2), %%mm1   ;\n"
451         "       movq 16(%1), %%mm2   ;\n"
452         "       pxor   (%3), %%mm0   ;\n"
453         "       pxor  8(%3), %%mm1   ;\n"
454         "       pxor 16(%2), %%mm2   ;\n"
455         "       pxor   (%4), %%mm0   ;\n"
456         "       pxor  8(%4), %%mm1   ;\n"
457         "       pxor 16(%3), %%mm2   ;\n"
458         "       movq 24(%1), %%mm3   ;\n"
459         "       pxor   (%5), %%mm0   ;\n"
460         "       pxor  8(%5), %%mm1   ;\n"
461         "       movq %%mm0,   (%1)   ;\n"
462         "       pxor 16(%4), %%mm2   ;\n"
463         "       pxor 24(%2), %%mm3   ;\n"
464         "       movq %%mm1,  8(%1)   ;\n"
465         "       pxor 16(%5), %%mm2   ;\n"
466         "       pxor 24(%3), %%mm3   ;\n"
467         "       movq 32(%1), %%mm4   ;\n"
468         "       movq %%mm2, 16(%1)   ;\n"
469         "       pxor 24(%4), %%mm3   ;\n"
470         "       pxor 32(%2), %%mm4   ;\n"
471         "       movq 40(%1), %%mm5   ;\n"
472         "       pxor 24(%5), %%mm3   ;\n"
473         "       pxor 32(%3), %%mm4   ;\n"
474         "       pxor 40(%2), %%mm5   ;\n"
475         "       movq %%mm3, 24(%1)   ;\n"
476         "       pxor 32(%4), %%mm4   ;\n"
477         "       pxor 40(%3), %%mm5   ;\n"
478         "       movq 48(%1), %%mm6   ;\n"
479         "       movq 56(%1), %%mm7   ;\n"
480         "       pxor 32(%5), %%mm4   ;\n"
481         "       pxor 40(%4), %%mm5   ;\n"
482         "       pxor 48(%2), %%mm6   ;\n"
483         "       pxor 56(%2), %%mm7   ;\n"
484         "       movq %%mm4, 32(%1)   ;\n"
485         "       pxor 48(%3), %%mm6   ;\n"
486         "       pxor 56(%3), %%mm7   ;\n"
487         "       pxor 40(%5), %%mm5   ;\n"
488         "       pxor 48(%4), %%mm6   ;\n"
489         "       pxor 56(%4), %%mm7   ;\n"
490         "       movq %%mm5, 40(%1)   ;\n"
491         "       pxor 48(%5), %%mm6   ;\n"
492         "       pxor 56(%5), %%mm7   ;\n"
493         "       movq %%mm6, 48(%1)   ;\n"
494         "       movq %%mm7, 56(%1)   ;\n"
495 
            /* Step all five buffers to the next 64-byte chunk, then loop until lines reaches 0. */
496         "       addl $64, %1         ;\n"
497         "       addl $64, %2         ;\n"
498         "       addl $64, %3         ;\n"
499         "       addl $64, %4         ;\n"
500         "       addl $64, %5         ;\n"
501         "       decl %0              ;\n"
502         "       jnz 1b               ;\n"
            /* lines and p1..p3 are read-write; p4 and p5 are declared input-only (see above). */
503         : "+r" (lines),
504           "+r" (p1), "+r" (p2), "+r" (p3)
505         : "r" (p4), "r" (p5)
506         : "memory");
507 
508         /* p4 and p5 were modified, and now the variables are dead.
509            Clobber them just to be sure nobody does something stupid
510            like assuming they have some legal value.  */
511         asm("" : "=r" (p4), "=r" (p5));
512 
513         kernel_fpu_end();
514 }
515 
/*
 * Template entry for the "pII_mmx" implementation: a display name plus
 * the 2-, 3-, 4- and 5-buffer entry points defined above.
 * struct xor_block_template itself is declared outside this file.
 */
516 static struct xor_block_template xor_block_pII_mmx = {
517         .name = "pII_mmx",
518         .do_2 = xor_pII_mmx_2,
519         .do_3 = xor_pII_mmx_3,
520         .do_4 = xor_pII_mmx_4,
521         .do_5 = xor_pII_mmx_5,
522 };
523 
/*
 * Template entry for the "p5_mmx" implementation: a display name plus
 * the 2-, 3-, 4- and 5-buffer entry points defined above.
 * struct xor_block_template itself is declared outside this file.
 */
524 static struct xor_block_template xor_block_p5_mmx = {
525         .name = "p5_mmx",
526         .do_2 = xor_p5_mmx_2,
527         .do_3 = xor_p5_mmx_3,
528         .do_4 = xor_p5_mmx_4,
529         .do_5 = xor_p5_mmx_5,
530 };
531 
532 /*
533  * Cache avoiding checksumming functions utilizing KNI instructions
534  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
535  */
536 
/*
 * XMMS_SAVE - prologue for the SSE routines.
 *
 * Relies on two locals in the enclosing function: `cr0` (receives the
 * value of read_cr0()) and `xmm_save` (a buffer of at least 64 bytes).
 * In order it: disables preemption, records CR0, calls clts()
 * (presumably clearing CR0.TS so SSE instructions do not trap -- confirm
 * against its definition), and copies xmm0-xmm3 into xmm_save with
 * movups.  Must be paired with XMMS_RESTORE.
 */
537 #define XMMS_SAVE                               \
538 do {                                            \
539         preempt_disable();                      \
540         cr0 = read_cr0();                       \
541         clts();                                 \
542         asm volatile(                           \
543                 "movups %%xmm0,(%0)     ;\n\t"  \
544                 "movups %%xmm1,0x10(%0) ;\n\t"  \
545                 "movups %%xmm2,0x20(%0) ;\n\t"  \
546                 "movups %%xmm3,0x30(%0) ;\n\t"  \
547                 :                               \
548                 : "r" (xmm_save)                \
549                 : "memory");                    \
550 } while (0)
551 
/*
 * XMMS_RESTORE - epilogue for the SSE routines; undoes XMMS_SAVE.
 *
 * Uses the enclosing function's `xmm_save` and `cr0` locals.  In order
 * it: issues sfence, reloads xmm0-xmm3 from xmm_save with movups, writes
 * the saved value back with write_cr0(cr0), and re-enables preemption.
 */
552 #define XMMS_RESTORE                            \
553 do {                                            \
554         asm volatile(                           \
555                 "sfence                 ;\n\t"  \
556                 "movups (%0),%%xmm0     ;\n\t"  \
557                 "movups 0x10(%0),%%xmm1 ;\n\t"  \
558                 "movups 0x20(%0),%%xmm2 ;\n\t"  \
559                 "movups 0x30(%0),%%xmm3 ;\n\t"  \
560                 :                               \
561                 : "r" (xmm_save)                \
562                 : "memory");                    \
563         write_cr0(cr0);                         \
564         preempt_enable();                       \
565 } while (0)
566 
/*
 * Helpers for the SSE routines below.
 *
 *   ALIGN16    - attribute requesting 16-byte alignment.
 *   OFFS(x)    - displacement string for the x-th 16-byte unit.
 *   PF_OFFS(x) - the same displacement plus 256 bytes.
 *   PF0(x)     - prefetchnta at PF_OFFS(x) from asm operand %1.
 *   PFn(x)     - prefetchnta at PF_OFFS(x) from asm operand %(n+1).
 *   LD(x, y)   - movaps from asm operand %1 into %xmmY.
 *   ST(x, y)   - movaps from %xmmY back to asm operand %1.
 *   XOn(x, y)  - xorps the 16 bytes at asm operand %(n+1) into %xmmY.
 *
 * LD, ST and XO1..XO4 reuse the names of the earlier MMX macros, which
 * were #undef'ed above.
 */
567 #define ALIGN16 __attribute__((aligned(16)))
568 
569 #define OFFS(x)         "16*("#x")"
570 #define PF_OFFS(x)      "256+16*("#x")"
571 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%1)            ;\n"
572 #define LD(x, y)        "       movaps   "OFFS(x)"(%1), %%xmm"#y"       ;\n"
573 #define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%1)       ;\n"
574 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%2)            ;\n"
575 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%3)            ;\n"
576 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%4)            ;\n"
577 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%5)            ;\n"
578 #define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%6)            ;\n"
579 #define XO1(x, y)       "       xorps   "OFFS(x)"(%2), %%xmm"#y"        ;\n"
580 #define XO2(x, y)       "       xorps   "OFFS(x)"(%3), %%xmm"#y"        ;\n"
581 #define XO3(x, y)       "       xorps   "OFFS(x)"(%4), %%xmm"#y"        ;\n"
582 #define XO4(x, y)       "       xorps   "OFFS(x)"(%5), %%xmm"#y"        ;\n"
583 #define XO5(x, y)       "       xorps   "OFFS(x)"(%6), %%xmm"#y"        ;\n"
584 
585 
/*
 * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 using SSE.
 *
 * bytes: byte count; bytes >> 8 is the number of 256-byte iterations, so
 *        a remainder below 256 is ignored.
 * p1:    first source and destination.
 * p2:    second source (only read by the asm).
 *
 * Each BLOCK(i) loads four 16-byte units from p1 into xmm0-xmm3, xorps's
 * in the matching units from p2 and stores the results to p1, with
 * prefetchnta hints for both buffers mixed in.  Four BLOCKs cover 256
 * bytes per iteration.  The aligned movaps/xorps forms are used, so the
 * buffers are expected to be 16-byte aligned.
 *
 * xmm0-xmm3 are preserved across the call by XMMS_SAVE/XMMS_RESTORE,
 * which use the local xmm_save buffer and cr0 variable.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 256
 * (TODO confirm against callers).
 */
586 static void
587 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
588 {
589         unsigned long lines = bytes >> 8;
            /* Scratch space and saved CR0 consumed by XMMS_SAVE / XMMS_RESTORE. */
590         char xmm_save[16*4] ALIGN16;
591         int cr0;
592 
593         XMMS_SAVE;
594 
595         asm volatile(
596 #undef BLOCK
597 #define BLOCK(i)                                        \
598                 LD(i, 0)                                \
599                         LD(i + 1, 1)                    \
600                 PF1(i)                                  \
601                                 PF1(i + 2)              \
602                                 LD(i + 2, 2)            \
603                                         LD(i + 3, 3)    \
604                 PF0(i + 4)                              \
605                                 PF0(i + 6)              \
606                 XO1(i, 0)                               \
607                         XO1(i + 1, 1)                   \
608                                 XO1(i + 2, 2)           \
609                                         XO1(i + 3, 3)   \
610                 ST(i, 0)                                \
611                         ST(i + 1, 1)                    \
612                                 ST(i + 2, 2)            \
613                                         ST(i + 3, 3)    \
614 
615 
                    /* Two p1 prefetch hints are issued once, before the loop label. */
616                 PF0(0)
617                                 PF0(2)
618 
619         " .align 32                     ;\n"
620         " 1:                            ;\n"
621 
622                 BLOCK(0)
623                 BLOCK(4)
624                 BLOCK(8)
625                 BLOCK(12)
626 
            /* Step both buffers to the next 256-byte chunk, then loop until lines reaches 0. */
627         "       addl $256, %1           ;\n"
628         "       addl $256, %2           ;\n"
629         "       decl %0                 ;\n"
630         "       jnz 1b                  ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
631         : "+r" (lines),
632           "+r" (p1), "+r" (p2)
633         :
634         : "memory");
635 
636         XMMS_RESTORE;
637 }
638 
/*
 * xor_sse_3 - XOR the buffers at p2 and p3 into the buffer at p1 using
 * SSE.
 *
 * bytes: byte count; bytes >> 8 is the number of 256-byte iterations, so
 *        a remainder below 256 is ignored.
 * p1:    first source and destination.
 * p2:    second source (only read by the asm).
 * p3:    third source (only read by the asm).
 *
 * Each BLOCK(i) loads four 16-byte units from p1 into xmm0-xmm3, xorps's
 * in the matching units of p2 and then p3, and stores the results to p1,
 * with prefetchnta hints for all three buffers mixed in.  Four BLOCKs
 * cover 256 bytes per iteration.  The aligned movaps/xorps forms are
 * used, so the buffers are expected to be 16-byte aligned.
 *
 * xmm0-xmm3 are preserved across the call by XMMS_SAVE/XMMS_RESTORE,
 * which use the local xmm_save buffer and cr0 variable.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 256
 * (TODO confirm against callers).
 */
639 static void
640 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
641           unsigned long *p3)
642 {
643         unsigned long lines = bytes >> 8;
            /* Scratch space and saved CR0 consumed by XMMS_SAVE / XMMS_RESTORE. */
644         char xmm_save[16*4] ALIGN16;
645         int cr0;
646 
647         XMMS_SAVE;
648 
649         asm volatile(
650 #undef BLOCK
651 #define BLOCK(i) \
652                 PF1(i)                                  \
653                                 PF1(i + 2)              \
654                 LD(i,0)                                 \
655                         LD(i + 1, 1)                    \
656                                 LD(i + 2, 2)            \
657                                         LD(i + 3, 3)    \
658                 PF2(i)                                  \
659                                 PF2(i + 2)              \
660                 PF0(i + 4)                              \
661                                 PF0(i + 6)              \
662                 XO1(i,0)                                \
663                         XO1(i + 1, 1)                   \
664                                 XO1(i + 2, 2)           \
665                                         XO1(i + 3, 3)   \
666                 XO2(i,0)                                \
667                         XO2(i + 1, 1)                   \
668                                 XO2(i + 2, 2)           \
669                                         XO2(i + 3, 3)   \
670                 ST(i,0)                                 \
671                         ST(i + 1, 1)                    \
672                                 ST(i + 2, 2)            \
673                                         ST(i + 3, 3)    \
674 
675 
                    /* Two p1 prefetch hints are issued once, before the loop label. */
676                 PF0(0)
677                                 PF0(2)
678 
679         " .align 32                     ;\n"
680         " 1:                            ;\n"
681 
682                 BLOCK(0)
683                 BLOCK(4)
684                 BLOCK(8)
685                 BLOCK(12)
686 
            /* Step all three buffers to the next 256-byte chunk, then loop until lines reaches 0. */
687         "       addl $256, %1           ;\n"
688         "       addl $256, %2           ;\n"
689         "       addl $256, %3           ;\n"
690         "       decl %0                 ;\n"
691         "       jnz 1b                  ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
692         : "+r" (lines),
693           "+r" (p1), "+r"(p2), "+r"(p3)
694         :
695         : "memory" );
696 
697         XMMS_RESTORE;
698 }
699 
/*
 * xor_sse_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * using SSE.
 *
 * bytes:  byte count; bytes >> 8 is the number of 256-byte iterations,
 *         so a remainder below 256 is ignored.
 * p1:     first source and destination.
 * p2..p4: additional sources (only read by the asm).
 *
 * Each BLOCK(i) loads four 16-byte units from p1 into xmm0-xmm3, xorps's
 * in the matching units of p2, p3 and p4 in turn, and stores the results
 * to p1, with prefetchnta hints for all four buffers mixed in.  Four
 * BLOCKs cover 256 bytes per iteration.  The aligned movaps/xorps forms
 * are used, so the buffers are expected to be 16-byte aligned.
 *
 * xmm0-xmm3 are preserved across the call by XMMS_SAVE/XMMS_RESTORE,
 * which use the local xmm_save buffer and cr0 variable.
 *
 * The loop counter is tested after the body (decl/jnz), so the body
 * executes at least once; presumably callers pass bytes >= 256
 * (TODO confirm against callers).
 */
700 static void
701 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
702           unsigned long *p3, unsigned long *p4)
703 {
704         unsigned long lines = bytes >> 8;
            /* Scratch space and saved CR0 consumed by XMMS_SAVE / XMMS_RESTORE. */
705         char xmm_save[16*4] ALIGN16;
706         int cr0;
707 
708         XMMS_SAVE;
709 
710         asm volatile(
711 #undef BLOCK
712 #define BLOCK(i) \
713                 PF1(i)                                  \
714                                 PF1(i + 2)              \
715                 LD(i,0)                                 \
716                         LD(i + 1, 1)                    \
717                                 LD(i + 2, 2)            \
718                                         LD(i + 3, 3)    \
719                 PF2(i)                                  \
720                                 PF2(i + 2)              \
721                 XO1(i,0)                                \
722                         XO1(i + 1, 1)                   \
723                                 XO1(i + 2, 2)           \
724                                         XO1(i + 3, 3)   \
725                 PF3(i)                                  \
726                                 PF3(i + 2)              \
727                 PF0(i + 4)                              \
728                                 PF0(i + 6)              \
729                 XO2(i,0)                                \
730                         XO2(i + 1, 1)                   \
731                                 XO2(i + 2, 2)           \
732                                         XO2(i + 3, 3)   \
733                 XO3(i,0)                                \
734                         XO3(i + 1, 1)                   \
735                                 XO3(i + 2, 2)           \
736                                         XO3(i + 3, 3)   \
737                 ST(i,0)                                 \
738                         ST(i + 1, 1)                    \
739                                 ST(i + 2, 2)            \
740                                         ST(i + 3, 3)    \
741 
742 
                    /* Two p1 prefetch hints are issued once, before the loop label. */
743                 PF0(0)
744                                 PF0(2)
745 
746         " .align 32                     ;\n"
747         " 1:                            ;\n"
748 
749                 BLOCK(0)
750                 BLOCK(4)
751                 BLOCK(8)
752                 BLOCK(12)
753 
            /* Step all four buffers to the next 256-byte chunk, then loop until lines reaches 0. */
754         "       addl $256, %1           ;\n"
755         "       addl $256, %2           ;\n"
756         "       addl $256, %3           ;\n"
757         "       addl $256, %4           ;\n"
758         "       decl %0                 ;\n"
759         "       jnz 1b                  ;\n"
            /* Every operand is read-write: the asm decrements lines and advances the pointers. */
760         : "+r" (lines),
761           "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
762         :
763         : "memory" );
764 
765         XMMS_RESTORE;
766 }
767 
/*
 * xor_sse_5 - five-buffer variant of the SSE routines in this header.
 *
 * bytes:   length of each buffer; only bytes >> 8 (whole 256-byte chunks)
 *          is used as the loop count.
 * p1..p5:  the five buffers.  Every pointer is advanced by 256 per loop
 *          pass inside the asm.
 *
 * NOTE(review): the LD/XOn/ST/PFn macros are defined earlier in this
 * header, outside this excerpt; presumably they load from p1, XOR in
 * p2..p5, store back to p1 and prefetch -- confirm against their
 * definitions.
 *
 * The loop counter is decremented and tested only after the first pass,
 * so the body always runs at least once.  With bytes < 256 the count
 * starts at 0 and the decrement wraps; callers presumably always pass a
 * nonzero multiple of 256 -- confirm.
 */
768 static void
769 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
770           unsigned long *p3, unsigned long *p4, unsigned long *p5)
771 {
            /* Number of 256-byte chunks; one chunk is handled per loop pass. */
772         unsigned long lines = bytes >> 8;
            /* xmm_save and cr0 are not referenced directly in this function;
               presumably the XMMS_SAVE/XMMS_RESTORE macros (defined outside
               this excerpt) use them -- confirm.  */
773         char xmm_save[16*4] ALIGN16;
774         int cr0;
775 
776         XMMS_SAVE;
777 
778         /* Make sure GCC forgets anything it knows about p4 or p5,
779            such that it won't pass to the asm volatile below a
780            register that is shared with any other variable.  That's
781            because we modify p4 and p5 there, but we can't mark them
782            as read/write, otherwise we'd overflow the 10-asm-operands
783            limit of GCC < 3.1.  */
784         asm("" : "+r" (p4), "+r" (p5));
785 
786         asm volatile(
            /* BLOCK(i) is redefined for this function: it emits the
               instruction sequence for indices i..i+3, pairing index
               i+k with second argument k (presumably a register
               number -- confirm).  It is expanded four times per loop
               pass below, covering indices 0..15.  */
787 #undef BLOCK
788 #define BLOCK(i) \
789                 PF1(i)                                  \
790                                 PF1(i + 2)              \
791                 LD(i,0)                                 \
792                         LD(i + 1, 1)                    \
793                                 LD(i + 2, 2)            \
794                                         LD(i + 3, 3)    \
795                 PF2(i)                                  \
796                                 PF2(i + 2)              \
797                 XO1(i,0)                                \
798                         XO1(i + 1, 1)                   \
799                                 XO1(i + 2, 2)           \
800                                         XO1(i + 3, 3)   \
801                 PF3(i)                                  \
802                                 PF3(i + 2)              \
803                 XO2(i,0)                                \
804                         XO2(i + 1, 1)                   \
805                                 XO2(i + 2, 2)           \
806                                         XO2(i + 3, 3)   \
807                 PF4(i)                                  \
808                                 PF4(i + 2)              \
809                 PF0(i + 4)                              \
810                                 PF0(i + 6)              \
811                 XO3(i,0)                                \
812                         XO3(i + 1, 1)                   \
813                                 XO3(i + 2, 2)           \
814                                         XO3(i + 3, 3)   \
815                 XO4(i,0)                                \
816                         XO4(i + 1, 1)                   \
817                                 XO4(i + 2, 2)           \
818                                         XO4(i + 3, 3)   \
819                 ST(i,0)                                 \
820                         ST(i + 1, 1)                    \
821                                 ST(i + 2, 2)            \
822                                         ST(i + 3, 3)    \
823 
824 
            /* These two PF0 expansions sit before label 1, so they are
               issued once, ahead of the loop.  */
825                 PF0(0)
826                                 PF0(2)
827 
            /* Loop head; "jnz 1b" below branches back to this label.  */
828         " .align 32                     ;\n"
829         " 1:                            ;\n"
830 
831                 BLOCK(0)
832                 BLOCK(4)
833                 BLOCK(8)
834                 BLOCK(12)
835 
            /* Step all five pointers forward by 256 bytes (%1-%5 are
               p1-p5), then decrement the chunk count in %0 and repeat
               while it is nonzero.  */
836         "       addl $256, %1           ;\n"
837         "       addl $256, %2           ;\n"
838         "       addl $256, %3           ;\n"
839         "       addl $256, %4           ;\n"
840         "       addl $256, %5           ;\n"
841         "       decl %0                 ;\n"
842         "       jnz 1b                  ;\n"
            /* Outputs (read/write): %0 = lines, %1-%3 = p1-p3.
               Inputs: %4 = p4, %5 = p5 -- modified by the addl
               instructions above despite being declared input-only;
               the empty asm statements before and after this one
               exist to make that safe.  */
843         : "+r" (lines),
844           "+r" (p1), "+r" (p2), "+r" (p3)
845         : "r" (p4), "r" (p5)
846         : "memory");
847 
848         /* p4 and p5 were modified, and now the variables are dead.
849            Clobber them just to be sure nobody does something stupid
850            like assuming they have some legal value.  */
851         asm("" : "=r" (p4), "=r" (p5));
852 
853         XMMS_RESTORE;
854 }
855 
/*
 * Template that bundles the SSE routines under the name "pIII_sse":
 * the do_2..do_5 slots point at xor_sse_2..xor_sse_5.  Within this
 * header it is passed to xor_speed() by XOR_TRY_TEMPLATES and is the
 * template XOR_SELECT_TEMPLATE picks when cpu_has_xmm is set.
 */
856 static struct xor_block_template xor_block_pIII_sse = {
857         .name = "pIII_sse",
858         .do_2 = xor_sse_2,
859         .do_3 = xor_sse_3,
860         .do_4 = xor_sse_4,
861         .do_5 = xor_sse_5,
862 };
863 
864 /* Also try the AVX routines */
865 #include "xor_avx.h"
866 
867 /* Also try the generic routines.  */
868 #include <asm-generic/xor.h>
869 
/*
 * Replace any earlier XOR_TRY_TEMPLATES definition with this header's
 * candidate list.  In order, the macro calls xor_speed() on the four
 * generic templates (8regs, 8regs_p, 32regs, 32regs_p), expands
 * AVX_XOR_SPEED, tries the SSE template only when cpu_has_xmm is set,
 * and tries the two MMX templates only when cpu_has_mmx is set.
 * The do { } while (0) wrapper lets the expansion be used as a single
 * statement.
 * NOTE(review): AVX_XOR_SPEED presumably comes from xor_avx.h and the
 * generic templates from asm-generic/xor.h (both included just above)
 * -- confirm.
 */
870 #undef XOR_TRY_TEMPLATES
871 #define XOR_TRY_TEMPLATES                               \
872 do {                                                    \
873         xor_speed(&xor_block_8regs);                    \
874         xor_speed(&xor_block_8regs_p);                  \
875         xor_speed(&xor_block_32regs);                   \
876         xor_speed(&xor_block_32regs_p);                 \
877         AVX_XOR_SPEED;                                  \
878         if (cpu_has_xmm)                                \
879                 xor_speed(&xor_block_pIII_sse);         \
880         if (cpu_has_mmx) {                              \
881                 xor_speed(&xor_block_pII_mmx);          \
882                 xor_speed(&xor_block_p5_mmx);           \
883         }                                               \
884 } while (0)
885 
886 /* We force the use of the SSE xor block because it can write around L2.
887    We may also be able to load into the L1 only depending on how the cpu
888    deals with a load to a line that is being prefetched.  */
/* Expands to AVX_SELECT() applied to the SSE template when cpu_has_xmm
   is true (FASTEST is then ignored), or to FASTEST itself otherwise.
   NOTE(review): AVX_SELECT is not defined in this excerpt; presumably it
   comes from xor_avx.h and may substitute an AVX template -- confirm.  */
889 #define XOR_SELECT_TEMPLATE(FASTEST)                    \
890         AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
891 
892 #endif /* _ASM_X86_XOR_32_H */
893 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp