00001
00002
00003
00004
00005
00006
00007
00008
00009
/* NOTE(review): "fPIC" is not a recognized #pragma on GCC/Clang; unknown
   pragmas are ignored (possibly with a warning).  Position-independent code
   is requested with the -fPIC *compiler flag*, not a pragma, so this line
   has no effect and is likely a leftover — consider removing. */
#pragma fPIC
00011
/* Force 16-/64-byte alignment on a type or object (GCC/Clang attribute
   syntax).  SSE movaps/movapd require 16-byte-aligned memory operands. */
#define ALIGN16 __attribute__ ((aligned (16)))
#define ALIGN64 __attribute__ ((aligned (64)))
/* Shorthand for a volatile GCC extended-asm statement.  "volatile" keeps the
   optimizer from deleting or reordering the asm blocks — essential here,
   because the macros below hand values from one _ASM statement to the next
   in xmm registers without declaring clobber lists.
   NOTE(review): the name _ASM (leading underscore followed by a capital) is
   formally reserved for the implementation. */
#define _ASM __asm__ __volatile__
00015
/* One SSE register's worth of data: 4 packed floats, a 3x4-float block (one
   SU(3) colour vector of complex floats laid out for SIMD), 4 packed ints
   (used as bit masks), and 2 packed doubles (one complex double).  All are
   16-byte aligned so they can be loaded with movaps/movapd. */
typedef struct { float c1,c2,c3,c4; } _sse_float ALIGN16;
typedef struct { _sse_float c1,c2,c3;} _sse_vector ALIGN16;
typedef struct { int c1,c2,c3,c4;} _sse_int ALIGN16;
typedef struct { double c1,c2; } _sse_double ALIGN16;

/* SU(3) matrix, colour vector and Dirac spinor built from mdp_complex
   (declared elsewhere in this project). */
typedef struct {mdp_complex c11,c12,c13,c21,c22,c23,c31,c32,c33; } _sse_su3;
typedef struct {mdp_complex c1,c2,c3; } _sse_su3_vector;
typedef struct {_sse_su3_vector c1,c2,c3,c4; } _sse_spinor;
00024
/* Sign masks for flipping selected SIMD lanes.
   _sse_float_sgnXY negates float lanes X and Y (1-based) when used with
   mulps.  _sse_double_sgn / _sse_double_sgn2 carry only the sign bit of the
   first / second packed double, for use with xorpd (x86 is little-endian,
   so the 0x80000000 word is the high half of the corresponding double).
   NOTE(review): 0x80000000 does not fit in a signed int — GCC accepts this
   initializer, but the conversion is formally implementation-defined;
   an unsigned element type would be cleaner. */
static _sse_float _sse_float_sgn12 __attribute__ ((unused)) = {-1.0f,-1.0f,1.0f,1.0f};
static _sse_float _sse_float_sgn13 __attribute__ ((unused)) = {-1.0f,1.0f,-1.0f,1.0f};
static _sse_float _sse_float_sgn14 __attribute__ ((unused)) = {-1.0f,1.0f,1.0f,-1.0f};
static _sse_float _sse_float_sgn23 __attribute__ ((unused)) = {1.0f,-1.0f,-1.0f,1.0f};
static _sse_float _sse_float_sgn24 __attribute__ ((unused)) = {1.0f,-1.0f,1.0f,-1.0f};
static _sse_float _sse_float_sgn34 __attribute__ ((unused)) = {1.0f,1.0f,-1.0f,-1.0f};
static _sse_int _sse_double_sgn __attribute__ ((unused)) = {0x0,0x80000000,0x0,0x0};
static _sse_int _sse_double_sgn2 __attribute__ ((unused)) = {0x0,0x0,0x0,0x80000000};
00033
00034
00035
00036
00037
00038
/* Prefetch (prefetcht0, all cache levels) the two 128-byte regions covering
   a spinor at `addr`, after masking the address down to a 128-byte boundary.
   FIX: the mask cast must go through `unsigned long`, not `unsigned int` —
   on 64-bit (LP64) targets a 32-bit cast truncates the pointer and the
   prefetch hint targets the wrong address. */
#define _sse_float_prefetch_spinor(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f)))), \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f))+128)))
00046
/* Prefetch (prefetcht0) the two 128-byte regions covering an SU(3) matrix
   at `addr`, masked to a 128-byte boundary.
   FIX: cast through `unsigned long` instead of `unsigned int` so the
   pointer is not truncated on 64-bit platforms. */
#define _sse_float_prefetch_su3(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f)))), \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f))+128)))
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
/* Load a pair of su3 vectors into xmm0-2: component k of `sl` goes to the
   low 64 bits (one complex float) and component k of `sh` to the high
   64 bits of xmm(k-1), so each register holds the same colour component of
   two lattice sites. */
#define _sse_float_pair_load(sl,sh) \
_ASM ("movlps %0, %%xmm0 \n\t" \
      "movlps %1, %%xmm1 \n\t" \
      "movlps %2, %%xmm2 \n\t" \
      "movhps %3, %%xmm0 \n\t" \
      "movhps %4, %%xmm1 \n\t" \
      "movhps %5, %%xmm2 " \
      : \
      : \
      "m" ((sl).c1), \
      "m" ((sl).c2), \
      "m" ((sl).c3), \
      "m" ((sh).c1), \
      "m" ((sh).c2), \
      "m" ((sh).c3))
00103
00104
00105
00106
00107
/* Same as _sse_float_pair_load, but targets the "up" register set
   xmm3-5 (used as the second operand of add/sub style macros below). */
#define _sse_float_pair_load_up(sl,sh) \
_ASM ("movlps %0, %%xmm3 \n\t" \
      "movlps %1, %%xmm4 \n\t" \
      "movlps %2, %%xmm5 \n\t" \
      "movhps %3, %%xmm3 \n\t" \
      "movhps %4, %%xmm4 \n\t" \
      "movhps %5, %%xmm5" \
      : \
      : \
      "m" ((sl).c1), \
      "m" ((sl).c2), \
      "m" ((sl).c3), \
      "m" ((sh).c1), \
      "m" ((sh).c2), \
      "m" ((sh).c3))
00123
00124
00125
00126
00127
00128
/* Store xmm0-2 back into two su3 vectors: low halves to `rl`, high halves
   to `rh` (inverse of _sse_float_pair_load). */
#define _sse_float_pair_store(rl,rh) \
_ASM ("movlps %%xmm0, %0 \n\t" \
      "movlps %%xmm1, %1 \n\t" \
      "movlps %%xmm2, %2 \n\t" \
      "movhps %%xmm0, %3 \n\t" \
      "movhps %%xmm1, %4 \n\t" \
      "movhps %%xmm2, %5" \
      : \
      "=m" ((rl).c1), \
      "=m" ((rl).c2), \
      "=m" ((rl).c3), \
      "=m" ((rh).c1), \
      "=m" ((rh).c2), \
      "=m" ((rh).c3))
00143
00144
00145
00146
00147
/* Store the "up" registers xmm3-5 into two su3 vectors: low halves to `rl`,
   high halves to `rh`. */
#define _sse_float_pair_store_up(rl,rh) \
_ASM ("movlps %%xmm3, %0 \n\t" \
      "movlps %%xmm4, %1 \n\t" \
      "movlps %%xmm5, %2 \n\t" \
      "movhps %%xmm3, %3 \n\t" \
      "movhps %%xmm4, %4 \n\t" \
      "movhps %%xmm5, %5" \
      : \
      "=m" ((rl).c1), \
      "=m" ((rl).c2), \
      "=m" ((rl).c3), \
      "=m" ((rh).c1), \
      "=m" ((rh).c2), \
      "=m" ((rh).c3))
00162
00163
00164
00165
00166
/* Load an aligned _sse_vector (3 x 4 floats) into xmm0-2.
   movaps requires (s) to be 16-byte aligned. */
#define _sse_float_vector_load(s) \
_ASM ("movaps %0, %%xmm0 \n\t" \
      "movaps %1, %%xmm1 \n\t" \
      "movaps %2, %%xmm2" \
      : \
      : \
      "m" ((s).c1), \
      "m" ((s).c2), \
      "m" ((s).c3))
00176
/* Load an aligned _sse_vector into the "up" registers xmm3-5. */
#define _sse_float_vector_load_up(s) \
_ASM ("movaps %0, %%xmm3 \n\t" \
      "movaps %1, %%xmm4 \n\t" \
      "movaps %2, %%xmm5" \
      : \
      : \
      "m" ((s).c1), \
      "m" ((s).c2), \
      "m" ((s).c3))
00186
00187
00188
00189
/* Store xmm0-2 into an aligned _sse_vector. */
#define _sse_float_vector_store(r) \
_ASM ("movaps %%xmm0, %0 \n\t" \
      "movaps %%xmm1, %1 \n\t" \
      "movaps %%xmm2, %2" \
      : \
      "=m" ((r).c1), \
      "=m" ((r).c2), \
      "=m" ((r).c3))
00198
00199
00200
00201
00202
/* Multiply xmm0-2 lane-wise by a packed-float constant c (e.g. one of the
   _sse_float_sgn* masks, or a broadcast scalar). */
#define _sse_float_vector_mul(c) \
_ASM ("mulps %0, %%xmm0 \n\t" \
      "mulps %0, %%xmm1 \n\t" \
      "mulps %0, %%xmm2" \
      : \
      : \
      "m" (c))
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
/* xmm0-2 += xmm3-5 (packed single).  The SSE2FIX variant uses single-%
   register syntax for toolchains whose asm strings are not re-scanned for
   %% escapes — presumably an old compiler workaround; confirm before
   removing. */
#ifdef SSE2FIX
#define _sse_float_vector_add() \
_ASM ("addps %xmm3, %xmm0 \n\t" \
      "addps %xmm4, %xmm1 \n\t" \
      "addps %xmm5, %xmm2 \n\t" \
      : \
      : )
#else
#define _sse_float_vector_add() \
_ASM ("addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2 \n\t" \
      : \
      : )
#endif
00262
00263
00264
00265
00266
00267
/* xmm0-2 -= xmm3-5 (packed single).  See _sse_float_vector_add for the
   SSE2FIX variant rationale. */
#ifdef SSE2FIX
#define _sse_float_vector_sub() \
_ASM ("subps %xmm3, %xmm0 \n\t" \
      "subps %xmm4, %xmm1 \n\t" \
      "subps %xmm5, %xmm2" \
      : \
      :)
#else
#define _sse_float_vector_sub() \
_ASM ("subps %%xmm3, %%xmm0 \n\t" \
      "subps %%xmm4, %%xmm1 \n\t" \
      "subps %%xmm5, %%xmm2" \
      : \
      :)
#endif
00283
00284
00285
00286
00287
/* xmm0-2 += sgn34 * xmm3-5: adds the low complex (lanes 1,2) and subtracts
   the high complex (lanes 3,4) of each register pair.  Clobbers xmm3-5. */
#define _sse_float_vector_addsub() \
_ASM ("mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn34))
00298
00299
00300
00301
00302
00303
00304
00305
00306
/* xmm3-5 := u * (xmm0-2): multiply SU(3) matrix u by the colour vector(s)
   held in xmm0-2 (in float mode each register carries the same colour
   component of two sites, so two multiplies happen at once).
   Phase 1-2: accumulate the real parts of the matrix elements (broadcast
   with shufps $0x0).  Phase 3-4: swap re/im in xmm0-2 (shufps $0xb1), apply
   the sgn13 mask (conjugation trick for complex multiply), and accumulate
   the imaginary parts.  Uses all eight xmm registers; xmm0-2 are destroyed.
   Intermediate register state flows across the four _ASM statements, so the
   statement order must not change. */
#define _sse_float_su3_multiply(u) { \
_ASM ("movss %0, %%xmm3 \n\t" \
      "movss %1, %%xmm6 \n\t" \
      "movss %2, %%xmm4 \n\t" \
      "movss %3, %%xmm7 \n\t" \
      "movss %4, %%xmm5 " \
      : \
      : \
      "m" ((u).c11.real()), \
      "m" ((u).c12.real()), \
      "m" ((u).c21.real()), \
      "m" ((u).c23.real()), \
      "m" ((u).c31.real())); \
_ASM ("shufps $0x0, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm4, %%xmm4 \n\t" \
      "mulps %%xmm0, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm5, %%xmm5 \n\t" \
      "mulps %%xmm0, %%xmm4 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "mulps %%xmm0, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %0, %%xmm6 \n\t" \
      "movss %1, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm3 \n\t" \
      "movss %2, %%xmm6 \n\t" \
      "movss %3, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm4 \n\t" \
      "addps %%xmm7, %%xmm5" \
      : \
      : \
      "m" ((u).c32.real()), \
      "m" ((u).c13.real()), \
      "m" ((u).c22.real()), \
      "m" ((u).c33.real())); \
_ASM ("movss %0, %%xmm6 \n\t" \
      "movss %1, %%xmm7 \n\t" \
      "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \
      "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
      "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %4, %%xmm0 \n\t" \
      "mulps %4, %%xmm1 \n\t" \
      "mulps %4, %%xmm2 \n\t" \
      "mulps %%xmm0, %%xmm6 \n\t" \
      "mulps %%xmm1, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %2, %%xmm6 \n\t" \
      "movss %3, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 " \
      : \
      : \
      "m" ((u).c11.imag()), \
      "m" ((u).c22.imag()), \
      "m" ((u).c33.imag()), \
      "m" ((u).c21.imag()), \
      "m" (_sse_float_sgn13)); \
_ASM ("movss %0, %%xmm6 \n\t" \
      "movss %1, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm5 \n\t" \
      "movss %2, %%xmm0 \n\t" \
      "movss %3, %%xmm6 \n\t" \
      "movss %4, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm0, %%xmm0 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm0 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm0, %%xmm3 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4" \
      : \
      : \
      "m" ((u).c12.imag()), \
      "m" ((u).c31.imag()), \
      "m" ((u).c13.imag()), \
      "m" ((u).c32.imag()), \
      "m" ((u).c23.imag())); }
00410
00411
00412
00413
00414
00415
00416
00417
00418
/* xmm3-5 := u^dagger * (xmm0-2): multiply by the hermitian conjugate of
   SU(3) matrix u.  Structure mirrors _sse_float_su3_multiply, but the
   matrix elements are read transposed and the sgn24 mask conjugates them.
   Uses all eight xmm registers; xmm0-2 are destroyed.  Statement order
   across the _ASM blocks is significant. */
#define _sse_float_su3_inverse_multiply(u) { \
_ASM ("movss %0, %%xmm3 \n\t" \
      "movss %1, %%xmm6 \n\t" \
      "movss %2, %%xmm4 \n\t" \
      "movss %3, %%xmm7 \n\t" \
      "movss %4, %%xmm5 " \
      : \
      : \
      "m" ((u).c11.real()), \
      "m" ((u).c21.real()), \
      "m" ((u).c12.real()), \
      "m" ((u).c32.real()), \
      "m" ((u).c13.real())); \
_ASM ("shufps $0x0, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm4, %%xmm4 \n\t" \
      "mulps %%xmm0, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm5, %%xmm5 \n\t" \
      "mulps %%xmm0, %%xmm4 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "mulps %%xmm0, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %0, %%xmm6 \n\t" \
      "movss %1, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm3 \n\t" \
      "movss %2, %%xmm6 \n\t" \
      "movss %3, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm4 \n\t" \
      "addps %%xmm7, %%xmm5 " \
      : \
      : \
      "m" ((u).c23.real()), \
      "m" ((u).c31.real()), \
      "m" ((u).c22.real()), \
      "m" ((u).c33.real())); \
_ASM ("movss %0, %%xmm6 \n\t" \
      "movss %1, %%xmm7 \n\t" \
      "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \
      "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
      "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %4, %%xmm0 \n\t" \
      "mulps %4, %%xmm1 \n\t" \
      "mulps %4, %%xmm2 \n\t" \
      "mulps %%xmm0, %%xmm6 \n\t" \
      "mulps %%xmm1, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %2, %%xmm6 \n\t" \
      "movss %3, %%xmm7 " \
      : \
      : \
      "m" ((u).c11.imag()), \
      "m" ((u).c22.imag()), \
      "m" ((u).c33.imag()), \
      "m" ((u).c12.imag()), \
      "m" (_sse_float_sgn24)); \
_ASM ("shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %0, %%xmm6 \n\t" \
      "movss %1, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm5 \n\t" \
      "movss %2, %%xmm0 \n\t" \
      "movss %3, %%xmm6 \n\t" \
      "movss %4, %%xmm7 \n\t" \
      "shufps $0x0, %%xmm0, %%xmm0 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm0 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm0, %%xmm3 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4" \
      : \
      : \
      "m" ((u).c21.imag()), \
      "m" ((u).c13.imag()), \
      "m" ((u).c31.imag()), \
      "m" ((u).c23.imag()), \
      "m" ((u).c32.imag())); }
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
/* xmm0-2 += sgn12 * xmm3-5: subtracts the low complex and adds the high
   complex of each register pair.  Clobbers xmm3-5. */
#define _sse_float_vector_subadd() \
_ASM ("mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn12))
00545
00546
00547
00548
00549
/* xmm0-2 += i * xmm3-5: shufps $0xb1 swaps re/im within each complex, then
   sgn13 negates the new real lanes, giving (-im, re) = i*(re + i im).
   Clobbers xmm3-5. */
#define _sse_float_vector_i_add() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn13))
00563
00564
00565
00566
00567
/* xmm0-2 -= i * xmm3-5: re/im swap followed by the sgn24 mask yields
   (im, -re) = -i*(re + i im), which is then added.  Clobbers xmm3-5. */
#define _sse_float_vector_i_sub() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn24))
00581
00582
00583
00584
00585
00586
/* xmm0-2 += i * exchanged(xmm3-5): shufps $0x1b reverses all four lanes
   (exchanging the two complexes AND swapping re/im), then sgn13 completes
   the multiply-by-i.  Clobbers xmm3-5. */
#define _sse_float_vector_xch_i_add() \
_ASM ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn13))
00600
00601
00602
00603
00604
00605
/* xmm0-2 -= i * exchanged(xmm3-5): lane reversal (shufps $0x1b) plus the
   sgn24 mask gives -i times the exchanged vector.  Clobbers xmm3-5. */
#define _sse_float_vector_xch_i_sub() \
_ASM ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn24))
00619
00620
00621
00622
00623
00624
/* xmm0-2 += (+i * low complex, -i * high complex) of xmm3-5: re/im swap plus
   the sgn14 mask.  Clobbers xmm3-5. */
#define _sse_float_vector_i_addsub() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn14))
00638
00639
00640
00641
00642
00643
/* xmm0-2 += (-i * low complex, +i * high complex) of xmm3-5: re/im swap plus
   the sgn23 mask.  Clobbers xmm3-5. */
#define _sse_float_vector_i_subadd() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t" \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : \
      : \
      "m" (_sse_float_sgn23))
00657
00658
00659
00660
00661
/* Exchange the two complex numbers within each of xmm3-5 (shufps $0x4e
   swaps the 64-bit halves).  See _sse_float_vector_add for the SSE2FIX
   variant rationale. */
#ifdef SSE2FIX
#define _sse_float_vector_xch() \
_ASM ("shufps $0x4e, %xmm3, %xmm3 \n\t" \
      "shufps $0x4e, %xmm4, %xmm4 \n\t" \
      "shufps $0x4e, %xmm5, %xmm5" \
      : \
      :)
#else
#define _sse_float_vector_xch() \
_ASM ("shufps $0x4e, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x4e, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x4e, %%xmm5, %%xmm5" \
      : \
      :)
#endif
00677
00678
00679
00680
00681
00682
00683
/* Prefetch (prefetcht0) the cache line containing *(addr). */
#define _sse_double_prefetch_16(addr) \
_ASM ("prefetcht0 %0" \
      : \
      : "m" (*(addr)))
00688
/* Prefetch (prefetcht0) the two 128-byte regions covering a double-precision
   spinor at `addr`, masked to a 128-byte boundary.
   FIX: cast through `unsigned long` instead of `unsigned int` — on 64-bit
   (LP64) targets a 32-bit cast truncates the pointer and the prefetch hint
   targets the wrong address. */
#define _sse_double_prefetch_spinor(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f)))), \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f))+128)))
00696
/* Non-temporal prefetch (prefetchnta — bypasses lower cache levels) of the
   two 128-byte regions covering a spinor at `addr`.
   FIX: cast through `unsigned long` instead of `unsigned int` so the
   pointer is not truncated on 64-bit platforms. */
#define _sse_double_prefetch_nta_spinor(addr) \
_ASM ("prefetchnta %0 \n\t" \
      "prefetchnta %1" \
      : \
      : \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f)))), \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f))+128)))
00704
/* Prefetch (prefetcht0) the two 128-byte regions covering a double-precision
   SU(3) matrix at `addr`, masked to a 128-byte boundary.
   FIX: cast through `unsigned long` instead of `unsigned int` so the
   pointer is not truncated on 64-bit platforms. */
#define _sse_double_prefetch_su3(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f)))), \
      "m" (*(((char*)(((unsigned long)(addr))&~0x7f))+128)))
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
/* Load three aligned complex doubles into xmm0-2, either from an su3 vector
   (s.c1..c3) or from three explicitly named components. */
#define _sse_double_load(s) \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2" \
      : \
      : \
      "m" ((s).c1), \
      "m" ((s).c2), \
      "m" ((s).c3))

#define _sse_double_load_123(c1, c2, c3) \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2" \
      : \
      : \
      "m" (c1), \
      "m" (c2), \
      "m" (c3))
00743
00744
00745
00746
00747
00748
/* Load three aligned complex doubles into the "up" registers xmm3-5,
   from an su3 vector or three named components. */
#define _sse_double_load_up(s) \
_ASM ("movapd %0, %%xmm3 \n\t" \
      "movapd %1, %%xmm4 \n\t" \
      "movapd %2, %%xmm5" \
      : \
      : \
      "m" ((s).c1), \
      "m" ((s).c2), \
      "m" ((s).c3))

#define _sse_double_load_up_123(c1, c2, c3) \
_ASM ("movapd %0, %%xmm3 \n\t" \
      "movapd %1, %%xmm4 \n\t" \
      "movapd %2, %%xmm5" \
      : \
      : \
      "m" (c1), \
      "m" (c2), \
      "m" (c3))
00768
00769
00770
00771
00772
/* Store xmm0-2 into three aligned complex doubles (su3 vector or named
   components). */
#define _sse_double_store(r) \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2" \
      : \
      "=m" ((r).c1), \
      "=m" ((r).c2), \
      "=m" ((r).c3))

#define _sse_double_store_123(c1, c2, c3) \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2" \
      : \
      "=m" (c1), \
      "=m" (c2), \
      "=m" (c3))
00790
00791
00792
00793
00794
/* Store the "up" registers xmm3-5 into three aligned complex doubles. */
#define _sse_double_store_up(r) \
_ASM ("movapd %%xmm3, %0 \n\t" \
      "movapd %%xmm4, %1 \n\t" \
      "movapd %%xmm5, %2" \
      : \
      "=m" ((r).c1), \
      "=m" ((r).c2), \
      "=m" ((r).c3))

#define _sse_double_store_up_123(c1, c2, c3) \
_ASM ("movapd %%xmm3, %0 \n\t" \
      "movapd %%xmm4, %1 \n\t" \
      "movapd %%xmm5, %2" \
      : \
      "=m" (c1), \
      "=m" (c2), \
      "=m" (c3))
00812
00813
00814
00815
00816
/* Multiply xmm0-2 lane-wise by a packed-double constant c. */
#define _sse_double_vector_mul(c) \
_ASM ("mulpd %0, %%xmm0 \n\t" \
      "mulpd %0, %%xmm1 \n\t" \
      "mulpd %0, %%xmm2" \
      : \
      : \
      "m" (c))
00824
00825
00826
00827
00828
00829
00830
00831
00832
00833
00834
00835
00836
00837
00838
00839
00840
00841
00842
00843
00844
00845
00846
00847
00848
00849
00850
00851
00852
00853
00854
00855
00856
00857
/* Complex scalar multiply: xmm3-5 := (x + i*y) * (xmm0-2).
   The imaginary contribution is built by y*z, swapping re/im (shufpd $0x1)
   and flipping the sign of the new real lane with _sse_double_sgn; the real
   contribution x*z is then added.  Result lands in xmm3-5; xmm0-2 are
   destroyed.
   NOTE(review): x and y appear to be packed doubles holding the scalar's
   real/imag part duplicated in both lanes — confirm at call sites. */
#define _sse_double_vector_mul_complex(x,y) \
_ASM ("movapd %%xmm0, %%xmm3 \n\t" \
      "movapd %%xmm1, %%xmm4 \n\t" \
      "movapd %%xmm2, %%xmm5 \n\t" \
      "mulpd %1, %%xmm3 \n\t" \
      "mulpd %1, %%xmm4 \n\t" \
      "mulpd %1, %%xmm5 \n\t" \
      "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
      "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
      "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
      "xorpd %2, %%xmm3 \n\t" \
      "xorpd %2, %%xmm4 \n\t" \
      "xorpd %2, %%xmm5 \n\t" \
      "mulpd %0, %%xmm0 \n\t" \
      "mulpd %0, %%xmm1 \n\t" \
      "mulpd %0, %%xmm2 \n\t" \
      "addpd %%xmm0, %%xmm3 \n\t" \
      "addpd %%xmm1, %%xmm4 \n\t" \
      "addpd %%xmm2, %%xmm5" \
      : \
      : \
      "m" (x), \
      "m" (y), \
      "m" (_sse_double_sgn))
00882
00883
00884
00885
00886
/* xmm0-2 += xmm3-5 (packed double).  See _sse_float_vector_add for the
   SSE2FIX variant rationale. */
#ifdef SSE2FIX
#define _sse_double_vector_add() \
_ASM ("addpd %xmm3, %xmm0 \n\t" \
      "addpd %xmm4, %xmm1 \n\t" \
      "addpd %xmm5, %xmm2" \
      : \
      :)
#else
#define _sse_double_vector_add() \
_ASM ("addpd %%xmm3, %%xmm0 \n\t" \
      "addpd %%xmm4, %%xmm1 \n\t" \
      "addpd %%xmm5, %%xmm2" \
      : \
      :)
#endif
00902
00903
00904
00905
00906
/* xmm0-2 -= xmm3-5 (packed double).  See _sse_float_vector_add for the
   SSE2FIX variant rationale. */
#ifdef SSE2FIX
#define _sse_double_vector_sub() \
_ASM ("subpd %xmm3, %xmm0 \n\t" \
      "subpd %xmm4, %xmm1 \n\t" \
      "subpd %xmm5, %xmm2" \
      : \
      :)
#else
#define _sse_double_vector_sub() \
_ASM ("subpd %%xmm3, %%xmm0 \n\t" \
      "subpd %%xmm4, %%xmm1 \n\t" \
      "subpd %%xmm5, %%xmm2" \
      : \
      :)
#endif
00922
00923
00924
00925
00926
00927
00928
00929
/* xmm3-5 := u * (xmm0-2): multiply SU(3) matrix u by the complex-double
   colour vector in xmm0-2.  Double-precision analogue of
   _sse_float_su3_multiply: real parts are broadcast with unpcklpd and
   accumulated first; then xmm0-2 are re/im-swapped (shufpd $0x1),
   sign-adjusted with _sse_double_sgn (xorpd) and the imaginary parts are
   accumulated.  Uses all eight xmm registers; xmm0-2 are destroyed.
   Register state flows across the four _ASM statements — keep the order. */
#define _sse_double_su3_multiply(u) { \
_ASM ("movsd %0, %%xmm3 \n\t" \
      "movsd %1, %%xmm6 \n\t" \
      "movsd %2, %%xmm4 \n\t" \
      "movsd %3, %%xmm7 \n\t" \
      "movsd %4, %%xmm5 " \
      : \
      : \
      "m" ((u).c11.real()), \
      "m" ((u).c12.real()), \
      "m" ((u).c21.real()), \
      "m" ((u).c23.real()), \
      "m" ((u).c31.real())); \
_ASM ("unpcklpd %%xmm3, %%xmm3 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm4, %%xmm4 \n\t" \
      "mulpd %%xmm0, %%xmm3 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "unpcklpd %%xmm5, %%xmm5 \n\t" \
      "mulpd %%xmm0, %%xmm4 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "mulpd %%xmm0, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %0, %%xmm6 \n\t" \
      "movsd %1, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm3 \n\t" \
      "movsd %2, %%xmm6 \n\t" \
      "movsd %3, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm4 \n\t" \
      "addpd %%xmm7, %%xmm5 " \
      : \
      : \
      "m" ((u).c32.real()), \
      "m" ((u).c13.real()), \
      "m" ((u).c22.real()), \
      "m" ((u).c33.real())); \
_ASM ("movsd %0, %%xmm6 \n\t" \
      "movsd %1, %%xmm7 \n\t" \
      "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
      "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
      "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "xorpd %4, %%xmm0 \n\t" \
      "xorpd %4, %%xmm1 \n\t" \
      "xorpd %4, %%xmm2 \n\t" \
      "mulpd %%xmm0, %%xmm6 \n\t" \
      "mulpd %%xmm1, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %2, %%xmm6 \n\t" \
      "movsd %3, %%xmm7 " \
      : \
      : \
      "m" ((u).c11.imag()), \
      "m" ((u).c22.imag()), \
      "m" ((u).c33.imag()), \
      "m" ((u).c21.imag()), \
      "m" (_sse_double_sgn)); \
_ASM ("unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm2, %%xmm6 \n\t" \
      "mulpd %%xmm0, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %0, %%xmm6 \n\t" \
      "movsd %1, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm0, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "addpd %%xmm7, %%xmm5 \n\t" \
      "movsd %2, %%xmm0 \n\t" \
      "movsd %3, %%xmm6 \n\t" \
      "movsd %4, %%xmm7 \n\t" \
      "unpcklpd %%xmm0, %%xmm0 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm2, %%xmm0 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm0, %%xmm3 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 " \
      : \
      : \
      "m" ((u).c12.imag()), \
      "m" ((u).c31.imag()), \
      "m" ((u).c13.imag()), \
      "m" ((u).c32.imag()), \
      "m" ((u).c23.imag())); }
01033
01034
01035
01036
01037
01038
01039
01040
01041
/* xmm3-5 := u^dagger * (xmm0-2): multiply by the hermitian conjugate of
   SU(3) matrix u (double precision).  Mirrors _sse_double_su3_multiply with
   transposed element access; note the xorpd sign flip is applied *before*
   the re/im swap here, which conjugates the matrix elements.  Uses all
   eight xmm registers; xmm0-2 are destroyed.  Statement order across the
   _ASM blocks is significant. */
#define _sse_double_su3_inverse_multiply(u) { \
_ASM ("movsd %0, %%xmm3 \n\t" \
      "movsd %1, %%xmm6 \n\t" \
      "movsd %2, %%xmm4 \n\t" \
      "movsd %3, %%xmm7 \n\t" \
      "movsd %4, %%xmm5 " \
      : \
      : \
      "m" ((u).c11.real()), \
      "m" ((u).c21.real()), \
      "m" ((u).c12.real()), \
      "m" ((u).c32.real()), \
      "m" ((u).c13.real())); \
_ASM ("unpcklpd %%xmm3, %%xmm3 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm4, %%xmm4 \n\t" \
      "mulpd %%xmm0, %%xmm3 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "unpcklpd %%xmm5, %%xmm5 \n\t" \
      "mulpd %%xmm0, %%xmm4 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "mulpd %%xmm0, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %0, %%xmm6 \n\t" \
      "movsd %1, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm3 \n\t" \
      "movsd %2, %%xmm6 \n\t" \
      "movsd %3, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm4 \n\t" \
      "addpd %%xmm7, %%xmm5" \
      : \
      : \
      "m" ((u).c23.real()), \
      "m" ((u).c31.real()), \
      "m" ((u).c22.real()), \
      "m" ((u).c33.real())); \
_ASM ("movsd %0, %%xmm6 \n\t" \
      "movsd %1, %%xmm7 \n\t" \
      "xorpd %4, %%xmm0 \n\t" \
      "xorpd %4, %%xmm1 \n\t" \
      "xorpd %4, %%xmm2 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
      "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
      "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
      "mulpd %%xmm0, %%xmm6 \n\t" \
      "mulpd %%xmm1, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %2, %%xmm6 \n\t" \
      "movsd %3, %%xmm7 " \
      : \
      : \
      "m" ((u).c11.imag()), \
      "m" ((u).c22.imag()), \
      "m" ((u).c33.imag()), \
      "m" ((u).c12.imag()), \
      "m" (_sse_double_sgn)); \
_ASM ("unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm2, %%xmm6 \n\t" \
      "mulpd %%xmm0, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %0, %%xmm6 \n\t" \
      "movsd %1, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm0, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "addpd %%xmm7, %%xmm5 \n\t" \
      "movsd %2, %%xmm0 \n\t" \
      "movsd %3, %%xmm6 \n\t" \
      "movsd %4, %%xmm7 \n\t" \
      "unpcklpd %%xmm0, %%xmm0 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm2, %%xmm0 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm0, %%xmm3 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 " \
      : \
      : \
      "m" ((u).c21.imag()), \
      "m" ((u).c13.imag()), \
      "m" ((u).c31.imag()), \
      "m" ((u).c23.imag()), \
      "m" ((u).c32.imag())); }
01145
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
/* xmm3-5 := i * xmm3-5: swap re/im (shufpd $0x1), then flip the sign bit of
   the low (real) lane with _sse_double_sgn: (a,b) -> (b,a) -> (-b,a). */
#define _sse_double_vector_i_mul() \
_ASM ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
      "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
      "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
      "xorpd %0, %%xmm3 \n\t" \
      "xorpd %0, %%xmm4 \n\t" \
      "xorpd %0, %%xmm5" \
      : \
      : \
      "m" (_sse_double_sgn))
01166
01167
01168
01169
01170
/* xmm3-5 := -i * xmm3-5: flip the sign of the real lane first, then swap
   re/im: (a,b) -> (-a,b) -> (b,-a). */
#define _sse_double_vector_minus_i_mul() \
_ASM ("xorpd %0, %%xmm3 \n\t" \
      "xorpd %0, %%xmm4 \n\t" \
      "xorpd %0, %%xmm5 \n\t" \
      "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
      "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
      "shufpd $0x1, %%xmm5, %%xmm5" \
      : \
      : \
      "m" (_sse_double_sgn))
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
/* Accumulate the squared norm of 16 doubles into the packed-double pair c:
   r points to 8 aligned _sse_double pairs; each lane of c receives the sum
   of squares of the corresponding lanes of r[0..7] (caller reduces the two
   lanes).  xmm0-7 are clobbered.
   FIX: the final asm both reads c ("movapd %0, %%xmm1") and writes it, so
   the operand must be read-write "+m", not write-only "=m" — with "=m" the
   compiler may treat c's previous value as dead. */
#define _sse_double_add_norm_square_16(r,c) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3" \
      : \
      : \
      "m" (*((r))), \
      "m" (*((r)+1)), \
      "m" (*((r)+2)), \
      "m" (*((r)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7 \n\t" \
      "mulpd %%xmm0, %%xmm0 \n\t" \
      "mulpd %%xmm1, %%xmm1 \n\t" \
      "mulpd %%xmm2, %%xmm2 \n\t" \
      "mulpd %%xmm3, %%xmm3 \n\t" \
      "mulpd %%xmm4, %%xmm4 \n\t" \
      "mulpd %%xmm5, %%xmm5 \n\t" \
      "mulpd %%xmm6, %%xmm6 \n\t" \
      "mulpd %%xmm7, %%xmm7 \n\t" \
      "addpd %%xmm0, %%xmm1 \n\t" \
      "addpd %%xmm2, %%xmm3 \n\t" \
      "addpd %%xmm4, %%xmm5 \n\t" \
      "addpd %%xmm6, %%xmm7 \n\t" \
      "addpd %%xmm1, %%xmm3 \n\t" \
      "addpd %%xmm5, %%xmm7 \n\t" \
      "addpd %%xmm3, %%xmm7" \
      : \
      : \
      "m" (*((r)+4)), \
      "m" (*((r)+5)), \
      "m" (*((r)+6)), \
      "m" (*((r)+7))); \
_ASM ("movapd %0, %%xmm1 \n\t" \
      "addpd %%xmm1, %%xmm7 \n\t" \
      "movapd %%xmm7, %0" \
      : \
      "+m" (c)); }
01234
01235
01236
01237
01238
/* Accumulate the lane-wise product of two blocks of 16 doubles into the
   packed-double pair c: c += sum_k r[k]*s[k] per lane, for 8 aligned
   _sse_double pairs at r and s.  xmm0-7 are clobbered.
   FIX: the final asm reads c ("movapd %0, %%xmm1") before writing it back,
   so the operand must be "+m" (read-write) rather than "=m" (write-only). */
#define _sse_double_add_real_scalar_product_16(r,s,c) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3 \n\t" \
      : \
      : \
      "m" (*((r))), \
      "m" (*((r)+1)), \
      "m" (*((r)+2)), \
      "m" (*((r)+3))); \
_ASM ("mulpd %0, %%xmm0 \n\t" \
      "mulpd %1, %%xmm1 \n\t" \
      "mulpd %2, %%xmm2 \n\t" \
      "mulpd %3, %%xmm3 \n\t" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+1)), \
      "m" (*((s)+2)), \
      "m" (*((s)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7 \n\t" \
      : \
      : \
      "m" (*((r)+4)), \
      "m" (*((r)+5)), \
      "m" (*((r)+6)), \
      "m" (*((r)+7))); \
_ASM ("mulpd %0, %%xmm4 \n\t" \
      "mulpd %1, %%xmm5 \n\t" \
      "mulpd %2, %%xmm6 \n\t" \
      "mulpd %3, %%xmm7 \n\t" \
      : \
      : \
      "m" (*((s)+4)), \
      "m" (*((s)+5)), \
      "m" (*((s)+6)), \
      "m" (*((s)+7))); \
_ASM ("addpd %%xmm0, %%xmm1 \n\t" \
      "addpd %%xmm2, %%xmm3 \n\t" \
      "addpd %%xmm4, %%xmm5 \n\t" \
      "addpd %%xmm6, %%xmm7 \n\t" \
      "addpd %%xmm1, %%xmm3 \n\t" \
      "addpd %%xmm5, %%xmm7 \n\t" \
      "addpd %%xmm3, %%xmm7 \n\t" \
      "movapd %0, %%xmm1 \n\t" \
      "addpd %%xmm1, %%xmm7 \n\t" \
      "movapd %%xmm7, %0 \n\t" \
      : \
      "+m" (c)); }
01292
/* _sse_double_add_imag_scalar_product_16(r,s,c):
   like the real variant, but each r pair is swapped first
   (shufpd $0x1: (a,b) -> (b,a)) before the packed multiply by s, so
   lane 0 of c accumulates sum_i r[i].c2*s[i].c1 and lane 1 accumulates
   sum_i r[i].c1*s[i].c2; presumably the caller combines the two lanes
   into the imaginary part of the scalar product -- verify against
   callers.  Registers xmm0-xmm7 must survive between the separate
   _ASM statements (legacy no-clobber style used throughout this file).
   Fix: c is both read and written by the final asm, so it must use the
   read-write "+m" constraint instead of the original write-only "=m". */
#define _sse_double_add_imag_scalar_product_16(r,s,c) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t" \
"movapd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((r))), \
"m" (*((r)+1)), \
"m" (*((r)+2)), \
"m" (*((r)+3))); \
_ASM ("shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
"shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
"shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
"shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
"mulpd %0, %%xmm0 \n\t" \
"mulpd %1, %%xmm1 \n\t" \
"mulpd %2, %%xmm2 \n\t" \
"mulpd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((s))), \
"m" (*((s)+1)), \
"m" (*((s)+2)), \
"m" (*((s)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
"movapd %1, %%xmm5 \n\t" \
"movapd %2, %%xmm6 \n\t" \
"movapd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((r)+4)), \
"m" (*((r)+5)), \
"m" (*((r)+6)), \
"m" (*((r)+7))); \
_ASM ("shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
"shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
"shufpd $0x1, %%xmm6, %%xmm6 \n\t" \
"shufpd $0x1, %%xmm7, %%xmm7 \n\t" \
"mulpd %0, %%xmm4 \n\t" \
"mulpd %1, %%xmm5 \n\t" \
"mulpd %2, %%xmm6 \n\t" \
"mulpd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((s)+4)), \
"m" (*((s)+5)), \
"m" (*((s)+6)), \
"m" (*((s)+7))); \
_ASM ("addpd %%xmm0, %%xmm1 \n\t" \
"addpd %%xmm2, %%xmm3 \n\t" \
"addpd %%xmm4, %%xmm5 \n\t" \
"addpd %%xmm6, %%xmm7 \n\t" \
"addpd %%xmm1, %%xmm3 \n\t" \
"addpd %%xmm5, %%xmm7 \n\t" \
"addpd %%xmm3, %%xmm7 \n\t" \
"movapd %0, %%xmm1 \n\t" \
"addpd %%xmm1, %%xmm7 \n\t" \
"movapd %%xmm7, %0 \n\t" \
: \
"+m" (c)); }
01354
/* _sse_double_hermitian_su3(r,s): r = s^dagger for a 3x3 complex (su3)
   matrix stored row-major as 9 _sse_double (re,im) pairs, i.e.
   r[i][j] = conj(s[j][i]); element (i,j) lives at offset 3*i+j.
   The xorpd with _sse_double_sgn2 ({0,0,0,0x80000000}) flips the sign
   bit of the second double, conjugating each entry.  r and s must not
   alias -- the first pass of each group overwrites r before the
   transposed source entries of later groups are read.
   Fix: removed the stray trailing backslash after the closing brace,
   which made the macro silently continue onto (and swallow) the
   following source line. */
#define _sse_double_hermitian_su3(r,s) { \
_ASM ("movapd %0, %%xmm0 \n\t"\
"xorpd %3, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t"\
"xorpd %3, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t"\
"xorpd %3, %%xmm2 \n\t" \
: \
: \
"m" (*((s))), \
"m" (*((s)+4)), \
"m" (*((s)+8)), \
"m" (_sse_double_sgn2)); \
_ASM ("movapd %%xmm0, %0 \n\t"\
"movapd %%xmm1, %1 \n\t"\
"movapd %%xmm2, %2 \n\t"\
: \
"=m" (*((r))), \
"=m" (*((r)+4)), \
"=m" (*((r)+8))); \
_ASM ("movapd %0, %%xmm0 \n\t"\
"xorpd %3, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t"\
"xorpd %3, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t"\
"xorpd %3, %%xmm2 \n\t" \
: \
: \
"m" (*((s)+1)), \
"m" (*((s)+2)), \
"m" (*((s)+5)), \
"m" (_sse_double_sgn2)); \
_ASM ("movapd %%xmm0, %0 \n\t"\
"movapd %%xmm1, %1 \n\t"\
"movapd %%xmm2, %2 \n\t"\
: \
"=m" (*((r)+3)), \
"=m" (*((r)+6)), \
"=m" (*((r)+7))); \
_ASM ("movapd %0, %%xmm0 \n\t"\
"xorpd %3, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t"\
"xorpd %3, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t"\
"xorpd %3, %%xmm2 \n\t" \
: \
: \
"m" (*((s)+3)), \
"m" (*((s)+6)), \
"m" (*((s)+7)), \
"m" (_sse_double_sgn2)); \
_ASM ("movapd %%xmm0, %0 \n\t"\
"movapd %%xmm1, %1 \n\t"\
"movapd %%xmm2, %2 \n\t"\
: \
"=m" (*((r)+1)), \
"=m" (*((r)+2)), \
"=m" (*((r)+5))); }
01413
01414
01415
01416
01417
/* _sse_double_copy_16(r,s): copies 8 aligned _sse_double elements
   (16 doubles) from s to r through xmm0-xmm7: two rounds of four
   aligned loads followed by two rounds of four aligned stores.
   The xmm registers must survive between the separate _ASM
   statements (no clobber lists -- legacy style used throughout this
   file; the statement split keeps each asm at <= 4 memory operands). */
#define _sse_double_copy_16(r,s) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t" \
"movapd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((s))), \
"m" (*((s)+1)), \
"m" (*((s)+2)), \
"m" (*((s)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
"movapd %1, %%xmm5 \n\t" \
"movapd %2, %%xmm6 \n\t" \
"movapd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((s)+4)), \
"m" (*((s)+5)), \
"m" (*((s)+6)), \
"m" (*((s)+7))); \
_ASM ("movapd %%xmm0, %0 \n\t" \
"movapd %%xmm1, %1 \n\t" \
"movapd %%xmm2, %2 \n\t" \
"movapd %%xmm3, %3 \n\t" \
: \
"=m" (*((r))), \
"=m" (*((r)+1)), \
"=m" (*((r)+2)), \
"=m" (*((r)+3))); \
_ASM ("movapd %%xmm4, %0 \n\t" \
"movapd %%xmm5, %1 \n\t" \
"movapd %%xmm6, %2 \n\t" \
"movapd %%xmm7, %3 \n\t" \
: \
"=m" (*((r)+4)), \
"=m" (*((r)+5)), \
"=m" (*((r)+6)), \
"=m" (*((r)+7))); }
01457
01458
01459
01460
01461
/* _sse_double_add_16(r,s): packed in-place addition r[i] += s[i] for
   8 aligned _sse_double elements (16 doubles).  Loads s into
   xmm0-xmm7, adds r from memory, stores the sums back to r.  The xmm
   registers must survive between the separate _ASM statements (no
   clobber lists -- legacy style used throughout this file). */
#define _sse_double_add_16(r,s) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t" \
"movapd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((s))), \
"m" (*((s)+1)), \
"m" (*((s)+2)), \
"m" (*((s)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
"movapd %1, %%xmm5 \n\t" \
"movapd %2, %%xmm6 \n\t" \
"movapd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((s)+4)), \
"m" (*((s)+5)), \
"m" (*((s)+6)), \
"m" (*((s)+7))); \
_ASM ("addpd %0, %%xmm0 \n\t" \
"addpd %1, %%xmm1 \n\t" \
"addpd %2, %%xmm2 \n\t" \
"addpd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((r))), \
"m" (*((r)+1)), \
"m" (*((r)+2)), \
"m" (*((r)+3))); \
_ASM ("addpd %0, %%xmm4 \n\t" \
"addpd %1, %%xmm5 \n\t" \
"addpd %2, %%xmm6 \n\t" \
"addpd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((r)+4)), \
"m" (*((r)+5)), \
"m" (*((r)+6)), \
"m" (*((r)+7))); \
_ASM ("movapd %%xmm0, %0 \n\t" \
"movapd %%xmm1, %1 \n\t" \
"movapd %%xmm2, %2 \n\t" \
"movapd %%xmm3, %3 \n\t" \
: \
"=m" (*((r))), \
"=m" (*((r)+1)), \
"=m" (*((r)+2)), \
"=m" (*((r)+3))); \
_ASM ("movapd %%xmm4, %0 \n\t" \
"movapd %%xmm5, %1 \n\t" \
"movapd %%xmm6, %2 \n\t" \
"movapd %%xmm7, %3 \n\t" \
: \
"=m" (*((r)+4)), \
"=m" (*((r)+5)), \
"=m" (*((r)+6)), \
"=m" (*((r)+7))); }
01521
01522
01523
01524
01525
/* _sse_double_sub_16(r,s): packed subtraction for 8 aligned
   _sse_double elements (16 doubles).  NOTE the operand order: s is
   loaded into xmm0-xmm7 and r is subtracted FROM it, so this computes
   r[i] = s[i] - r[i] (not r -= s).  The xmm registers must survive
   between the separate _ASM statements (no clobber lists -- legacy
   style used throughout this file). */
#define _sse_double_sub_16(r,s) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t" \
"movapd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((s))), \
"m" (*((s)+1)), \
"m" (*((s)+2)), \
"m" (*((s)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
"movapd %1, %%xmm5 \n\t" \
"movapd %2, %%xmm6 \n\t" \
"movapd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((s)+4)), \
"m" (*((s)+5)), \
"m" (*((s)+6)), \
"m" (*((s)+7))); \
_ASM ("subpd %0, %%xmm0 \n\t" \
"subpd %1, %%xmm1 \n\t" \
"subpd %2, %%xmm2 \n\t" \
"subpd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((r))), \
"m" (*((r)+1)), \
"m" (*((r)+2)), \
"m" (*((r)+3))); \
_ASM ("subpd %0, %%xmm4 \n\t" \
"subpd %1, %%xmm5 \n\t" \
"subpd %2, %%xmm6 \n\t" \
"subpd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((r)+4)), \
"m" (*((r)+5)), \
"m" (*((r)+6)), \
"m" (*((r)+7))); \
_ASM ("movapd %%xmm0, %0 \n\t" \
"movapd %%xmm1, %1 \n\t" \
"movapd %%xmm2, %2 \n\t" \
"movapd %%xmm3, %3 \n\t" \
: \
"=m" (*((r))), \
"=m" (*((r)+1)), \
"=m" (*((r)+2)), \
"=m" (*((r)+3))); \
_ASM ("movapd %%xmm4, %0 \n\t" \
"movapd %%xmm5, %1 \n\t" \
"movapd %%xmm6, %2 \n\t" \
"movapd %%xmm7, %3 \n\t" \
: \
"=m" (*((r)+4)), \
"=m" (*((r)+5)), \
"=m" (*((r)+6)), \
"=m" (*((r)+7))); }
01585
01586
01587
01588
01589
/* _sse_double_add_multiply_16(r,c,s): packed multiply-accumulate
   r[i] += c * s[i] for 8 aligned _sse_double elements (16 doubles).
   c is applied with mulpd as a full 128-bit operand; assumes both
   lanes of c hold the same scalar -- TODO confirm against callers.
   The xmm registers must survive between the separate _ASM statements
   (no clobber lists -- legacy style used throughout this file). */
#define _sse_double_add_multiply_16(r,c,s) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t" \
"movapd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((s))), \
"m" (*((s)+1)), \
"m" (*((s)+2)), \
"m" (*((s)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
"movapd %1, %%xmm5 \n\t" \
"movapd %2, %%xmm6 \n\t" \
"movapd %3, %%xmm7 \n\t" \
"mulpd %4, %%xmm0 \n\t" \
"mulpd %4, %%xmm1 \n\t" \
"mulpd %4, %%xmm2 \n\t" \
"mulpd %4, %%xmm3 \n\t" \
"mulpd %4, %%xmm4 \n\t" \
"mulpd %4, %%xmm5 \n\t" \
"mulpd %4, %%xmm6 \n\t" \
"mulpd %4, %%xmm7 \n\t" \
: \
: \
"m" (*((s)+4)), \
"m" (*((s)+5)), \
"m" (*((s)+6)), \
"m" (*((s)+7)), \
"m" (c)); \
_ASM ("addpd %0, %%xmm0 \n\t" \
"addpd %1, %%xmm1 \n\t" \
"addpd %2, %%xmm2 \n\t" \
"addpd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((r))), \
"m" (*((r)+1)), \
"m" (*((r)+2)), \
"m" (*((r)+3))); \
_ASM ("addpd %0, %%xmm4 \n\t" \
"addpd %1, %%xmm5 \n\t" \
"addpd %2, %%xmm6 \n\t" \
"addpd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((r)+4)), \
"m" (*((r)+5)), \
"m" (*((r)+6)), \
"m" (*((r)+7))); \
_ASM ("movapd %%xmm0, %0 \n\t" \
"movapd %%xmm1, %1 \n\t" \
"movapd %%xmm2, %2 \n\t" \
"movapd %%xmm3, %3 \n\t" \
: \
"=m" (*((r))), \
"=m" (*((r)+1)), \
"=m" (*((r)+2)), \
"=m" (*((r)+3))); \
_ASM ("movapd %%xmm4, %0 \n\t" \
"movapd %%xmm5, %1 \n\t" \
"movapd %%xmm6, %2 \n\t" \
"movapd %%xmm7, %3 \n\t" \
: \
"=m" (*((r)+4)), \
"=m" (*((r)+5)), \
"=m" (*((r)+6)), \
"=m" (*((r)+7))); }
01658
/* _sse_double_multiply_16(r,c,s): packed scaling r[i] = c * s[i] for
   8 aligned _sse_double elements (16 doubles); r is overwritten, not
   accumulated.  c is applied with mulpd as a full 128-bit operand;
   assumes both lanes of c hold the same scalar -- TODO confirm
   against callers.  The xmm registers must survive between the
   separate _ASM statements (no clobber lists -- legacy style used
   throughout this file). */
#define _sse_double_multiply_16(r,c,s) { \
_ASM ("movapd %0, %%xmm0 \n\t" \
"movapd %1, %%xmm1 \n\t" \
"movapd %2, %%xmm2 \n\t" \
"movapd %3, %%xmm3 \n\t" \
: \
: \
"m" (*((s))), \
"m" (*((s)+1)), \
"m" (*((s)+2)), \
"m" (*((s)+3))); \
_ASM ("movapd %0, %%xmm4 \n\t" \
"movapd %1, %%xmm5 \n\t" \
"movapd %2, %%xmm6 \n\t" \
"movapd %3, %%xmm7 \n\t" \
: \
: \
"m" (*((s)+4)), \
"m" (*((s)+5)), \
"m" (*((s)+6)), \
"m" (*((s)+7))); \
_ASM ("mulpd %0, %%xmm0 \n\t" \
"mulpd %0, %%xmm1 \n\t" \
"mulpd %0, %%xmm2 \n\t" \
"mulpd %0, %%xmm3 \n\t" \
"mulpd %0, %%xmm4 \n\t" \
"mulpd %0, %%xmm5 \n\t" \
"mulpd %0, %%xmm6 \n\t" \
"mulpd %0, %%xmm7 \n\t" \
: \
: \
"m" (c)); \
_ASM ("movapd %%xmm0, %0 \n\t" \
"movapd %%xmm1, %1 \n\t" \
"movapd %%xmm2, %2 \n\t" \
"movapd %%xmm3, %3 \n\t" \
: \
"=m" (*((r))), \
"=m" (*((r)+1)), \
"=m" (*((r)+2)), \
"=m" (*((r)+3))); \
_ASM ("movapd %%xmm4, %0 \n\t" \
"movapd %%xmm5, %1 \n\t" \
"movapd %%xmm6, %2 \n\t" \
"movapd %%xmm7, %3 \n\t" \
: \
"=m" (*((r)+4)), \
"=m" (*((r)+5)), \
"=m" (*((r)+6)), \
"=m" (*((r)+7))); }
01709
01710
/* Calls error() unless 'var' is aligned: the address bits selected by
   the mask 'base' (e.g. 0xf for 16-byte alignment) must all be zero.
   Fix: the original cast the pointer to 'unsigned int', a narrowing
   pointer-to-integer conversion that is non-portable and warns/fails
   on 64-bit targets; use a pointer-sized integer instead.
   NOTE(review): uintptr_t from <stdint.h>/<cstdint> would be the
   fully portable choice (unsigned long is pointer-sized on LP64 but
   not on 64-bit Windows) -- confirm which headers this file may use. */
static void _sse_check_alignment(void* var, unsigned int base) {
  unsigned long af1=(unsigned long) var;
  if (af1!=(af1&~(unsigned long)base)) {
    error("_sse_check_alignment()\nVariable not aligned properly");
  }
}
01717