00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #if defined(SSE2)
00012
00034 class FermiCloverActionSSE2 {
00035 public:
00036 static void mul_Q(fermi_field &chi_out,
00037 fermi_field &psi_in,
00038 gauge_field &U_in,
00039 coefficients &coeff,
00040 int parity=EVENODD) {
00041
00042 register int ndim=psi_in.lattice().ndim;
00043 register int nspin=psi_in.nspin;
00044 register int nc=psi_in.nc;
00045 register mdp_real kappa_t=0;
00046 register mdp_real kappa_s=0;
00047 register mdp_real r_t;
00048 register mdp_real r_s;
00049 register mdp_real cSW;
00050 register mdp_real c_E;
00051 register mdp_real c_B;
00052 register int sign;
00053
00054 if(coeff.has_key("kappa")) kappa_s=kappa_t=coeff["kappa"];
00055 if(coeff.has_key("kappa_t")) kappa_t=coeff["kappa_t"];
00056 if(coeff.has_key("kappa_s")) kappa_s=coeff["kappa_s"];
00057 if(kappa_t==0) error("FermiCloverActionSSE2\nkappa_t=0 or undeclared");
00058 if(kappa_s==0) error("FermiCloverActionSSE2\nkappa_s=0 or undeclared");
00059 if(coeff.has_key("r_t")) r_t=coeff["r_t"]; else r_t=1;
00060 if(coeff.has_key("r_s")) r_s=coeff["r_s"]; else r_s=1;
00061 if(coeff.has_key("c_{sw}")) cSW=coeff["c_{sw}"]; else cSW=0;
00062 if(coeff.has_key("c_E")) c_E=coeff["c_E"]; else c_E=1;
00063 if(coeff.has_key("c_B")) c_B=coeff["c_B"]; else c_B=1;
00064 if(coeff.has_key("sign")) sign=(int) coeff["sign"]; else sign=+1;
00065 if(parity!=EVENODD) error("FermiCloverActionSSE2\nparity must be EVENODD here");
00066
00067 #if !defined(USE_DOUBLE_PRECISION)
00068
00069 _sse_spinor *chi=(_sse_spinor*) chi_out.physical_address();
00070 _sse_check_alignment((void*) chi, 0xf);
00071 _sse_spinor *psi=(_sse_spinor*) psi_in.physical_address();
00072 _sse_check_alignment((void*) psi, 0xf);
00073 _sse_su3 *U=(_sse_su3*) U_in.physical_address();
00074 _sse_check_alignment((void*) U, 0xf);
00075 _sse_su3 *uem=0;
00076 if(cSW!=0.0) {
00077 uem=(_sse_su3*) U_in.em.physical_address();
00078 _sse_check_alignment((void*) uem, 0xf);
00079 }
00080 mdp_int **iup=U_in.lattice().up;
00081 mdp_int **idw=U_in.lattice().dw;
00082 mdp_int start=U_in.lattice().start_index(ME,0);
00083 mdp_int stop =U_in.lattice().stop_index(ME,1);
00084
00085 _sse_float fact1 ALIGN16;
00086 _sse_float fact2 ALIGN16;
00087 _sse_float fact3 ALIGN16;
00088 _sse_float fact4 ALIGN16;
00089 _sse_float fact5 ALIGN16;
00090 _sse_float fact6 ALIGN16;
00091 _sse_vector r12_1 ALIGN16;
00092 _sse_vector r34_1 ALIGN16;
00093 _sse_vector r12_2 ALIGN16;
00094 _sse_vector r34_2 ALIGN16;
00095 _sse_vector r0 ALIGN16;
00096 mdp_int ix1,iy1,iy2,iz1;
00097 float rho;
00098 _sse_su3 *up1,*um1,*um2;
00099 _sse_spinor *s1,*sp1,*sp2,*sm1,*sm2,*sn1;
00100
00101 if(sign!=1) exit(1);
00102
00103 if((stop-start)%2 !=0)
00104 error("FermiCloverActionSSE2\nProblem with parallelization: odd # of sites on process!");
00105
00106 if(r_t!=1.0)
00107 error("FermiCloverActionSSE2\nr_t!=1 not compatible with SSE2\n");
00108
00109 _sse_check_alignment((void*) &fact1, 0xf);
00110 _sse_check_alignment((void*) &fact2, 0xf);
00111 _sse_check_alignment((void*) &fact3, 0xf);
00112 _sse_check_alignment((void*) &fact4, 0xf);
00113 _sse_check_alignment((void*) &fact5, 0xf);
00114 _sse_check_alignment((void*) &fact6, 0xf);
00115 _sse_check_alignment((void*) &r12_1, 0xf);
00116 _sse_check_alignment((void*) &r34_1, 0xf);
00117 _sse_check_alignment((void*) &r12_2, 0xf);
00118 _sse_check_alignment((void*) &r34_2, 0xf);
00119
00120
00121
00122 r0.c1.c1=r0.c1.c2=r0.c1.c3=r0.c1.c4=0;
00123 r0.c2.c1=r0.c2.c2=r0.c2.c3=r0.c2.c4=0;
00124 r0.c3.c1=r0.c3.c2=r0.c3.c3=r0.c3.c4=0;
00125
00126 rho=-1.0f/kappa_s;
00127
00128
00129
00130 fact1.c1=rho;
00131 fact1.c2=rho;
00132 fact1.c3=rho;
00133 fact1.c4=rho;
00134
00135 fact2.c1=-1.0f*kappa_s;
00136 fact2.c2=fact2.c1;
00137 fact2.c3=fact2.c1;
00138 fact2.c4=fact2.c1;
00139
00140
00141
00142 fact3.c1=(1.0f+r_t)*kappa_t/kappa_s;
00143 fact3.c2=fact3.c1;
00144 fact3.c3=fact3.c1;
00145 fact3.c4=fact3.c1;
00146
00147 fact4.c1=-1.0f;
00148 fact4.c2=-1.0f;
00149 fact4.c3=-1.0f;
00150 fact4.c4=-1.0f;
00151
00152
00153
00154 fact5.c1=1.0f*kappa_s*cSW*c_B;
00155 fact5.c2=fact5.c1;
00156 fact5.c3=fact5.c1;
00157 fact5.c4=fact5.c1;
00158
00159
00160
00161 fact6.c1=1.0f*c_E/c_B;
00162 fact6.c2=fact6.c1;
00163 fact6.c3=fact6.c1;
00164 fact6.c4=fact6.c1;
00165
00166 sp1=(_sse_spinor*) &psi[iup[start][0]];
00167 sp2=(_sse_spinor*) &psi[iup[start+1][0]];
00168 up1=(_sse_su3*) U+4*start;
00169
00170
00171
00172
00173 for (ix1=start; ix1<stop; ix1+=2) {
00174
00175
00176 s1=psi+ix1;
00177 _sse_float_prefetch_spinor(s1);
00178
00179
00180
00181
00182 iy1=idw[ix1][0];
00183 iy2=idw[ix1+1][0];
00184 sm1=psi+iy1;
00185 sm2=psi+iy2;
00186 _sse_float_prefetch_spinor(sm1);
00187 _sse_float_prefetch_spinor(sm2);
00188
00189 _sse_float_pair_load((*sp1).c3,(*sp1).c4);
00190 _sse_float_vector_mul(fact3);
00191
00192 _sse_float_su3_multiply((*up1));
00193
00194 _sse_float_pair_load((*s1).c1,(*s1).c2);
00195 _sse_float_vector_mul(fact1);
00196 _sse_float_vector_store(r12_1);
00197
00198 _sse_float_pair_load((*s1).c3,(*s1).c4);
00199 _sse_float_vector_mul(fact1);
00200 _sse_float_vector_add();
00201 _sse_float_vector_store(r34_1);
00202
00203 um1=U+iy1*4;
00204 _sse_float_prefetch_su3(um1);
00205 um2=U+iy2*4;
00206 _sse_float_prefetch_su3(um2);
00207
00208 _sse_float_pair_load((*(sp2)).c3,(*(sp2)).c4);
00209 _sse_float_vector_mul(fact3);
00210
00211 _sse_float_su3_multiply((*(up1+4)));
00212
00213 _sse_float_pair_load((*(s1+1)).c1,(*(s1+1)).c2);
00214 _sse_float_vector_mul(fact1);
00215 _sse_float_vector_store(r12_2);
00216
00217 _sse_float_pair_load((*(s1+1)).c3,(*(s1+1)).c4);
00218 _sse_float_vector_mul(fact1);
00219 _sse_float_vector_add();
00220 _sse_float_vector_store(r34_2);
00221
00222
00223
00224
00225 sp1=psi+iup[ix1][1];
00226 sp2=psi+iup[ix1+1][1];
00227 _sse_float_prefetch_spinor(sp1);
00228 _sse_float_prefetch_spinor(sp2);
00229
00230 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00231 _sse_float_vector_mul(fact3);
00232
00233 _sse_float_su3_inverse_multiply((*um1));
00234
00235 _sse_float_vector_load(r12_1);
00236 _sse_float_vector_add();
00237 _sse_float_vector_store(r12_1);
00238
00239 up1++;
00240 _sse_float_prefetch_su3(up1);
00241
00242 _sse_float_pair_load((*(sm2)).c1,(*(sm2)).c2);
00243 _sse_float_vector_mul(fact3);
00244
00245 _sse_float_su3_inverse_multiply((*(um2)));
00246
00247 _sse_float_vector_load(r12_2);
00248 _sse_float_vector_add();
00249 _sse_float_vector_store(r12_2);
00250
00251
00252
00253 iy1=idw[ix1][1];
00254 iy2=idw[ix1+1][1];
00255 sm1=psi+iy1;
00256 sm2=psi+iy2;
00257 _sse_float_prefetch_spinor(sm1);
00258 _sse_float_prefetch_spinor(sm2);
00259
00260 _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00261 _sse_float_pair_load_up((*sp1).c4,(*sp1).c3);
00262 _sse_float_vector_sub();
00263
00264 _sse_float_su3_multiply((*up1));
00265
00266 _sse_float_vector_load(r12_1);
00267 _sse_float_vector_add();
00268 _sse_float_vector_store(r12_1);
00269
00270 _sse_float_vector_load(r34_1);
00271 _sse_float_vector_xch();
00272 _sse_float_vector_sub();
00273 _sse_float_vector_store(r34_1);
00274
00275 um1=U+iy1*4+1;
00276 _sse_float_prefetch_su3(um1);
00277 um2=U+iy2*4+1;
00278 _sse_float_prefetch_su3(um2);
00279
00280 _sse_float_pair_load((*(sp2)).c1,(*(sp2)).c2);
00281 _sse_float_pair_load_up((*(sp2)).c4,(*(sp2)).c3);
00282 _sse_float_vector_sub();
00283
00284 _sse_float_su3_multiply((*(up1+4)));
00285
00286 _sse_float_vector_load(r12_2);
00287 _sse_float_vector_add();
00288 _sse_float_vector_store(r12_2);
00289
00290 _sse_float_vector_load(r34_2);
00291 _sse_float_vector_xch();
00292 _sse_float_vector_sub();
00293 _sse_float_vector_store(r34_2);
00294
00295
00296
00297 sp1=psi+iup[ix1][2];
00298 sp2=psi+iup[ix1+1][2];
00299 _sse_float_prefetch_spinor(sp1);
00300 _sse_float_prefetch_spinor(sp2);
00301
00302 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00303 _sse_float_pair_load_up((*sm1).c4,(*sm1).c3);
00304 _sse_float_vector_add();
00305
00306 _sse_float_su3_inverse_multiply((*um1));
00307
00308 _sse_float_vector_load(r12_1);
00309 _sse_float_vector_add();
00310 _sse_float_vector_store(r12_1);
00311
00312 _sse_float_vector_load(r34_1);
00313 _sse_float_vector_xch();
00314 _sse_float_vector_add();
00315 _sse_float_vector_store(r34_1);
00316
00317 up1++;
00318 _sse_float_prefetch_su3(up1);
00319
00320 _sse_float_pair_load((*(sm2)).c1,(*(sm2)).c2);
00321 _sse_float_pair_load_up((*(sm2)).c4,(*(sm2)).c3);
00322 _sse_float_vector_add();
00323
00324 _sse_float_su3_inverse_multiply((*(um2)));
00325
00326 _sse_float_vector_load(r12_2);
00327 _sse_float_vector_add();
00328 _sse_float_vector_store(r12_2);
00329
00330 _sse_float_vector_load(r34_2);
00331 _sse_float_vector_xch();
00332 _sse_float_vector_add();
00333 _sse_float_vector_store(r34_2);
00334
00335
00336
00337 iy1=idw[ix1][2];
00338 iy2=idw[ix1+1][2];
00339 sm1=psi+iy1;
00340 sm2=psi+iy2;
00341 _sse_float_prefetch_spinor(sm1);
00342 _sse_float_prefetch_spinor(sm2);
00343
00344 _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00345 _sse_float_pair_load_up((*sp1).c4,(*sp1).c3);
00346 _sse_float_vector_i_addsub();
00347
00348 _sse_float_su3_multiply((*up1));
00349
00350 _sse_float_vector_load(r12_1);
00351 _sse_float_vector_add();
00352 _sse_float_vector_store(r12_1);
00353
00354 _sse_float_vector_load(r34_1);
00355 _sse_float_vector_xch();
00356 _sse_float_vector_i_addsub();
00357 _sse_float_vector_store(r34_1);
00358
00359
00360 um1=U+iy1*4+2;
00361 um2=U+iy2*4+2;
00362 _sse_float_prefetch_su3(um1);
00363 _sse_float_prefetch_su3(um2);
00364
00365 _sse_float_pair_load((*(sp2)).c1,(*(sp2)).c2);
00366 _sse_float_pair_load_up((*(sp2)).c4,(*(sp2)).c3);
00367 _sse_float_vector_i_addsub();
00368
00369 _sse_float_su3_multiply((*(up1+4)));
00370
00371 _sse_float_vector_load(r12_2);
00372 _sse_float_vector_add();
00373 _sse_float_vector_store(r12_2);
00374
00375 _sse_float_vector_load(r34_2);
00376 _sse_float_vector_xch();
00377 _sse_float_vector_i_addsub();
00378 _sse_float_vector_store(r34_2);
00379
00380
00381
00382 sp1=psi+iup[ix1][3];
00383 sp2=psi+iup[ix1+1][3];
00384 _sse_float_prefetch_spinor(sp1);
00385 _sse_float_prefetch_spinor(sp2);
00386
00387 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00388 _sse_float_pair_load_up((*sm1).c4,(*sm1).c3);
00389 _sse_float_vector_i_subadd();
00390
00391 _sse_float_su3_inverse_multiply((*um1));
00392
00393 _sse_float_vector_load(r12_1);
00394 _sse_float_vector_add();
00395 _sse_float_vector_store(r12_1);
00396
00397 _sse_float_vector_load(r34_1);
00398 _sse_float_vector_xch();
00399 _sse_float_vector_i_subadd();
00400 _sse_float_vector_store(r34_1);
00401
00402 up1++;
00403 _sse_float_prefetch_su3(up1);
00404
00405 _sse_float_pair_load((*(sm2)).c1,(*(sm2)).c2);
00406 _sse_float_pair_load_up((*(sm2)).c4,(*(sm2)).c3);
00407 _sse_float_vector_i_subadd();
00408
00409 _sse_float_su3_inverse_multiply((*(um2)));
00410
00411 _sse_float_vector_load(r12_2);
00412 _sse_float_vector_add();
00413 _sse_float_vector_store(r12_2);
00414
00415 _sse_float_vector_load(r34_2);
00416 _sse_float_vector_xch();
00417 _sse_float_vector_i_subadd();
00418 _sse_float_vector_store(r34_2);
00419
00420
00421
00422 iy1=idw[ix1][3];
00423 iy2=idw[ix1+1][3];
00424 sm1=psi+iy1;
00425 sm2=psi+iy2;
00426 _sse_float_prefetch_spinor(sm1);
00427 _sse_float_prefetch_spinor(sm2);
00428
00429 _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00430 _sse_float_pair_load_up((*sp1).c3,(*sp1).c4);
00431 _sse_float_vector_subadd();
00432
00433 _sse_float_su3_multiply((*up1));
00434
00435 _sse_float_vector_load(r12_1);
00436 _sse_float_vector_add();
00437 _sse_float_vector_store(r12_1);
00438
00439 _sse_float_vector_load(r34_1);
00440 _sse_float_vector_subadd();
00441 _sse_float_vector_store(r34_1);
00442
00443 um1=U+iy1*4+3;
00444 _sse_float_prefetch_su3(um1);
00445 um2=U+iy2*4+3;
00446 _sse_float_prefetch_su3(um2);
00447
00448 _sse_float_pair_load((*sp2).c1,(*sp2).c2);
00449 _sse_float_pair_load_up((*sp2).c3,(*sp2).c4);
00450 _sse_float_vector_subadd();
00451
00452 _sse_float_su3_multiply((*(up1+4)));
00453
00454 _sse_float_vector_load(r12_2);
00455 _sse_float_vector_add();
00456 _sse_float_vector_store(r12_2);
00457
00458 _sse_float_vector_load(r34_2);
00459 _sse_float_vector_subadd();
00460 _sse_float_vector_store(r34_2);
00461
00462
00463
00464 sn1=(_sse_spinor*) &chi[ix1];
00465 _sse_float_prefetch_spinor(sn1);
00466
00467 iz1=ix1+2;
00468 if (iz1<stop) {
00469 sp1=(_sse_spinor*) &psi[iup[iz1][0]];
00470 sp2=(_sse_spinor*) &psi[iup[iz1+1][0]];
00471 _sse_float_prefetch_spinor(sp1);
00472 _sse_float_prefetch_spinor(sp2);
00473 }
00474
00475 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00476 _sse_float_pair_load_up((*sm1).c3,(*sm1).c4);
00477 _sse_float_vector_addsub();
00478
00479 _sse_float_su3_inverse_multiply((*um1));
00480
00481 _sse_float_vector_load(r12_1);
00482 _sse_float_vector_add();
00483 _sse_float_vector_mul(fact2);
00484 _sse_float_pair_store((*sn1).c1,(*sn1).c2);
00485
00486 _sse_float_vector_load(r34_1);
00487 _sse_float_vector_addsub();
00488 _sse_float_vector_mul(fact2);
00489 _sse_float_pair_store((*sn1).c3,(*sn1).c4);
00490
00491 up1=U+iz1*4;
00492 _sse_float_prefetch_su3(up1);
00493
00494
00495 _sse_float_pair_load((*sm2).c1,(*sm2).c2);
00496 _sse_float_pair_load_up((*sm2).c3,(*sm2).c4);
00497 _sse_float_vector_addsub();
00498
00499 _sse_float_su3_inverse_multiply((*um2));
00500
00501 _sse_float_vector_load(r12_2);
00502 _sse_float_vector_add();
00503 _sse_float_vector_mul(fact2);
00504 _sse_float_pair_store((*(sn1+1)).c1,(*(sn1+1)).c2);
00505
00506 _sse_float_vector_load(r34_2);
00507 _sse_float_vector_addsub();
00508 _sse_float_vector_mul(fact2);
00509 _sse_float_pair_store((*(sn1+1)).c3,(*(sn1+1)).c4);
00510
00511
00512 }
00513
00514 if(cSW==0) return;
00515
00516
00517
00518
00519
00520
00521
00522 um1=uem+6*start;
00523
00524 for (ix1=start; ix1<stop; ix1+=2) {
00525 s1=psi+ix1;
00526 _sse_float_prefetch_spinor(s1);
00527
00528
00529
00530 _sse_float_prefetch_su3(um1+1);
00531 _sse_float_prefetch_su3(um1+7);
00532
00533 _sse_float_pair_load((*s1).c4,(*s1).c3);
00534 _sse_float_vector_mul(fact4);
00535 _sse_float_su3_multiply((*um1));
00536
00537 _sse_float_vector_load(r0);
00538 _sse_float_vector_add();
00539 _sse_float_vector_store(r12_1);
00540
00541 _sse_float_pair_load((*s1).c2,(*s1).c1);
00542 _sse_float_su3_multiply((*um1));
00543 _sse_float_vector_load(r0);
00544 _sse_float_vector_add();
00545 _sse_float_vector_store(r34_1);
00546
00547 _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00548 _sse_float_vector_mul(fact4);
00549 _sse_float_su3_multiply((*(um1+6)));
00550
00551 _sse_float_vector_load(r0);
00552 _sse_float_vector_add();
00553 _sse_float_vector_store(r12_2);
00554
00555 _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00556 _sse_float_su3_multiply((*(um1+6)));
00557
00558 _sse_float_vector_load(r0);
00559 _sse_float_vector_add();
00560 _sse_float_vector_store(r34_2);
00561
00562 um1++;
00563
00564
00565
00566 _sse_float_prefetch_su3(um1+1);
00567 _sse_float_prefetch_su3(um1+7);
00568
00569 _sse_float_pair_load((*s1).c4,(*s1).c3);
00570 _sse_float_su3_multiply((*um1));
00571 _sse_float_vector_load(r12_1);
00572 _sse_float_vector_i_addsub();
00573 _sse_float_vector_store(r12_1);
00574
00575 _sse_float_pair_load((*s1).c2,(*s1).c1);
00576 _sse_float_su3_multiply((*um1));
00577 _sse_float_vector_load(r34_1);
00578 _sse_float_vector_i_subadd();
00579 _sse_float_vector_store(r34_1);
00580
00581 _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00582 _sse_float_su3_multiply((*(um1+6)));
00583 _sse_float_vector_load(r12_2);
00584 _sse_float_vector_i_addsub();
00585 _sse_float_vector_store(r12_2);
00586
00587 _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00588 _sse_float_su3_multiply((*(um1+6)));
00589 _sse_float_vector_load(r34_2);
00590 _sse_float_vector_i_subadd();
00591 _sse_float_vector_store(r34_2);
00592
00593 um1++;
00594
00595
00596
00597 _sse_float_prefetch_su3(um1+1);
00598 _sse_float_prefetch_su3(um1+7);
00599
00600 _sse_float_pair_load((*s1).c3,(*s1).c4);
00601 _sse_float_su3_multiply((*um1));
00602 _sse_float_vector_load(r12_1);
00603 _sse_float_vector_subadd();
00604 _sse_float_vector_mul(fact6);
00605 _sse_float_vector_store(r12_1);
00606
00607 _sse_float_pair_load((*s1).c1,(*s1).c2);
00608 _sse_float_su3_multiply((*um1));
00609 _sse_float_vector_load(r34_1);
00610 _sse_float_vector_addsub();
00611 _sse_float_vector_mul(fact6);
00612 _sse_float_vector_store(r34_1);
00613
00614 _sse_float_pair_load((*(s1+1)).c3,(*(s1+1)).c4);
00615 _sse_float_su3_multiply((*(um1+6)));
00616 _sse_float_vector_load(r12_2);
00617 _sse_float_vector_subadd();
00618 _sse_float_vector_mul(fact6);
00619 _sse_float_vector_store(r12_2);
00620
00621 _sse_float_pair_load((*(s1+1)).c1,(*(s1+1)).c2);
00622 _sse_float_su3_multiply((*(um1+6)));
00623 _sse_float_vector_load(r34_2);
00624 _sse_float_vector_addsub();
00625 _sse_float_vector_mul(fact6);
00626 _sse_float_vector_store(r34_2);
00627
00628 um1++;
00629
00630
00631
00632 _sse_float_prefetch_su3(um1+1);
00633 _sse_float_prefetch_su3(um1+7);
00634
00635 _sse_float_pair_load((*s1).c1,(*s1).c2);
00636 _sse_float_su3_multiply((*um1));
00637 _sse_float_vector_load(r12_1);
00638 _sse_float_vector_i_subadd();
00639 _sse_float_vector_store(r12_1);
00640
00641 _sse_float_pair_load((*s1).c3,(*s1).c4);
00642 _sse_float_su3_multiply((*um1));
00643 _sse_float_vector_load(r34_1);
00644 _sse_float_vector_i_subadd();
00645 _sse_float_vector_store(r34_1);
00646
00647 _sse_float_pair_load((*(s1+1)).c1,(*(s1+1)).c2);
00648 _sse_float_su3_multiply((*(um1+6)));
00649 _sse_float_vector_load(r12_2);
00650 _sse_float_vector_i_subadd();
00651 _sse_float_vector_store(r12_2);
00652
00653 _sse_float_pair_load((*(s1+1)).c3,(*(s1+1)).c4);
00654 _sse_float_su3_multiply((*(um1+6)));
00655 _sse_float_vector_load(r34_2);
00656 _sse_float_vector_i_subadd();
00657 _sse_float_vector_store(r34_2);
00658
00659 um1++;
00660
00661
00662
00663 _sse_float_prefetch_su3(um1+1);
00664 _sse_float_prefetch_su3(um1+7);
00665
00666 _sse_float_pair_load((*s1).c2,(*s1).c1);
00667 _sse_float_su3_multiply((*um1));
00668 _sse_float_vector_load(r12_1);
00669 _sse_float_vector_addsub();
00670 _sse_float_vector_store(r12_1);
00671
00672 _sse_float_pair_load((*s1).c4,(*s1).c3);
00673 _sse_float_su3_multiply((*um1));
00674 _sse_float_vector_load(r34_1);
00675 _sse_float_vector_addsub();
00676 _sse_float_vector_store(r34_1);
00677
00678 _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00679 _sse_float_su3_multiply((*(um1+6)));
00680 _sse_float_vector_load(r12_2);
00681 _sse_float_vector_addsub();
00682 _sse_float_vector_store(r12_2);
00683
00684 _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00685 _sse_float_su3_multiply((*(um1+6)));
00686 _sse_float_vector_load(r34_2);
00687 _sse_float_vector_addsub();
00688 _sse_float_vector_store(r34_2);
00689
00690 um1++;
00691
00692
00693
00694 sn1=(_sse_spinor*) &chi[ix1];
00695 _sse_float_prefetch_spinor(sn1);
00696
00697 _sse_float_pair_load((*s1).c2,(*s1).c1);
00698 _sse_float_su3_multiply((*um1));
00699 _sse_float_vector_load(r12_1);
00700 _sse_float_vector_i_sub();
00701 _sse_float_vector_store(r12_1);
00702
00703 _sse_float_pair_load((*s1).c4,(*s1).c3);
00704 _sse_float_su3_multiply((*um1));
00705 _sse_float_vector_load(r34_1);
00706 _sse_float_vector_i_sub();
00707 _sse_float_vector_store(r34_1);
00708
00709 _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00710 _sse_float_su3_multiply((*(um1+6)));
00711 _sse_float_vector_load(r12_2);
00712 _sse_float_vector_i_sub();
00713 _sse_float_vector_store(r12_2);
00714
00715 _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00716 _sse_float_su3_multiply((*(um1+6)));
00717 _sse_float_vector_load(r34_2);
00718 _sse_float_vector_i_sub();
00719 _sse_float_vector_store(r34_2);
00720
00721 um1+=7;
00722 if(ix1<stop-1) {
00723 _sse_float_prefetch_su3(um1);
00724 _sse_float_prefetch_su3(um1+6);
00725 }
00726
00727
00728 _sse_float_pair_load_up((*sn1).c1,(*sn1).c2);
00729 _sse_float_vector_load(r12_1);
00730 _sse_float_vector_mul(fact5);
00731 _sse_float_vector_add();
00732 _sse_float_pair_store((*sn1).c1,(*sn1).c2);
00733
00734 _sse_float_pair_load_up((*sn1).c3,(*sn1).c4);
00735 _sse_float_vector_load(r34_1);
00736 _sse_float_vector_mul(fact5);
00737 _sse_float_vector_add();
00738 _sse_float_pair_store((*sn1).c3,(*sn1).c4);
00739
00740 _sse_float_pair_load_up((*(sn1+1)).c1,(*(sn1+1)).c2);
00741 _sse_float_vector_load(r12_2);
00742 _sse_float_vector_mul(fact5);
00743 _sse_float_vector_add();
00744 _sse_float_pair_store((*(sn1+1)).c1,(*(sn1+1)).c2);
00745
00746 _sse_float_pair_load_up((*(sn1+1)).c3,(*(sn1+1)).c4);
00747 _sse_float_vector_load(r34_2);
00748 _sse_float_vector_mul(fact5);
00749 _sse_float_vector_add();
00750 _sse_float_pair_store((*(sn1+1)).c3,(*(sn1+1)).c4);
00751
00752
00753
00754 }
00755
00756
00757 #else
00758
00759 _sse_spinor *chi=(_sse_spinor*) chi_out.physical_address();
00760 _sse_spinor *psi=(_sse_spinor*) psi_in.physical_address();
00761 _sse_su3 *U=(_sse_su3*) U_in.physical_address();
00762 _sse_su3 *uem=(_sse_su3*) U_in.em.physical_address();
00763 mdp_int **iup=U_in.lattice().up;
00764 mdp_int **idw=U_in.lattice().dw;
00765 mdp_int start=U_in.lattice().start_index(ME,0);
00766 mdp_int stop =U_in.lattice().stop_index(ME,1);
00767
00768 _sse_double fact1 ALIGN16;
00769 _sse_double fact2 ALIGN16;
00770 _sse_double fact3 ALIGN16;
00771 _sse_double fact4 ALIGN16;
00772 _sse_double fact5 ALIGN16;
00773 _sse_double fact6 ALIGN16;
00774 _sse_spinor rs ALIGN16;
00775 _sse_spinor r0 ALIGN16;
00776 mdp_int ix,iy,iz;
00777 double rho;
00778 _sse_su3 *up, *um;
00779 _sse_spinor *s,*sp,*sm,*sn;
00780
00781 if(sign!=1) exit(1);
00782 if((stop-start)%2 !=0)
00783 error("FermiCloverActionSSE2\nProblem with parallelization: odd # of sites on process!");
00784
00785 if(r_t!=1.0)
00786 error("FermiCloverActionSSE2\nr_t!=1 not compatible with SSE2\n");
00787
00788 _sse_check_alignment((void*) &fact1, 0xf);
00789 _sse_check_alignment((void*) &fact2, 0xf);
00790 _sse_check_alignment((void*) &fact3, 0xf);
00791 _sse_check_alignment((void*) &fact4, 0xf);
00792 _sse_check_alignment((void*) &fact5, 0xf);
00793 _sse_check_alignment((void*) &fact6, 0xf);
00794 _sse_check_alignment((void*) &rs, 0xf);
00795 _sse_check_alignment((void*) &r0, 0xf);
00796
00797 r0.c1.c1.real()=r0.c1.c2.real()=r0.c1.c3.real()=0;
00798 r0.c2.c1.real()=r0.c2.c2.real()=r0.c2.c3.real()=0;
00799 r0.c3.c1.real()=r0.c3.c2.real()=r0.c3.c3.real()=0;
00800 r0.c4.c1.real()=r0.c4.c2.real()=r0.c4.c3.real()=0;
00801
00802 rho=-1.0/kappa_s;
00803
00804 fact1.c1=rho;
00805 fact1.c2=rho;
00806
00807 fact2.c1=-1.0*kappa_s;
00808 fact2.c2=fact2.c1;
00809
00810 fact3.c1=(1.0+r_t)*kappa_t/kappa_s;
00811 fact3.c2=fact3.c1;
00812
00813 fact4.c1=-1.0;
00814 fact4.c2=-1.0;
00815
00816 fact5.c1=1.0*kappa_s*cSW*c_E;
00817 fact5.c2=fact5.c1;
00818
00819 fact6.c1=1.0*c_E/c_B;
00820 fact6.c2=fact6.c1;
00821
00822 sp=(_sse_spinor*) &psi[iup[start][0]];
00823 up=(_sse_su3*) U+4*start;
00824
00825
00826
00827 for (ix=start; ix<stop; ix++) {
00828 s=psi+ix;
00829 _sse_double_prefetch_spinor(s);
00830
00831
00832
00833 iy=idw[ix][0];
00834 sm=psi+iy;
00835 _sse_double_prefetch_spinor(sm);
00836
00837 _sse_double_load((*s).c1);
00838 _sse_double_vector_mul(fact1);
00839 _sse_double_store(rs.c1);
00840 _sse_double_load((*s).c2);
00841 _sse_double_vector_mul(fact1);
00842 _sse_double_store(rs.c2);
00843
00844 um=U+iy*4;
00845 _sse_double_prefetch_su3(um);
00846
00847 _sse_double_load((*sp).c3);
00848 _sse_double_vector_mul(fact3);
00849 _sse_double_su3_multiply((*up));
00850 _sse_double_load((*s).c3);
00851 _sse_double_vector_mul(fact1);
00852 _sse_double_vector_add();
00853 _sse_double_store(rs.c3);
00854
00855 _sse_double_load((*sp).c4);
00856 _sse_double_vector_mul(fact3);
00857 _sse_double_su3_multiply((*up));
00858 _sse_double_load((*s).c4);
00859 _sse_double_vector_mul(fact1);
00860 _sse_double_vector_add();
00861 _sse_double_store(rs.c4);
00862
00863
00864
00865
00866 sp=psi+iup[ix][1];
00867 _sse_double_prefetch_spinor(sp);
00868 up++;
00869 _sse_double_prefetch_su3(up);
00870
00871 _sse_double_load((*sm).c1);
00872 _sse_double_vector_mul(fact3);
00873 _sse_double_su3_inverse_multiply((*um));
00874 _sse_double_load(rs.c1);
00875 _sse_double_vector_add();
00876 _sse_double_store(rs.c1);
00877
00878 _sse_double_load((*sm).c2);
00879 _sse_double_vector_mul(fact3);
00880 _sse_double_su3_inverse_multiply((*um));
00881 _sse_double_load(rs.c2);
00882 _sse_double_vector_add();
00883 _sse_double_store(rs.c2);
00884
00885
00886
00887 iy=idw[ix][1];
00888 sm=psi+iy;
00889 _sse_double_prefetch_spinor(sm);
00890 um=U+iy*4+1;
00891 _sse_double_prefetch_su3(um);
00892
00893 _sse_double_load((*sp).c1);
00894 _sse_double_load_up((*sp).c4);
00895 _sse_double_vector_sub();
00896 _sse_double_su3_multiply((*up));
00897 _sse_double_load(rs.c1);
00898 _sse_double_vector_add();
00899 _sse_double_store(rs.c1);
00900 _sse_double_load(rs.c4);
00901 _sse_double_vector_sub();
00902 _sse_double_store(rs.c4);
00903
00904 _sse_double_load((*sp).c2);
00905 _sse_double_load_up((*sp).c3);
00906 _sse_double_vector_sub();
00907 _sse_double_su3_multiply((*up));
00908 _sse_double_load(rs.c2);
00909 _sse_double_vector_add();
00910 _sse_double_store(rs.c2);
00911 _sse_double_load(rs.c3);
00912 _sse_double_vector_sub();
00913 _sse_double_store(rs.c3);
00914
00915
00916
00917
00918 sp=psi+iup[ix][2];
00919 _sse_double_prefetch_spinor(sp);
00920 up++;
00921 _sse_double_prefetch_su3(up);
00922
00923 _sse_double_load((*sm).c1);
00924 _sse_double_load_up((*sm).c4);
00925 _sse_double_vector_add();
00926 _sse_double_su3_inverse_multiply((*um));
00927 _sse_double_load(rs.c1);
00928 _sse_double_vector_add();
00929 _sse_double_store(rs.c1);
00930 _sse_double_load(rs.c4);
00931 _sse_double_vector_add();
00932 _sse_double_store(rs.c4);
00933
00934 _sse_double_load((*sm).c2);
00935 _sse_double_load_up((*sm).c3);
00936 _sse_double_vector_add();
00937 _sse_double_su3_inverse_multiply((*um));
00938 _sse_double_load(rs.c2);
00939 _sse_double_vector_add();
00940 _sse_double_store(rs.c2);
00941 _sse_double_load(rs.c3);
00942 _sse_double_vector_add();
00943 _sse_double_store(rs.c3);
00944
00945
00946
00947 iy=idw[ix][2];
00948 sm=psi+iy;
00949 _sse_double_prefetch_spinor(sm);
00950 um=U+iy*4+2;
00951 _sse_double_prefetch_su3(um);
00952
00953 _sse_double_load((*sp).c1);
00954 _sse_double_load_up((*sp).c4);
00955 _sse_double_vector_i_mul(); _sse_double_vector_add();
00956 _sse_double_su3_multiply((*up));
00957 _sse_double_load(rs.c1);
00958 _sse_double_vector_add();
00959 _sse_double_store(rs.c1);
00960 _sse_double_load(rs.c4);
00961 _sse_double_vector_i_mul(); _sse_double_vector_sub();
00962 _sse_double_store(rs.c4);
00963
00964 _sse_double_load((*sp).c2);
00965 _sse_double_load_up((*sp).c3);
00966 _sse_double_vector_i_mul(); _sse_double_vector_sub();
00967 _sse_double_su3_multiply((*up));
00968 _sse_double_load(rs.c2);
00969 _sse_double_vector_add();
00970 _sse_double_store(rs.c2);
00971 _sse_double_load(rs.c3);
00972 _sse_double_vector_i_mul(); _sse_double_vector_add();
00973 _sse_double_store(rs.c3);
00974
00975
00976
00977
00978 sp=psi+iup[ix][3];
00979 _sse_double_prefetch_spinor(sp);
00980 up++;
00981 _sse_double_prefetch_su3(up);
00982
00983 _sse_double_load((*sm).c1);
00984 _sse_double_load_up((*sm).c4);
00985 _sse_double_vector_i_mul(); _sse_double_vector_sub();
00986 _sse_double_su3_inverse_multiply((*um));
00987 _sse_double_load(rs.c1);
00988 _sse_double_vector_add();
00989 _sse_double_store(rs.c1);
00990 _sse_double_load(rs.c4);
00991 _sse_double_vector_i_mul(); _sse_double_vector_add();
00992 _sse_double_store(rs.c4);
00993
00994 _sse_double_load((*sm).c2);
00995 _sse_double_load_up((*sm).c3);
00996 _sse_double_vector_i_mul(); _sse_double_vector_add();
00997 _sse_double_su3_inverse_multiply((*um));
00998 _sse_double_load(rs.c2);
00999 _sse_double_vector_add();
01000 _sse_double_store(rs.c2);
01001 _sse_double_load(rs.c3);
01002 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01003 _sse_double_store(rs.c3);
01004
01005
01006
01007 iy=idw[ix][3];
01008 sm=psi+iy;
01009 _sse_double_prefetch_spinor(sm);
01010 um=U+iy*4+3;
01011 _sse_double_prefetch_su3(um);
01012
01013 _sse_double_load((*sp).c1);
01014 _sse_double_load_up((*sp).c3);
01015 _sse_double_vector_sub();
01016 _sse_double_su3_multiply((*up));
01017 _sse_double_load(rs.c1);
01018 _sse_double_vector_add();
01019 _sse_double_store(rs.c1);
01020 _sse_double_load(rs.c3);
01021 _sse_double_vector_sub();
01022 _sse_double_store(rs.c3);
01023
01024 _sse_double_load((*sp).c2);
01025 _sse_double_load_up((*sp).c4);
01026 _sse_double_vector_add();
01027 _sse_double_su3_multiply((*up));
01028 _sse_double_load(rs.c2);
01029 _sse_double_vector_add();
01030 _sse_double_store(rs.c2);
01031 _sse_double_load(rs.c4);
01032 _sse_double_vector_add();
01033 _sse_double_store(rs.c4);
01034
01035
01036
01037 sn=chi+ix;
01038 _sse_double_prefetch_spinor(sn);
01039
01040 iz=ix+1;
01041 if (iz<stop) {
01042 sp=psi+iup[iz][0];
01043 _sse_double_prefetch_spinor(sp);
01044 up=U+iz*4;
01045 _sse_double_prefetch_su3(up);
01046 }
01047
01048 _sse_double_load((*sm).c1);
01049 _sse_double_load_up((*sm).c3);
01050 _sse_double_vector_add();
01051 _sse_double_su3_inverse_multiply((*um));
01052 _sse_double_load(rs.c1);
01053 _sse_double_vector_add();
01054 _sse_double_vector_mul(fact2);
01055 _sse_double_store((*sn).c1);
01056 _sse_double_load(rs.c3);
01057 _sse_double_vector_add();
01058 _sse_double_vector_mul(fact2);
01059 _sse_double_store((*sn).c3);
01060
01061 _sse_double_load((*sm).c2);
01062 _sse_double_load_up((*sm).c4);
01063 _sse_double_vector_sub();
01064 _sse_double_su3_inverse_multiply((*um));
01065 _sse_double_load(rs.c2);
01066 _sse_double_vector_add();
01067 _sse_double_vector_mul(fact2);
01068 _sse_double_store((*sn).c2);
01069 _sse_double_load(rs.c4);
01070 _sse_double_vector_sub();
01071 _sse_double_vector_mul(fact2);
01072 _sse_double_store((*sn).c4);
01073
01074
01075 }
01076
01077 if(cSW==0) return;
01078
01079
01080
01081
01082
01083
01084
01085 um=uem+6*start;
01086
01087 for (ix=start; ix<stop; ix++) {
01088 s=psi+ix;
01089 _sse_double_prefetch_spinor(s);
01090
01091
01092
01093
01094 _sse_double_prefetch_su3(um+1);
01095
01096 _sse_double_load((*s).c4);
01097 _sse_double_vector_mul(fact4);
01098 _sse_double_su3_multiply((*um));
01099
01100 _sse_double_load(r0.c1);
01101 _sse_double_vector_add();
01102 _sse_double_store(rs.c1);
01103
01104 _sse_double_load((*s).c3);
01105 _sse_double_vector_mul(fact4);
01106 _sse_double_su3_multiply((*um));
01107
01108 _sse_double_load(r0.c2);
01109 _sse_double_vector_add();
01110 _sse_double_store(rs.c2);
01111
01112 _sse_double_load((*s).c2);
01113 _sse_double_su3_multiply((*um));
01114 _sse_double_load(r0.c3);
01115 _sse_double_vector_add();
01116 _sse_double_store(rs.c3);
01117
01118 _sse_double_load((*s).c1);
01119 _sse_double_su3_multiply((*um));
01120 _sse_double_load(r0.c4);
01121 _sse_double_vector_add();
01122 _sse_double_store(rs.c4);
01123
01124 um++;
01125
01126
01127
01128 _sse_double_prefetch_su3(um+1);
01129
01130 _sse_double_load((*s).c4);
01131 _sse_double_su3_multiply((*um));
01132 _sse_double_load(rs.c1);
01133 _sse_double_vector_i_mul(); _sse_double_vector_add();
01134 _sse_double_store(rs.c1);
01135
01136 _sse_double_load((*s).c3);
01137 _sse_double_su3_multiply((*um));
01138 _sse_double_load(rs.c2);
01139 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01140 _sse_double_store(rs.c2);
01141
01142 _sse_double_load((*s).c2);
01143 _sse_double_su3_multiply((*um));
01144 _sse_double_load(rs.c3);
01145 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01146 _sse_double_store(rs.c3);
01147
01148 _sse_double_load((*s).c1);
01149 _sse_double_su3_multiply((*um));
01150 _sse_double_load(rs.c4);
01151 _sse_double_vector_i_mul(); _sse_double_vector_add();
01152 _sse_double_store(rs.c4);
01153
01154 um++;
01155
01156
01157
01158 _sse_double_prefetch_su3(um+1);
01159
01160 _sse_double_load((*s).c3);
01161 _sse_double_su3_multiply((*um));
01162 _sse_double_load(rs.c1);
01163 _sse_double_vector_sub();
01164 _sse_double_vector_mul(fact6);
01165 _sse_double_store(rs.c1);
01166
01167 _sse_double_load((*s).c4);
01168 _sse_double_su3_multiply((*um));
01169 _sse_double_load(rs.c2);
01170 _sse_double_vector_add();
01171 _sse_double_vector_mul(fact6);
01172 _sse_double_store(rs.c2);
01173
01174 _sse_double_load((*s).c1);
01175 _sse_double_su3_multiply((*um));
01176 _sse_double_load(rs.c3);
01177 _sse_double_vector_add();
01178 _sse_double_vector_mul(fact6);
01179 _sse_double_store(rs.c3);
01180
01181 _sse_double_load((*s).c2);
01182 _sse_double_su3_multiply((*um));
01183 _sse_double_load(rs.c4);
01184 _sse_double_vector_sub();
01185 _sse_double_vector_mul(fact6);
01186 _sse_double_store(rs.c4);
01187
01188 um++;
01189
01190
01191
01192 _sse_double_prefetch_su3(um+1);
01193
01194 _sse_double_load((*s).c1);
01195 _sse_double_su3_multiply((*um));
01196 _sse_double_load(rs.c1);
01197 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01198 _sse_double_store(rs.c1);
01199
01200 _sse_double_load((*s).c2);
01201 _sse_double_su3_multiply((*um));
01202 _sse_double_load(rs.c2);
01203 _sse_double_vector_i_mul(); _sse_double_vector_add();
01204 _sse_double_store(rs.c2);
01205
01206 _sse_double_load((*s).c3);
01207 _sse_double_su3_multiply((*um));
01208 _sse_double_load(rs.c3);
01209 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01210 _sse_double_store(rs.c3);
01211
01212 _sse_double_load((*s).c4);
01213 _sse_double_su3_multiply((*um));
01214 _sse_double_load(rs.c4);
01215 _sse_double_vector_i_mul(); _sse_double_vector_add();
01216 _sse_double_store(rs.c4);
01217
01218 um++;
01219
01220
01221
01222 _sse_double_prefetch_su3(um+1);
01223
01224 _sse_double_load((*s).c2);
01225 _sse_double_su3_multiply((*um));
01226 _sse_double_load(rs.c1);
01227 _sse_double_vector_add();
01228 _sse_double_store(rs.c1);
01229
01230 _sse_double_load((*s).c1);
01231 _sse_double_su3_multiply((*um));
01232 _sse_double_load(rs.c2);
01233 _sse_double_vector_sub();
01234 _sse_double_store(rs.c2);
01235
01236 _sse_double_load((*s).c4);
01237 _sse_double_su3_multiply((*um));
01238 _sse_double_load(rs.c3);
01239 _sse_double_vector_add();
01240 _sse_double_store(rs.c3);
01241
01242 _sse_double_load((*s).c3);
01243 _sse_double_su3_multiply((*um));
01244 _sse_double_load(rs.c4);
01245 _sse_double_vector_sub();
01246 _sse_double_store(rs.c4);
01247
01248 um++;
01249
01250
01251
01252 sn=(_sse_spinor*) &chi[ix];
01253 _sse_double_prefetch_spinor(sn);
01254
01255 _sse_double_load((*s).c2);
01256 _sse_double_su3_multiply((*um));
01257 _sse_double_load(rs.c1);
01258 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01259 _sse_double_store(rs.c1);
01260
01261 _sse_double_load((*s).c1);
01262 _sse_double_su3_multiply((*um));
01263 _sse_double_load(rs.c2);
01264 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01265 _sse_double_store(rs.c2);
01266
01267 _sse_double_load((*s).c4);
01268 _sse_double_su3_multiply((*um));
01269 _sse_double_load(rs.c3);
01270 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01271 _sse_double_store(rs.c3);
01272
01273 _sse_double_load((*s).c3);
01274 _sse_double_su3_multiply((*um));
01275 _sse_double_load(rs.c4);
01276 _sse_double_vector_i_mul(); _sse_double_vector_sub();
01277 _sse_double_store(rs.c4);
01278
01279 um++;
01280 if(ix<stop) {
01281 _sse_double_prefetch_su3(um);
01282 }
01283
01284 _sse_double_load_up((*sn).c1);
01285 _sse_double_load(rs.c1);
01286 _sse_double_vector_mul(fact5);
01287 _sse_double_vector_add();
01288 _sse_double_store((*sn).c1);
01289
01290 _sse_double_load_up((*sn).c2);
01291 _sse_double_load(rs.c2);
01292 _sse_double_vector_mul(fact5);
01293 _sse_double_vector_add();
01294 _sse_double_store((*sn).c2);
01295
01296 _sse_double_load_up((*sn).c3);
01297 _sse_double_load(rs.c3);
01298 _sse_double_vector_mul(fact5);
01299 _sse_double_vector_add();
01300 _sse_double_store((*sn).c3);
01301
01302 _sse_double_load_up((*sn).c4);
01303 _sse_double_load(rs.c4);
01304 _sse_double_vector_mul(fact5);
01305 _sse_double_vector_add();
01306 _sse_double_store((*sn).c4);
01307
01308
01309 }
01310 #endif // if defined(USE_DOUBLE_PRECISION)
01311
01312 }
01313 };
01314
01315 #endif // if defined(SSE2)
01316
01317