00001
00002
00003
00004
00005
00006
00007
00008
00009
00011
00012 #if defined(SSE2)
00013
00029 class DwFermiActionSSE2 {
00030 public:
00031 static void mul_Q(dwfermi_field &chi_out,
00032 dwfermi_field &psi_in,
00033 gauge_field &U_in,
00034 coefficients &coeff) {
00035
00036 if(psi_in.nspin!=4) error("fermiqcd_dwfermi_algorithms/dwfermi_mul_Q_ONE: nspin!=4");
00037 if(psi_in.nc!=U_in.nc) error("fermiqcd_dwfermi_algorithms/dwfermi_mul_Q_ONE: gauge and spinor have different nc");
00038
00039 register int ndim=psi_in.lattice().ndim;
00040 register int nspin=psi_in.nspin;
00041 register int nc=psi_in.nc;
00042 register int L5=psi_in.L5;
00043 register mdp_real m_5,m_f,sign;
00044 if(coeff.has_key("m_5")) m_5=coeff["m_5"];
00045 else error("coefficients m_5 undeclared");
00046 if(coeff.has_key("m_f")) m_f=coeff["m_f"];
00047 else error("coefficients m_f undeclared");
00048 if(coeff.has_key("sign")) sign=coeff["sign"];
00049 else sign=1;
00050
00051
00052 register mdp_real kappa5=0.5/(m_5-6.0);
00053 register mdp_real kappaf=-m_f*kappa5;
00054
00055 #if !defined(USE_DOUBLE_PRECISION)
00056
00057 _sse_spinor *chi=(_sse_spinor*) chi_out.physical_address();
00058 _sse_check_alignment((void*) chi, 0xf);
00059 _sse_spinor *psi=(_sse_spinor*) psi_in.physical_address();
00060 _sse_check_alignment((void*) psi, 0xf);
00061 _sse_su3 *U=(_sse_su3*) U_in.physical_address();
00062 _sse_check_alignment((void*) U, 0xf);
00063 mdp_int **iup=U_in.lattice().up;
00064 mdp_int **idw=U_in.lattice().dw;
00065 mdp_int start=U_in.lattice().start_index(ME,0);
00066 mdp_int stop =U_in.lattice().stop_index(ME,1);
00067
00068 _sse_float fact1 ALIGN16;
00069 _sse_float fact2 ALIGN16;
00070 _sse_float fact3 ALIGN16;
00071 _sse_float fact4 ALIGN16;
00072 _sse_vector r12_1 ALIGN16;
00073 _sse_vector r34_1 ALIGN16;
00074 _sse_vector r12_2 ALIGN16;
00075 _sse_vector r34_2 ALIGN16;
00076 _sse_vector r0 ALIGN16;
00077 mdp_int ix1,iy1,iy2,iz1;
00078 float rho;
00079 _sse_su3 *up1,*um1,*um2;
00080 _sse_spinor *s1,*sp1,*sp2,*sm1,*sm2,*sn1;
00081
00082 if(sign!=1) exit(1);
00083
00084 if((stop-start)%2 !=0)
00085 error("DWFermiActionSSE2\nProblem with parallelization: odd # of sites on process!");
00086
00087 _sse_check_alignment((void*) &fact1, 0xf);
00088 _sse_check_alignment((void*) &fact2, 0xf);
00089 _sse_check_alignment((void*) &fact3, 0xf);
00090 _sse_check_alignment((void*) &fact4, 0xf);
00091 _sse_check_alignment((void*) &r12_1, 0xf);
00092 _sse_check_alignment((void*) &r34_1, 0xf);
00093 _sse_check_alignment((void*) &r12_2, 0xf);
00094 _sse_check_alignment((void*) &r34_2, 0xf);
00095
00096
00097
00098 r0.c1.c1=r0.c1.c2=r0.c1.c3=r0.c1.c4=0;
00099 r0.c2.c1=r0.c2.c2=r0.c2.c3=r0.c2.c4=0;
00100 r0.c3.c1=r0.c3.c2=r0.c3.c3=r0.c3.c4=0;
00101
00102 rho=-1.0f/kappa5;
00103
00104
00105
00106 fact1.c1=rho;
00107 fact1.c2=rho;
00108 fact1.c3=rho;
00109 fact1.c4=rho;
00110
00111 fact2.c1=-1.0f*kappa5;
00112 fact2.c2=fact2.c1;
00113 fact2.c3=fact2.c1;
00114 fact2.c4=fact2.c1;
00115
00116
00117
00118 fact3.c1=2.0f;
00119 fact3.c2=fact3.c1;
00120 fact3.c3=fact3.c1;
00121 fact3.c4=fact3.c1;
00122
00123 fact4.c1=-1.0f;
00124 fact4.c2=-1.0f;
00125 fact4.c3=-1.0f;
00126 fact4.c4=-1.0f;
00127
00128
00129
00130
00131
00132
00133
00134 for (ix1=start; ix1<stop; ix1++) {
00135
00136 up1=(_sse_su3*) U+4*ix1;
00137
00138 for(int ell=0; ell<L5; ell++) {
00139
00140 s1=psi+ix1*L5+ell;
00141 _sse_float_prefetch_spinor(s1);
00142
00143
00144
00145 sm1=psi+(iy1=idw[ix1][0])*L5+ell;
00146 sp1=psi+iup[ix1][0]*L5+ell;
00147 _sse_float_prefetch_spinor(sm1);
00148
00149 _sse_float_pair_load((*sp1).c3,(*sp1).c4);
00150 _sse_float_vector_mul(fact3);
00151 _sse_float_su3_multiply((*up1));
00152 _sse_float_pair_load((*s1).c1,(*s1).c2);
00153 _sse_float_vector_mul(fact1);
00154 _sse_float_vector_store(r12_1);
00155
00156 _sse_float_pair_load((*s1).c3,(*s1).c4);
00157 _sse_float_vector_mul(fact1);
00158 _sse_float_vector_add();
00159 _sse_float_vector_store(r34_1);
00160
00161 um1=U+iy1*4;
00162 _sse_float_prefetch_su3(um1);
00163
00164
00165
00166
00167 sp1=psi+iup[ix1][1]*L5+ell;
00168 _sse_float_prefetch_spinor(sp1);
00169
00170 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00171 _sse_float_vector_mul(fact3);
00172 _sse_float_su3_inverse_multiply((*um1));
00173
00174 _sse_float_vector_load(r12_1);
00175 _sse_float_vector_add();
00176 _sse_float_vector_store(r12_1);
00177
00178 up1++;
00179 _sse_float_prefetch_su3(up1);
00180
00181
00182
00183 sm1=psi+(iy1=idw[ix1][1])*L5+ell;
00184 _sse_float_prefetch_spinor(sm1);
00185
00186 _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00187 _sse_float_pair_load_up((*sp1).c4,(*sp1).c3);
00188 _sse_float_vector_sub();
00189
00190 _sse_float_su3_multiply((*up1));
00191
00192 _sse_float_vector_load(r12_1);
00193 _sse_float_vector_add();
00194 _sse_float_vector_store(r12_1);
00195
00196 _sse_float_vector_load(r34_1);
00197 _sse_float_vector_xch();
00198 _sse_float_vector_sub();
00199 _sse_float_vector_store(r34_1);
00200
00201 um1=U+iy1*4+1;
00202 _sse_float_prefetch_su3(um1);
00203
00204
00205
00206 sp1=psi+iup[ix1][2]*L5;
00207 _sse_float_prefetch_spinor(sp1);
00208
00209 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00210 _sse_float_pair_load_up((*sm1).c4,(*sm1).c3);
00211 _sse_float_vector_add();
00212
00213 _sse_float_su3_inverse_multiply((*um1));
00214
00215 _sse_float_vector_load(r12_1);
00216 _sse_float_vector_add();
00217 _sse_float_vector_store(r12_1);
00218
00219 _sse_float_vector_load(r34_1);
00220 _sse_float_vector_xch();
00221 _sse_float_vector_add();
00222 _sse_float_vector_store(r34_1);
00223
00224 up1++;
00225 _sse_float_prefetch_su3(up1);
00226
00227
00228
00229 sm1=psi+(iy1=idw[ix1][2])*L5+ell;
00230 _sse_float_prefetch_spinor(sm1);
00231
00232 _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00233 _sse_float_pair_load_up((*sp1).c4,(*sp1).c3);
00234 _sse_float_vector_i_addsub();
00235
00236 _sse_float_su3_multiply((*up1));
00237
00238 _sse_float_vector_load(r12_1);
00239 _sse_float_vector_add();
00240 _sse_float_vector_store(r12_1);
00241
00242 _sse_float_vector_load(r34_1);
00243 _sse_float_vector_xch();
00244 _sse_float_vector_i_addsub();
00245 _sse_float_vector_store(r34_1);
00246
00247 um1=U+iy1*4+2;
00248 _sse_float_prefetch_su3(um1);
00249
00250
00251
00252 sp1=psi+iup[ix1][3]*L5+ell;
00253 _sse_float_prefetch_spinor(sp1);
00254
00255 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00256 _sse_float_pair_load_up((*sm1).c4,(*sm1).c3);
00257 _sse_float_vector_i_subadd();
00258
00259 _sse_float_su3_inverse_multiply((*um1));
00260
00261 _sse_float_vector_load(r12_1);
00262 _sse_float_vector_add();
00263 _sse_float_vector_store(r12_1);
00264
00265 _sse_float_vector_load(r34_1);
00266 _sse_float_vector_xch();
00267 _sse_float_vector_i_subadd();
00268 _sse_float_vector_store(r34_1);
00269
00270 up1++;
00271 _sse_float_prefetch_su3(up1);
00272
00273
00274
00275 sm1=psi+(iy1=idw[ix1][3])*L5+ell;
00276 _sse_float_prefetch_spinor(sm1);
00277
00278 _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00279 _sse_float_pair_load_up((*sp1).c3,(*sp1).c4);
00280 _sse_float_vector_subadd();
00281
00282 _sse_float_su3_multiply((*up1));
00283
00284 _sse_float_vector_load(r12_1);
00285 _sse_float_vector_add();
00286 _sse_float_vector_store(r12_1);
00287
00288 _sse_float_vector_load(r34_1);
00289 _sse_float_vector_subadd();
00290 _sse_float_vector_store(r34_1);
00291
00292 um1=U+iy1*4+3;
00293 _sse_float_prefetch_su3(um1);
00294
00295
00296
00297 sn1=chi+ix1*L5+ell;
00298 _sse_float_prefetch_spinor(sn1);
00299
00300 _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00301 _sse_float_pair_load_up((*sm1).c3,(*sm1).c4);
00302 _sse_float_vector_addsub();
00303
00304 _sse_float_su3_inverse_multiply((*um1));
00305
00306 _sse_float_vector_load(r12_1);
00307 _sse_float_vector_add();
00308 _sse_float_vector_mul(fact2);
00309 _sse_float_pair_store((*sn1).c1,(*sn1).c2);
00310
00311 _sse_float_vector_load(r34_1);
00312 _sse_float_vector_addsub();
00313 _sse_float_vector_mul(fact2);
00314 _sse_float_pair_store((*sn1).c3,(*sn1).c4);
00315
00316
00317
00318 }
00319 }
00320 #else
00321
00322 error("NOT IMPLEMENTED SORRY");
00323
00324 _sse_spinor *chi=(_sse_spinor*) chi_out.physical_address();
00325 _sse_spinor *psi=(_sse_spinor*) psi_in.physical_address();
00326 _sse_su3 *U=(_sse_su3*) U_in.physical_address();
00327 _sse_su3 *uem=(_sse_su3*) U_in.em.physical_address();
00328 mdp_int **iup=U_in.lattice().up;
00329 mdp_int **idw=U_in.lattice().dw;
00330 mdp_int start=U_in.lattice().start_index(ME,0);
00331 mdp_int stop =U_in.lattice().stop_index(ME,1);
00332
00333 _sse_double fact1 ALIGN16;
00334 _sse_double fact2 ALIGN16;
00335 _sse_double fact3 ALIGN16;
00336 _sse_double fact4 ALIGN16;
00337 _sse_double fact5 ALIGN16;
00338 _sse_double fact6 ALIGN16;
00339 _sse_spinor rs ALIGN16;
00340 _sse_spinor r0 ALIGN16;
00341 mdp_int ix,iy,iz;
00342 double rho;
00343 _sse_su3 *up, *um;
00344 _sse_spinor *s,*sp,*sm,*sn;
00345
00346 if(sign!=1) exit(1);
00347 if((stop-start)%2 !=0)
00348 error("FermiCloverActionSSE2\nProblem with parallelization: odd # of sites on process!");
00349
00350 if(r_t!=1.0)
00351 error("FermiCloverActionSSE2\nr_t!=1 not compatible with SSE2\n");
00352
00353 _sse_check_alignment((void*) &fact1, 0xf);
00354 _sse_check_alignment((void*) &fact2, 0xf);
00355 _sse_check_alignment((void*) &fact3, 0xf);
00356 _sse_check_alignment((void*) &fact4, 0xf);
00357 _sse_check_alignment((void*) &fact5, 0xf);
00358 _sse_check_alignment((void*) &fact6, 0xf);
00359 _sse_check_alignment((void*) &rs, 0xf);
00360 _sse_check_alignment((void*) &r0, 0xf);
00361
00362 r0.c1.c1.real()=r0.c1.c2.real()=r0.c1.c3.real()=0;
00363 r0.c2.c1.real()=r0.c2.c2.real()=r0.c2.c3.real()=0;
00364 r0.c3.c1.real()=r0.c3.c2.real()=r0.c3.c3.real()=0;
00365 r0.c4.c1.real()=r0.c4.c2.real()=r0.c4.c3.real()=0;
00366
00367 rho=-1.0/kappa_s;
00368
00369 fact1.c1=rho;
00370 fact1.c2=rho;
00371
00372 fact2.c1=-1.0*kappa_s;
00373 fact2.c2=fact2.c1;
00374
00375 fact3.c1=(1.0+r_t)*kappa_t/kappa_s;
00376 fact3.c2=fact3.c1;
00377
00378 fact4.c1=-1.0;
00379 fact4.c2=-1.0;
00380
00381 sp=(_sse_spinor*) &psi[iup[start][0]];
00382 up=(_sse_su3*) U+4*start;
00383
00384
00385
00386 for (ix=start; ix<stop; ix++) {
00387 s=psi+ix;
00388 _sse_double_prefetch_spinor(s);
00389
00390
00391
00392 iy=idw[ix][0];
00393 sm=psi+iy;
00394 _sse_double_prefetch_spinor(sm);
00395
00396 _sse_double_load((*s).c1);
00397 _sse_double_vector_mul(fact1);
00398 _sse_double_store(rs.c1);
00399 _sse_double_load((*s).c2);
00400 _sse_double_vector_mul(fact1);
00401 _sse_double_store(rs.c2);
00402
00403 um=U+iy*4;
00404 _sse_double_prefetch_su3(um);
00405
00406 _sse_double_load((*sp).c3);
00407 _sse_double_vector_mul(fact3);
00408 _sse_double_su3_multiply((*up));
00409 _sse_double_load((*s).c3);
00410 _sse_double_vector_mul(fact1);
00411 _sse_double_vector_add();
00412 _sse_double_store(rs.c3);
00413
00414 _sse_double_load((*sp).c4);
00415 _sse_double_vector_mul(fact3);
00416 _sse_double_su3_multiply((*up));
00417 _sse_double_load((*s).c4);
00418 _sse_double_vector_mul(fact1);
00419 _sse_double_vector_add();
00420 _sse_double_store(rs.c4);
00421
00422
00423
00424
00425 sp=psi+iup[ix][1];
00426 _sse_double_prefetch_spinor(sp);
00427 up++;
00428 _sse_double_prefetch_su3(up);
00429
00430 _sse_double_load((*sm).c1);
00431 _sse_double_vector_mul(fact3);
00432 _sse_double_su3_inverse_multiply((*um));
00433 _sse_double_load(rs.c1);
00434 _sse_double_vector_add();
00435 _sse_double_store(rs.c1);
00436
00437 _sse_double_load((*sm).c2);
00438 _sse_double_vector_mul(fact3);
00439 _sse_double_su3_inverse_multiply((*um));
00440 _sse_double_load(rs.c2);
00441 _sse_double_vector_add();
00442 _sse_double_store(rs.c2);
00443
00444
00445
00446 iy=idw[ix][1];
00447 sm=psi+iy;
00448 _sse_double_prefetch_spinor(sm);
00449 um=U+iy*4+1;
00450 _sse_double_prefetch_su3(um);
00451
00452 _sse_double_load((*sp).c1);
00453 _sse_double_load_up((*sp).c4);
00454 _sse_double_vector_sub();
00455 _sse_double_su3_multiply((*up));
00456 _sse_double_load(rs.c1);
00457 _sse_double_vector_add();
00458 _sse_double_store(rs.c1);
00459 _sse_double_load(rs.c4);
00460 _sse_double_vector_sub();
00461 _sse_double_store(rs.c4);
00462
00463 _sse_double_load((*sp).c2);
00464 _sse_double_load_up((*sp).c3);
00465 _sse_double_vector_sub();
00466 _sse_double_su3_multiply((*up));
00467 _sse_double_load(rs.c2);
00468 _sse_double_vector_add();
00469 _sse_double_store(rs.c2);
00470 _sse_double_load(rs.c3);
00471 _sse_double_vector_sub();
00472 _sse_double_store(rs.c3);
00473
00474
00475
00476
00477 sp=psi+iup[ix][2];
00478 _sse_double_prefetch_spinor(sp);
00479 up++;
00480 _sse_double_prefetch_su3(up);
00481
00482 _sse_double_load((*sm).c1);
00483 _sse_double_load_up((*sm).c4);
00484 _sse_double_vector_add();
00485 _sse_double_su3_inverse_multiply((*um));
00486 _sse_double_load(rs.c1);
00487 _sse_double_vector_add();
00488 _sse_double_store(rs.c1);
00489 _sse_double_load(rs.c4);
00490 _sse_double_vector_add();
00491 _sse_double_store(rs.c4);
00492
00493 _sse_double_load((*sm).c2);
00494 _sse_double_load_up((*sm).c3);
00495 _sse_double_vector_add();
00496 _sse_double_su3_inverse_multiply((*um));
00497 _sse_double_load(rs.c2);
00498 _sse_double_vector_add();
00499 _sse_double_store(rs.c2);
00500 _sse_double_load(rs.c3);
00501 _sse_double_vector_add();
00502 _sse_double_store(rs.c3);
00503
00504
00505
00506 iy=idw[ix][2];
00507 sm=psi+iy;
00508 _sse_double_prefetch_spinor(sm);
00509 um=U+iy*4+2;
00510 _sse_double_prefetch_su3(um);
00511
00512 _sse_double_load((*sp).c1);
00513 _sse_double_load_up((*sp).c4);
00514 _sse_double_vector_i_mul(); _sse_double_vector_add();
00515 _sse_double_su3_multiply((*up));
00516 _sse_double_load(rs.c1);
00517 _sse_double_vector_add();
00518 _sse_double_store(rs.c1);
00519 _sse_double_load(rs.c4);
00520 _sse_double_vector_i_mul(); _sse_double_vector_sub();
00521 _sse_double_store(rs.c4);
00522
00523 _sse_double_load((*sp).c2);
00524 _sse_double_load_up((*sp).c3);
00525 _sse_double_vector_i_mul(); _sse_double_vector_sub();
00526 _sse_double_su3_multiply((*up));
00527 _sse_double_load(rs.c2);
00528 _sse_double_vector_add();
00529 _sse_double_store(rs.c2);
00530 _sse_double_load(rs.c3);
00531 _sse_double_vector_i_mul(); _sse_double_vector_add();
00532 _sse_double_store(rs.c3);
00533
00534
00535
00536
00537 sp=psi+iup[ix][3];
00538 _sse_double_prefetch_spinor(sp);
00539 up++;
00540 _sse_double_prefetch_su3(up);
00541
00542 _sse_double_load((*sm).c1);
00543 _sse_double_load_up((*sm).c4);
00544 _sse_double_vector_i_mul(); _sse_double_vector_sub();
00545 _sse_double_su3_inverse_multiply((*um));
00546 _sse_double_load(rs.c1);
00547 _sse_double_vector_add();
00548 _sse_double_store(rs.c1);
00549 _sse_double_load(rs.c4);
00550 _sse_double_vector_i_mul(); _sse_double_vector_add();
00551 _sse_double_store(rs.c4);
00552
00553 _sse_double_load((*sm).c2);
00554 _sse_double_load_up((*sm).c3);
00555 _sse_double_vector_i_mul(); _sse_double_vector_add();
00556 _sse_double_su3_inverse_multiply((*um));
00557 _sse_double_load(rs.c2);
00558 _sse_double_vector_add();
00559 _sse_double_store(rs.c2);
00560 _sse_double_load(rs.c3);
00561 _sse_double_vector_i_mul(); _sse_double_vector_sub();
00562 _sse_double_store(rs.c3);
00563
00564
00565
00566 iy=idw[ix][3];
00567 sm=psi+iy;
00568 _sse_double_prefetch_spinor(sm);
00569 um=U+iy*4+3;
00570 _sse_double_prefetch_su3(um);
00571
00572 _sse_double_load((*sp).c1);
00573 _sse_double_load_up((*sp).c3);
00574 _sse_double_vector_sub();
00575 _sse_double_su3_multiply((*up));
00576 _sse_double_load(rs.c1);
00577 _sse_double_vector_add();
00578 _sse_double_store(rs.c1);
00579 _sse_double_load(rs.c3);
00580 _sse_double_vector_sub();
00581 _sse_double_store(rs.c3);
00582
00583 _sse_double_load((*sp).c2);
00584 _sse_double_load_up((*sp).c4);
00585 _sse_double_vector_add();
00586 _sse_double_su3_multiply((*up));
00587 _sse_double_load(rs.c2);
00588 _sse_double_vector_add();
00589 _sse_double_store(rs.c2);
00590 _sse_double_load(rs.c4);
00591 _sse_double_vector_add();
00592 _sse_double_store(rs.c4);
00593
00594
00595
00596 sn=chi+ix;
00597 _sse_double_prefetch_spinor(sn);
00598
00599 iz=ix+1;
00600 if (iz<stop) {
00601 sp=psi+iup[iz][0];
00602 _sse_double_prefetch_spinor(sp);
00603 up=U+iz*4;
00604 _sse_double_prefetch_su3(up);
00605 }
00606
00607 _sse_double_load((*sm).c1);
00608 _sse_double_load_up((*sm).c3);
00609 _sse_double_vector_add();
00610 _sse_double_su3_inverse_multiply((*um));
00611 _sse_double_load(rs.c1);
00612 _sse_double_vector_add();
00613 _sse_double_vector_mul(fact2);
00614 _sse_double_store((*sn).c1);
00615 _sse_double_load(rs.c3);
00616 _sse_double_vector_add();
00617 _sse_double_vector_mul(fact2);
00618 _sse_double_store((*sn).c3);
00619
00620 _sse_double_load((*sm).c2);
00621 _sse_double_load_up((*sm).c4);
00622 _sse_double_vector_sub();
00623 _sse_double_su3_inverse_multiply((*um));
00624 _sse_double_load(rs.c2);
00625 _sse_double_vector_add();
00626 _sse_double_vector_mul(fact2);
00627 _sse_double_store((*sn).c2);
00628 _sse_double_load(rs.c4);
00629 _sse_double_vector_sub();
00630 _sse_double_vector_mul(fact2);
00631 _sse_double_store((*sn).c4);
00632
00633
00634 }
00635
00636 #endif // if defined(USE_DOUBLE_PRECISION)
00637
00638 }
00639 };
00640
00641 #endif // if defined(SSE2)
00642
00643