It seems like the `Ax` and `Ay` arrays are somehow mixed up in GPUE, and the current master branch does not evolve correctly with rotation. The following `apply_gauge(...)` function works: ```cuda // 3D void apply_gauge(Grid &par, double2 *wfc, double2 *Ax, double2 *Ay, double2 *Az, double renorm_factor_x, double renorm_factor_y, double renorm_factor_z, bool flip, cufftHandle plan_1d, cufftHandle plan_dim2, cufftHandle plan_dim3, double dx, double dy, double dz, double time, int yDim, int size){ dim3 grid = par.grid; dim3 threads = par.threads; if (flip){ // 1d forward / mult by Ay cufftHandleError( cufftExecZ2Z(plan_1d, wfc, wfc, CUFFT_FORWARD) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_z, wfc); cudaCheckError(); if(par.bval("Ay_time")){ EqnNode_gpu* Ay_eqn = par.astval("Ay"); int e_num = par.ival("Ay_num"); ast_cmult<<<grid,threads>>>(wfc, wfc, Ay_eqn, dx, dy, dz, time, e_num); cudaCheckError(); } else{ cMult<<<grid,threads>>>(wfc, (cufftDoubleComplex*) Ay, wfc); cudaCheckError(); } cufftHandleError( cufftExecZ2Z(plan_1d, wfc, wfc, CUFFT_INVERSE) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_z, wfc); cudaCheckError(); // loop to multiply by Ax for (int i = 0; i < yDim; i++){ cufftHandleError( cufftExecZ2Z(plan_dim2, &wfc[i*size], &wfc[i*size], CUFFT_FORWARD) ); } scalarMult<<<grid,threads>>>(wfc, renorm_factor_y, wfc); cudaCheckError(); if(par.bval("Ax_time")){ EqnNode_gpu* Ax_eqn = par.astval("Ax"); int e_num = par.ival("Ax_num"); ast_cmult<<<grid,threads>>>(wfc, wfc, Ax_eqn, dx, dy, dz, time, e_num); cudaCheckError(); } else{ cMult<<<grid,threads>>>(wfc, (cufftDoubleComplex*) Ax, wfc); cudaCheckError(); } for (int i = 0; i < yDim; i++){ //size = xDim * zDim; cufftHandleError( cufftExecZ2Z(plan_dim2, &wfc[i*size], &wfc[i*size], CUFFT_INVERSE) ); } scalarMult<<<grid,threads>>>(wfc, renorm_factor_y, wfc); cudaCheckError(); // 1D FFT to Az cufftHandleError( cufftExecZ2Z(plan_dim3, wfc, wfc, CUFFT_FORWARD) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_x, wfc); cudaCheckError(); if(par.bval("Az_time")){ EqnNode_gpu* Az_eqn = par.astval("Az"); int e_num = par.ival("Az_num"); ast_cmult<<<grid,threads>>>(wfc, wfc, Az_eqn, dx, dy, dz, time, e_num); cudaCheckError(); } else{ cMult<<<grid,threads>>>(wfc, (cufftDoubleComplex*) Az, wfc); cudaCheckError(); } cufftHandleError( cufftExecZ2Z(plan_dim3, wfc, wfc, CUFFT_INVERSE) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_x, wfc); cudaCheckError(); } else{ // 1D FFT to Az cufftHandleError( cufftExecZ2Z(plan_dim3, wfc, wfc, CUFFT_FORWARD) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_z, wfc); cudaCheckError(); if(par.bval("Az_time")){ EqnNode_gpu* Az_eqn = par.astval("Az"); int e_num = par.ival("Az_num"); ast_cmult<<<grid,threads>>>(wfc, wfc, Az_eqn, dx, dy, dz, time, e_num); cudaCheckError(); } else{ cMult<<<grid,threads>>>(wfc, (cufftDoubleComplex*) Az, wfc); cudaCheckError(); } cufftHandleError( cufftExecZ2Z(plan_dim3, wfc, wfc, CUFFT_INVERSE) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_z, wfc); cudaCheckError(); // loop to multiply by Ax for (int i = 0; i < yDim; i++){ cufftHandleError( cufftExecZ2Z(plan_dim2, &wfc[i*size], &wfc[i*size], CUFFT_FORWARD) ); } scalarMult<<<grid,threads>>>(wfc, renorm_factor_x, wfc); cudaCheckError(); if(par.bval("Ax_time")){ EqnNode_gpu* Ax_eqn = par.astval("Ax"); int e_num = par.ival("Ax_num"); ast_cmult<<<grid,threads>>>(wfc, wfc, Ax_eqn, dx, dy, dz, time, e_num); cudaCheckError(); } else{ cMult<<<grid,threads>>>(wfc, (cufftDoubleComplex*) Ax, wfc); cudaCheckError(); } for (int i = 0; i < yDim; i++){ //size = xDim * zDim; cufftHandleError( cufftExecZ2Z(plan_dim2, &wfc[i*size], &wfc[i*size], CUFFT_INVERSE) ); } scalarMult<<<grid,threads>>>(wfc, renorm_factor_x, wfc); cudaCheckError(); // 1d forward / mult by Ay cufftHandleError( cufftExecZ2Z(plan_1d, wfc, wfc, CUFFT_FORWARD) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_y, wfc); cudaCheckError(); if(par.bval("Ay_time")){ EqnNode_gpu* Ay_eqn = par.astval("Ay"); int e_num = par.ival("Ay_num"); ast_cmult<<<grid,threads>>>(wfc, wfc, Ay_eqn, dx, dy, dz, time, e_num); cudaCheckError(); } else{ cMult<<<grid,threads>>>(wfc, (cufftDoubleComplex*) Ay, wfc); cudaCheckError(); } cufftHandleError( cufftExecZ2Z(plan_1d, wfc, wfc, CUFFT_INVERSE) ); scalarMult<<<grid,threads>>>(wfc, renorm_factor_y, wfc); cudaCheckError(); } } ``` Note that the `dim2` plan is for some reason for the Ax operator instead of Ay. I am looking into this now. These changes are on the `3d_rot_fix` branch.