From 09f6d279f0aeaea9c340f44db16ab83978dcd63f Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Sun, 3 Dec 2023 18:36:29 -0500 Subject: [PATCH 1/9] Import previous changes. --- cpp/src/core/gmfft.cpp | 1555 ++++++++++++++++++++++++++++++++++ cpp/src/core/gmfft.h | 1042 +++++++++++++++++++++++ cpp/src/core/gmfft_gpu.cu | 704 +++++++++++++++ cpp/src/core/gmfft_gpu.h | 43 + cpp/src/core/srradmnp.cpp | 613 +++++++------- cpp/src/core/srradmnp.h | 65 +- cpp/src/core/srradmnp_gpu.cu | 519 ++++++++++++ cpp/src/core/srradstr.h | 41 +- cpp/src/core/srradstr_gpu.cu | 330 ++++++++ cpp/src/lib/auxgpu.cpp | 368 ++++++++ cpp/src/lib/auxgpu.h | 62 ++ cpp/src/lib/srwlib.cpp | 59 +- cpp/src/lib/srwlib.h | 46 +- 13 files changed, 5099 insertions(+), 348 deletions(-) create mode 100644 cpp/src/core/gmfft.cpp create mode 100644 cpp/src/core/gmfft.h create mode 100644 cpp/src/core/gmfft_gpu.cu create mode 100644 cpp/src/core/gmfft_gpu.h create mode 100644 cpp/src/core/srradmnp_gpu.cu create mode 100644 cpp/src/core/srradstr_gpu.cu create mode 100644 cpp/src/lib/auxgpu.cpp create mode 100644 cpp/src/lib/auxgpu.h diff --git a/cpp/src/core/gmfft.cpp b/cpp/src/core/gmfft.cpp new file mode 100644 index 00000000..6e59db8a --- /dev/null +++ b/cpp/src/core/gmfft.cpp @@ -0,0 +1,1555 @@ +/************************************************************************//** + * File: gmfft.cpp + * Description: Auxiliary utilities to work with FFTW library + * Project: + * First release: 2000 + * + * Copyright (C) European Synchrotron Radiation Facility, Grenoble, France + * All Rights Reserved + * + * @author O.Chubar, P.Elleaume + * @author S. Yakubov (E-XFEL) - noticed issue and suggested fix in FFT1D + * @version 1.1 + ***************************************************************************/ + +#include "gmfft.h" + +#ifdef _OFFLOAD_GPU +#include "gmfft_gpu.h" +#endif + +//#include "srwlib.h" //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP + +#ifdef _WITH_OMP //OC27102018 +//SY: adopted for OpenMP +#include "omp.h" +#endif + +//************************************************************************* + +long CGenMathFFT::GoodNumbers[] = { + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 36, 40, 42, 44, + 48, 50, 52, 54, 56, 60, 64, 66, 70, 72, 78, 80, 84, 88, 90, 96, 98, 100, 104, + 108, 110, 112, 120, 126, 128, 130, 132, 140, 144, 150, 154, 156, 160, 162, + 168, 176, 180, 182, 192, 196, 198, 200, 208, 210, 216, 220, 224, 234, 240, + 250, 252, 256, 260, 264, 270, 280, 286, 288, 294, 300, 308, 312, 320, 324, + 330, 336, 350, 352, 360, 364, 378, 384, 390, 392, 396, 400, 416, 420, 432, + 440, 448, 450, 462, 468, 480, 486, 490, 500, 504, 512, 520, 528, 540, 546, + 550, 560, 572, 576, 588, 594, 600, 616, 624, 630, 640, 648, 650, 660, 672, + 686, 700, 702, 704, 720, 728, 750, 756, 768, 770, 780, 784, 792, 800, 810, + 832, 840, 858, 864, 880, 882, 896, 900, 910, 924, 936, 960, 972, 980, 990, + 1000, 1008, 1024, 1040, 1050, 1056, 1078, 1080, 1092, 1100, 1120, 1134, 1144, + 1152, 1170, 1176, 1188, 1200, 1232, 1248, 1250, 1260, 1274, 1280, 1296, 1300, + 1320, 1344, 1350, 1372, 1386, 1400, 1404, 1408, 1430, 1440, 1456, 1458, 1470, + 1500, 1512, 1536, 1540, 1560, 1568, 1584, 1600, 1620, 1638, 1650, 1664, 1680, + 1716, 1728, 1750, 1760, 1764, 1782, 1792, 1800, 1820, 1848, 1872, 1890, 1920, + 1944, 1950, 1960, 1980, 2000, 2002, 2016, 2048, 2058, 2080, 2100, 2106, 2112, + 2156, 2160, 2184, 2200, 2240, 2250, 2268, 2288, 2304, 2310, 2340, 2352, 2376, + 2400, 2430, 2450, 2464, 2496, 2500, 2520, 2548, 2560, 2574, 2592, 2600, 2640, + 2646, 2688, 2700, 2730, 2744, 2750, 2772, 2800, 2808, 2816, 2860, 2880, 2912, + 2916, 2940, 2970, 3000, 3024, 3072, 3080, 3120, 3136, 3150, 3168, 3200, 3234, + 3240, 3250, 3276, 3300, 3328, 3360, 3402, 3430, 3432, 3456, 3500, 3510, 3520, + 3528, 3564, 3584, 3600, 3640, 3696, 3744, 3750, 3780, 3822, 3840, 3850, 3888, + 3900, 3920, 3960, 4000, 
4004, 4032, 4050, 4096, 4116, 4158, 4160, 4200, 4212, + 4224, 4290, 4312, 4320, 4368, 4374, 4400, 4410, 4480, 4500, 4536, 4550, 4576, + 4608, 4620, 4680, 4704, 4752, 4800, 4802, 4860, 4900, 4914, 4928, 4950, 4992, + 5000, 5040, 5096, 5120, 5148, 5184, 5200, 5250, 5280, 5292, 5346, 5376, + 5390, 5400, 5460, 5488, 5500, 5544, 5600, 5616, 5632, 5670, 5720, 5760, 5824, + 5832, 5850, 5880, 5940, 6000, 6006, 6048, 6144, 6160, 6174, 6240, 6250, 6272, + 6300, 6318, 6336, 6370, 6400, 6468, 6480, 6500, 6552, 6600, 6656, 6720, 6750, + 6804, 6860, 6864, 6912, 6930, 7000, 7020, 7040, 7056, 7128, 7150, 7168, 7200, + 7280, 7290, 7350, 7392, 7488, 7500, 7546, 7560, 7644, 7680, 7700, 7722, 7776, + 7800, 7840, 7920, 7938, 8000, 8008, 8064, 8100, 8190, 8192, 8232, 8250, 8316, + 8320, 8400, 8424, 8448, 8580, 8624, 8640, 8736, 8748, 8750, 8800, 8820, 8910, + 8918, 8960, 9000, 9072, 9100, 9152, 9216, 9240, 9360, 9408, 9450, 9504, 9600, + 9604, 9702, 9720, 9750, 9800, 9828, 9856, 9900, 9984, 10000, 10010, 10080, + 10192, 10206, 10240, 10290, 10296, 10368, 10400, 10500, 10530, 10560, 10584, + 10692, 10752, 10780, 10800, 10920, 10976, 11000, 11088, 11200, 11232, 11250, + 11264, 11340, 11440, 11466, 11520, 11550, 11648, 11664, 11700, 11760, 11880, + 12000, 12012, 12096, 12150, 12250, 12288, 12320, 12348, 12474, 12480, 12500, + 12544, 12600, 12636, 12672, 12740, 12800, 12870, 12936, 12960, 13000, 13104, + 13122, 13200, 13230, 13312, 13440, 13500, 13608, 13650, 13720, 13728, 13750, + 13824, 13860, 14000, 14014, 14040, 14080, 14112, 14256, 14300, 14336, 14400, + 14406, 14560, 14580, 14700, 14742, 14784, 14850, 14976, 15000, 15092, 15120, + 15288, 15360, 15400, 15444, 15552, 15600, 15680, 15750, 15840, 15876, 16000, + 16016, 16038, 16128, 16170, 16200, 16250, 16380, 16384, 16464, 16500, 16632, + 16640, 16800, 16848, 16896, 17010, 17150, 17160, 17248, 17280, 17472, 17496, + 17500, 17550, 17600, 17640, 17820, 17836, 17920, 18000, 18018, 18144, 18200, + 18304, 18432, 18480, 18522, 18720, 18750, 
18816, 18900, 18954, 19008, 19110, + 19200, 19208, 19250, 19404, 19440, 19500, 19600, 19656, 19712, 19800, 19968, + 20000, 20020, 20160, 20250, 20384, 20412, 20480, 20580, 20592, 20736, 20790, + 20800, 21000, 21060, 21120, 21168, 21384, 21450, 21504, 21560, 21600, 21840, + 21870, 21952, 22000, 22050, 22176, 22400, 22464, 22500, 22528, 22638, 22680, + 22750, 22880, 22932, 23040, 23100, 23166, 23296, 23328, 23400, 23520, 23760, + 23814, 24000, 24010, 24024, 24192, 24300, 24500, 24570, 24576, 24640, 24696, + 24750, 24948, 24960, 25000, 25088, 25200, 25272, 25344, 25480, 25600, 25740, + 25872, 25920, 26000, 26208, 26244, 26250, 26400, 26460, 26624, 26730, 26754, + 26880, 26950, 27000, 27216, 27300, 27440, 27456, 27500, 27648, 27720, 28000, + 28028, 28080, 28160, 28224, 28350, 28512, 28600, 28672, 28800, 28812, 29106, + 29120, 29160, 29250, 29400, 29484, 29568, 29700, 29952, 30000, 30030, 30184, + 30240, 30576, 30618, 30720, 30800, 30870, 30888, 31104, 31200, 31250, 31360, + 31500, 31590, 31680, 31752, 31850, 32000, 32032, 32076, 32256, 32340, 32400, + 32500, 32760, 32768, 32928, 33000, 33264, 33280, 33600, 33614, 33696, 33750, + 33792, 34020, 34300, 34320, 34398, 34496, 34560, 34650, 34944, 34992, 35000, + 35100, 35200, 35280, 35640, 35672, 35750, 35840, 36000, 36036, 36288, 36400, + 36450, 36608, 36750, 36864, 36960, 37044, 37422, 37440, 37500, 37632, 37730, + 37800, 37908, 38016, 38220, 38400, 38416, 38500, 38610, 38808, 38880, 39000, + 39200, 39312, 39366, 39424, 39600, 39690, 39936, 40000, 40040, 40320, 40500, + 40768, 40824, 40950, 40960, 41160, 41184, 41250, 41472, 41580, 41600, 42000, + 42042, 42120, 42240, 42336, 42768, 42900, 43008, 43120, 43200, 43218, 43680, + 43740, 43750, 43904, 44000, 44100, 44226, 44352, 44550, 44590, 44800, 44928, + 45000, 45056, 45276, 45360, 45500, 45760, 45864, 46080, 46200, 46332, 46592, + 46656, 46800, 47040, 47250, 47520, 47628, 48000, 48020, 48048, 48114, 48384, + 48510, 48600, 48750, 49000, 49140, 49152, 49280, 49392, 49500, 
49896, 49920, + 50000, 50050, 50176, 50400, 50544, 50688, 50960, 51030, 51200, 51450, 51480, + 51744, 51840, 52000, 52416, 52488, 52500, 52650, 52800, 52822, 52920, 53248, + 53460, 53508, 53760, 53900, 54000, 54054, 54432, 54600, 54880, 54912, 55000, + 55296, 55440, 55566, 56000, 56056, 56160, 56250, 56320, 56448, 56700, 56862, + 57024, 57200, 57330, 57344, 57600, 57624, 57750, 58212, 58240, 58320, 58500, + 58800, 58968, 59136, 59400, 59904, 60000, 60060, 60368, 60480, 60750, 61152, + 61236, 61250, 61440, 61600, 61740, 61776, 62208, 62370, 62400, 62426, 62500, + 62720, 63000, 63180, 63360, 63504, 63700, 64000, 64064, 64152, 64350, 64512, + 64680, 64800, 65000, 65520, 65536, 65610, 65856, 66000, 66150, 66528, 66560, + 67200, 67228, 67392, 67500, 67584, 67914, 68040, 68250, 68600, 68640, 68750, + 68796, 68992, 69120, 69300, 69498, 69888, 69984, 70000, 70070, 70200, 70400, + 70560, 71280, 71344, 71442, 71500, 71680, 72000, 72030, 72072, 72576, 72800, + 72900, 73216, 73500, 73710, 73728, 73920, 74088, 74250, 74844, 74880, 75000, + 75264, 75460, 75600, 75816, 76032, 76440, 76800, 76832, 77000, 77220, 77616, + 77760, 78000, 78400, 78624, 78732, 78750, 78848, 79200, 79380, 79872, 80000, + 80080, 80190, 80262, 80640, 80850, 81000, 81250, 81536, 81648, 81900, 81920, + 82320, 82368, 82500, 82944, 83160, 83200, 84000, 84084, 84240, 84480, 84672, + 85050, 85536, 85750, 85800, 86016, 86240, 86400, 86436, 87318, 87360, 87480, + 87500, 87750, 87808, 88000, 88200, 88452, 88704, 89100, 89180, 89600, 89856, + 90000, 90090, 90112, 90552, 90720, 91000, 91520, 91728, 91854, 92160, 92400, + 92610, 92664, 93184, 93312, 93600, 93750, 94080, 94500, 94770, 95040, 95256, + 95550, 96000, 96040, 96096, 96228, 96250, 96768, 97020, 97200, 97500, 98000, + 98098, 98280, 98304, 98560, 98784, 99000, 99792, 99840, 100000 +}; +long CGenMathFFT::LenGoodNumbers = 1151; //637; + +long CGenMathFFT::GoodNum100s[] = { 0,37,61,79,95,107,120,130,142,151,159 }; +long CGenMathFFT::LenGoodNum100s = 11; + +long 
CGenMathFFT::GoodNum1000s[] = { 0,159,228,279,318,354,383,410,435,459,479 }; +long CGenMathFFT::LenGoodNum1000s = 11; + +long CGenMathFFT::GoodNum10000s[] = { 0,479,636,743,830,900,960,1017,1064,1109,1150 }; +long CGenMathFFT::LenGoodNum10000s = 11; + +#ifdef _OFFLOAD_GPU +long CGenMathFFT1D::PlanLen; +long CGenMathFFT1D::dPlanLen; +long CGenMathFFT1D::HowMany; +long CGenMathFFT1D::dHowMany; +cufftHandle CGenMathFFT1D::Plan1DFFT_cu; +cufftHandle CGenMathFFT1D::dPlan1DFFT_cu; +#endif + +#ifdef _OFFLOAD_GPU +long CGenMathFFT2D::PlanNx; +long CGenMathFFT2D::PlanNy; +long CGenMathFFT2D::HowMany; +long CGenMathFFT2D::dPlanNx; +long CGenMathFFT2D::dPlanNy; +long CGenMathFFT2D::dHowMany; +cufftHandle CGenMathFFT2D::Plan2DFFT_cu; +cufftHandle CGenMathFFT2D::dPlan2DFFT_cu; +#endif +//************************************************************************* + +void CGenMathFFT::NextCorrectNumberForFFT(long& n) +//void CGenMathFFT::NextCorrectNumberForFFT(long long& n) //OC26042019 +{ + if(n < 4) + { + n = 4; return; + } + if(n < 100001) + { + long *pGoodPrev, *pGoodNext; + + long n_d_10000 = long(n*0.0001); + if(n_d_10000 > 0) pGoodPrev = GoodNumbers + GoodNum10000s[n_d_10000] - 1; + else + { + long n_d_1000 = long(n*0.001); + if(n_d_1000 > 0) pGoodPrev = GoodNumbers + GoodNum1000s[n_d_1000] - 1; + else + { + long n_d_100 = long(n*0.01); + if(n_d_100 > 0) pGoodPrev = GoodNumbers + GoodNum100s[n_d_100] - 1; + else pGoodPrev = GoodNumbers; + } + } + pGoodNext = pGoodPrev + 1; + for(;;) + { + if((n > *(pGoodPrev++)) && (n <= *pGoodNext)) + { + n = *pGoodNext; return; + } + pGoodNext++; + } + } + else + { + //OC23072020: sorted multiplies by ratios of power of first prime numbers bw 1 and 2 + const double arTestMults[] = {10./9., 9./8., 6./5., 5./4., 4./3., 3./2., 8./5., 5./3., 16./9., 15./8.}; + const int nTestMults = 10; + + //long k = 16384; + //long k = 65536; + long k = 99000; //OC23072020 (make sure this number is < 100001, and divides by 9,8,5) + + for(int j=0; j<100; 
j++) + { + //OC23072020 (added tests of intermed numbers obtained by multiplying k by a factor bw 1 and 2) + bool intermedNumFound = false; + for(int m=0; m= 0.5) kTest++; + if(n <= kTest) + { + n = kTest; + intermedNumFound = true; + break; + } + } + if(intermedNumFound) break; + + k <<= 1; + if(n <= k) + { + n = k; break; + } + } + } +} + +//************************************************************************* +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG18072022 +int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 +{ +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, + { + //HG03082022 GPU can do an inplace fft without being given a temporary buffer + FFT1DInfo.pOutData = FFT1DInfo.pInData; + int result; + if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 + //if (result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; + }) + else +#endif + { + //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; + long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); + float* AuxDataCont = new float[TotAmOfPo]; + if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; + FFT1DInfo.pOutData = AuxDataCont; + + int result; + if(result = Make1DFFT(FFT1DInfo)) return result; + + float *tOut = FFT1DInfo.pInData, *t = AuxDataCont; + for(int ix=0; ix RelShiftTol*xStepNx); + NeedsShiftAfterY = (::fabs(y0_After) > RelShiftTol*yStepNy); + + double xStartTr = -0.5/FFT2DInfo.xStep; + double yStartTr = -0.5/FFT2DInfo.yStep; + + NeedsShiftBeforeX = NeedsShiftBeforeY = 0; + double x0_Before = 0., y0_Before = 0.; + if(FFT2DInfo.UseGivenStartTrValues) + { + x0_Before = (FFT2DInfo.xStartTr - xStartTr); // Sign should be probably reversed here: check!!! 
+ y0_Before = (FFT2DInfo.yStartTr - yStartTr); // Sign should be probably reversed here: check!!! + + NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); + NeedsShiftBeforeY = (::fabs(y0_Before) > RelShiftTol*(::fabs(yStartTr))); + } + + //ArrayShiftX = 0; ArrayShiftY = 0; + m_ArrayShiftX = 0; m_ArrayShiftY = 0; //OC02022019 + m_dArrayShiftX = 0; m_dArrayShiftY = 0; + if (FFT2DInfo.pData != 0) + { + if (NeedsShiftBeforeX || NeedsShiftAfterX) + { + //ArrayShiftX = new float[Nx << 1]; + //if(ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + m_ArrayShiftX = new float[Nx << 1]; + if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + } + if (NeedsShiftBeforeY || NeedsShiftAfterY) + { + //ArrayShiftY = new float[Ny << 1]; + //if(ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + m_ArrayShiftY = new float[Ny << 1]; + if (m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + } + } + else if (FFT2DInfo.pdData != 0) + { + if (NeedsShiftBeforeX || NeedsShiftAfterX) + { + m_dArrayShiftX = new double[Nx << 1]; + if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + } + if (NeedsShiftBeforeY || NeedsShiftAfterY) + { + m_dArrayShiftY = new double[Ny << 1]; + if (m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + } + } + +#ifdef _FFTW3 + fftwf_plan Plan2DFFT; + fftw_plan dPlan2DFFT; + fftwf_complex* DataToFFT = 0; + fftw_complex* dDataToFFT = 0; +#endif + +//HG18072022 +//#ifdef _DEBUG +// if (pGpuUsage != NULL) +// printf ("GPU: Make2DFFT\n"); +//#endif + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 + { + if(FFT2DInfo.pData != 0) + { + DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(float)); //OC06092023 + //DataToFFT = (fftwf_complex*)AuxGPU::ToDevice(pGpuUsage, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * 
sizeof(float)); + } + else if(FFT2DInfo.pdData != 0) + { + dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); //OC06092023 + //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); + } + }) + else +#endif + { +#if _FFTW3 //OC28012019 + if (FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); + else if (FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 + +#else + fftwnd_plan Plan2DFFT; + FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT2DInfo.pData); +#endif + } + + char t0SignMult = (FFT2DInfo.Dir > 0)? -1 : 1; + + //if(NeedsShiftBeforeX) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep); + //if(NeedsShiftBeforeY) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep); + if(NeedsShiftBeforeX) + {//OC02022019 + if(m_ArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); + } + if(NeedsShiftBeforeY) + {//OC02022019 + if(m_ArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid 
#include "auxgpu.h" for CPU) + GPU_COND(pvGPU, { //OC06092023 + //GPU_COND(pGpuUsage, { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_ArrayShiftX, m_ArrayShiftY); + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + 
//AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_dArrayShiftX, m_dArrayShiftY); + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 +#endif + } + } + + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; + double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 + if (FFT2DInfo.Dir > 0) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 + { + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) + { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL) + { + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + HowMany = FFT2DInfo.howMany; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = 
*(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + + auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); +// if (res != CUFFT_SUCCESS) +// printf("CUFFT Error: %d\r\n", res); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) + { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL) + { + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + dHowMany = FFT2DInfo.howMany; //(fix: was "HowMany" - the double-precision plan cache key must be dHowMany, as checked above) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&dPlan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); //(fix: was "&Plan2DFFT_cu" - the Z2Z plan clobbered the single-precision handle and a stale dPlan2DFFT_cu was executed below) + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); + } + }) + else +#endif + { + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#if _FFTW3 //OC28012019 + + for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + + fftw_execute(dPlan2DFFT); + 
} + } + +#else + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); +#endif + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, (float)Mult); //OC06092023 + } + else if (dDataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + } + alreadyNormalized = true; + }) + else +#endif + { + if (DataToFFT != 0) + { + RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); + RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); + } + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) + { + RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); + RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); + } +#endif + } + } + else + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL){ + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + 
PlanNx = Nx; + PlanNy = Ny; + HowMany = FFT2DInfo.howMany; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL){ + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + dHowMany = FFT2DInfo.howMany; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&dPlan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); //(fix: was "&Plan2DFFT_cu" - the Z2Z plan must go into the double-precision handle that is executed below) + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_INVERSE); + } + }) + else +#endif + { + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#ifdef _FFTW3 //OC28012019 + for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if 
(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(iHowMany == 0) { RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); } //(fix: these calls treat the full howMany batch; calling them on every loop pass applied the half-period rotation/sign-repair howMany times - the forward path applies them exactly once, after its loop) + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + if(iHowMany == 0) { RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); } //(fix: same as float branch - whole-batch rotation/sign-repair must run once, not per howMany slice) + fftw_execute(dPlan2DFFT); + } + } +#else + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(DataToFFT); + RepairSignAfter2DFFT(DataToFFT); + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); +#endif + } + } + + if (!alreadyNormalized){ +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + else if (dDataToFFT != 0) + NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + }) + else +#endif + { + if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult, FFT2DInfo.howMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult, FFT2DInfo.howMany); +#endif + } + } + + //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); + //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); + + if (NeedsShiftAfterX) + {//OC02022019 + if (m_ArrayShiftX != 0) 
FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); + } + if (NeedsShiftAfterY) + {//OC02022019 + if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); + else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); + } + if (NeedsShiftAfterX || NeedsShiftAfterY) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + 
m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 +#endif + } + } + + //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") + //fftwnd_destroy_plan(Plan2DFFT); + //OC27102018 + //SY: adopted for OpenMP +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 + { + if (FFT2DInfo.pData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, DataToFFT, true, false); + } + 
else if (FFT2DInfo.pdData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, dDataToFFT, true, false); + } + }) + else +#endif + { +#if _FFTW3 //OC28012019 + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); + } + else if (dDataToFFT != 0) //OC03022019 + { + if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); + } +#else + if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); +#endif + } + + //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} + //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} + if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} + if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} + if (m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 + if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} + + return 0; +} + +//************************************************************************* +//Forward FFT: Int f(x)*exp(-i*2*Pi*qx*x)dx +//Backward FFT: Int f(qx)*exp(i*2*Pi*qx*x)dqx +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG20012022 +int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC05092023 +{// Assumes Nx, Ny even ! + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //double start; + //get_walltime (&start); + + const double RelShiftTol = 1.E-06; + + SetupLimitsTr(FFT1DInfo); + + double xStepNx = FFT1DInfo.Nx*FFT1DInfo.xStep; + double x0_After = FFT1DInfo.xStart + 0.5*xStepNx; + NeedsShiftAfterX = FFT1DInfo.ApplyAutoShiftAfter && (::fabs(x0_After) > RelShiftTol*xStepNx); + + double xStartTr = -0.5/FFT1DInfo.xStep; + + NeedsShiftBeforeX = 0; + double x0_Before = 0.; + + if(FFT1DInfo.UseGivenStartTrValue) + { + x0_Before = (FFT1DInfo.xStartTr - xStartTr); + NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); + } + + m_ArrayShiftX = 0; + m_dArrayShiftX = 0; + if (NeedsShiftBeforeX || NeedsShiftAfterX) + { + if (FFT1DInfo.pInData != 0) + { + m_ArrayShiftX = new float[Nx << 1]; + if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) + m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 +#endif + } + else if (FFT1DInfo.pdInData != 0) + { + m_dArrayShiftX = new double[Nx << 1]; + if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 + m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //HG20012022 +#endif + } + } + +#ifdef _FFTW3 //OC28012019 + fftwf_plan Plan1DFFT; + fftwf_complex* DataToFFT = 0, * OutDataFFT = 0; //, *pOutDataFFT=0; + + fftw_plan dPlan1DFFT; + fftw_complex* dDataToFFT = 0, * dOutDataFFT = 0; //, *pdOutDataFFT=0; +#endif + +//HG20012022 +//#ifdef _DEBUG +// if (pGpuUsage != NULL) +// printf ("GPU: Make1DFFT\n"); +//#endif +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + 
GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG20012022 + { + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 + OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + //DataToFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); + //OutDataFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); //OC06092023 + dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); + //dOutDataFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + } + }) + else +#endif + { +#ifdef _FFTW3 //OC28012019 + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); + OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); + //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); + dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); + //pdOutDataFFT = dOutDataFFT; + } +#else + fftw_plan Plan1DFFT; + FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); 
+ FFTW_COMPLEX* OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); + FFTW_COMPLEX* pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + /** + Pointed-out by Sergey Yakubov (E-XFEL). + From FFTW 2.1.5 docs: + void fftw(fftw_plan plan, int howmany, + fftw_complex *in, int istride, int idist, + fftw_complex *out, int ostride, int odist); + ... + out, ostride and odist describe the output array(s). The format is the same as for the input array. + In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. + If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, + that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. + In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). + **/ +#endif + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + char t0SignMult = (FFT1DInfo.Dir > 0) ? 
-1 : 1; + if (NeedsShiftBeforeX) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); + + if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + }) + else +#endif + { + //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); + if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); + + if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); +#endif + } + } + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : before fft",&start); + + int flags = FFTW_ESTIMATE; //OC30012019 + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT1DInfo.xStep; + double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; + + if (FFT1DInfo.Dir > 0) //HG17112021 + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + int arN[] = { (int)Nx }; //OC14052020 + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) { + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + dPlanLen = Nx; + cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); + } + }) + else +#endif + { + //int flags = FFTW_ESTIMATE; +#ifdef _FFTW3 //OC28012019 +#ifdef _WITH_OMP + //Still needs to be tested! 
+ if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) //OC02022019 + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } +#endif //ifndef _WITH_OMP + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 + if (Plan1DFFT == 0) return ERROR_IN_FFT; + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + fftw_execute(dPlan1DFFT); + } + +#else //ifndef _FFTW3 + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); + +#ifndef _WITH_OMP //OC27102018 + //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 +#else //OC27102018 + //SY: split one call into many (for OpenMP) +#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) + for (int i = 0; i < FFT1DInfo.HowMany; i++) + { + //SY: do not use OutDataFFT as scratch space if in-place + if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); + } +#endif +#endif + } + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft dir>0",&start); + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if (OutDataFFT != 0) + { + RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 + //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + } + else if (dOutDataFFT != 0) + { + RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + } + alreadyNormalized = true; + }) + else +#endif + { + if (OutDataFFT != 0) + { + RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + } +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) + { + RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(dOutDataFFT, 
FFT1DInfo.HowMany); + } +#endif + } + } + else + { + //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + HowMany = FFT1DInfo.HowMany; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) + { + dPlanLen = Nx; + dHowMany = FFT1DInfo.HowMany; + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_INVERSE); + } + }) + else +#endif + { +#ifdef _FFTW3 //OC28012019 +#ifdef _WITH_OMP + + //Still needs to be tested! 
+ if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } + +#endif + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 + if (Plan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + fftw_execute(dPlan1DFFT); + } +#else //ifndef _FFTW3 + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); + + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); + + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : repair dir<0",&start); + +#ifndef _WITH_OMP //OC27102018 + //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 +#else //OC27102018 + //SY: split one call into many (for OpenMP) +#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) + for (int i = 0; i < FFT1DInfo.HowMany; i++) + { + if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); + } +#endif +#endif //_FFTW3 + } + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft dir<0",&start); + } + + if (!alreadyNormalized) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + if (OutDataFFT != 0) { + NormalizeDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + } + else if (dOutDataFFT != 0) + NormalizeDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + }) + else +#endif + { + if (OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); +#endif + } + } + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : NormalizeDataAfter1DFFT",&start); + + if (NeedsShiftAfterX) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 + else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); + + if (OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if (dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + }) + else +#endif + { + //FillArrayShift(t0SignMult*x0_After, FFT1DInfo.xStepTr); + if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 + else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); + + if (OutDataFFT != 0) TreatShift(OutDataFFT, FFT1DInfo.HowMany); +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); +#endif + } + } + + if(FFT1DInfo.TreatSharpEdges) + { + int result = ProcessSharpEdges(FFT1DInfo); + if(result) return result; + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, OutDataFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, OutDataFFT, true, false); + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dOutDataFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, dOutDataFFT, true, false); + } + }) + else +#endif + { + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : ProcessSharpEdges",&start); + + //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") + //OC27102018: thread safety issue? +#ifdef _FFTW3 //OC29012019 + + if(DataToFFT != 0) fftwf_destroy_plan(Plan1DFFT); + else if(dDataToFFT != 0) fftw_destroy_plan(dPlan1DFFT); + +#ifdef _WITH_OMP + + if(DataToFFT != 0) fftwf_cleanup_threads(); //?? + else if(dDataToFFT != 0) fftw_cleanup_threads(); + +#endif +#else //ifndef _FFTW3 + + fftw_destroy_plan(Plan1DFFT); + +#endif + } + + if (m_ArrayShiftX != 0) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); +#endif + delete[] m_ArrayShiftX; + } + if (m_dArrayShiftX != 0) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); +#endif + delete[] m_dArrayShiftX; + } + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : after fft ",&start); + return 0; +} + +//************************************************************************* + +int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr, char dataType) +//int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr) +{ + double Step = FFT1DInfo.xStep, Start = FFT1DInfo.xStart; + double AbsTol = 0.05*Step; + + double EdgeMinOffsetFromStart = FFT1DInfo.LeftSharpEdge - Start; + long iEdgeMinLower = long(EdgeMinOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! + double EdgeMinLowerMisfit = EdgeMinOffsetFromStart - iEdgeMinLower*Step; + + double EdgeMaxOffsetFromStart = FFT1DInfo.RightSharpEdge - Start; + long iEdgeMaxLower = long(EdgeMaxOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! 
+ double EdgeMaxLowerMisfit = EdgeMaxOffsetFromStart - iEdgeMaxLower*Step; + + char EdgeMinIsBetweenMeshPoints = (EdgeMinLowerMisfit > AbsTol); + char EdgeMaxIsBetweenMeshPoints = (EdgeMaxLowerMisfit > AbsTol); + char EdgeMaxIsSmallerThanDataEnd = (::fabs((Start + FFT1DInfo.Nx*Step) - FFT1DInfo.RightSharpEdge) > AbsTol); + char EdgeCorrNeeded = (EdgeMinIsBetweenMeshPoints || EdgeMaxIsBetweenMeshPoints || EdgeMaxIsSmallerThanDataEnd); + + //float dSt = 0.; + //if(EdgeMinIsBetweenMeshPoints) dSt = (float)(Step - EdgeMinLowerMisfit); + //float dFi = 0.; + //if(EdgeMaxIsBetweenMeshPoints) dFi = (float)(Step - EdgeMaxLowerMisfit); + //else if(EdgeMaxIsSmallerThanDataEnd) dFi = (float)(0.5*Step); + + //OC02022019 + double dSt = 0.; + if(EdgeMinIsBetweenMeshPoints) dSt = Step - EdgeMinLowerMisfit; + double dFi = 0.; + if(EdgeMaxIsBetweenMeshPoints) dFi = Step - EdgeMaxLowerMisfit; + else if(EdgeMaxIsSmallerThanDataEnd) dFi = 0.5*Step; + + CGenMathFFT1DInfo FFT1DInfoLoc = FFT1DInfo; + FFT1DInfoLoc.UseGivenStartTrValue = 0; + CGenMathFFT1D FFT1D; + FFT1D.SetupLimitsTr(FFT1DInfoLoc); + + if(EdgeCorrNeeded) + { + AuxDataForSharpEdgeCorr.d = Step; + long TwoN = FFT1DInfo.Nx << 1; + + if(dSt != 0.) + { + if(dataType == 'f') + { + AuxDataForSharpEdgeCorr.ExpArrSt = new float[TwoN]; + if(AuxDataForSharpEdgeCorr.ExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; + } + else if(dataType == 'd') //OC02022019 + { + AuxDataForSharpEdgeCorr.dExpArrSt = new double[TwoN]; + if(AuxDataForSharpEdgeCorr.dExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; + } + + AuxDataForSharpEdgeCorr.dSt = dSt; + long jSt = iEdgeMinLower + 1; + AuxDataForSharpEdgeCorr.iSt = jSt; + + double ArgjSt = Start + jSt*Step; + SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrSt, FFT1DInfoLoc.Nx, ArgjSt, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); + } + if(dFi != 0.) 
+ { + if(dataType == 'f') + { + AuxDataForSharpEdgeCorr.ExpArrFi = new float[TwoN]; + if(AuxDataForSharpEdgeCorr.ExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; + } + else if(dataType == 'd') + { + AuxDataForSharpEdgeCorr.dExpArrFi = new double[TwoN]; + if(AuxDataForSharpEdgeCorr.dExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; + } + + AuxDataForSharpEdgeCorr.dFi = dFi; + double ArgjFi = Start + iEdgeMaxLower*Step; + AuxDataForSharpEdgeCorr.iFi = iEdgeMaxLower; + + SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrFi, FFT1DInfoLoc.Nx, ArgjFi, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); + } + AuxDataForSharpEdgeCorr.WasSetUp = 1; + } + return 0; +} + +//************************************************************************* + +void CGenMathFFT1D::MakeSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxData) +{ + double fSRe, fSIm, fFRe, fFIm; + double ExpStRe, ExpStIm, ExpFiRe, ExpFiIm, Re, Im; + long Two_i, Two_i_p_1; + + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + float *t = FFT1DInfo.pOutData; + float *tSt = FFT1DInfo.pInData + (AuxData.iSt << 1); + float *tFi = FFT1DInfo.pInData + (AuxData.iFi << 1); + fSRe = *tSt, fSIm = *(tSt + 1); + fFRe = *tFi, fFIm = *(tFi + 1); + + for(long i=0; i +#include + +#ifndef _GM_WITHOUT_BASE +#include "gmobj.h" +#endif + +#ifdef _WITH_OMP //OC31102018: Pre-processor definition for compiling SRW with OpenMP library +#include "omp.h" +#endif + +#ifndef MEMORY_ALLOCATION_FAILURE +#define MEMORY_ALLOCATION_FAILURE 8 + 10000 //in line with SRW +#endif +#ifndef ERROR_IN_FFT +#define ERROR_IN_FFT 40 + 10000 +#endif + +//************************************************************************* + +class CGenMathFFT //{ +#ifndef _GM_WITHOUT_BASE + : public CGenMathObj +#endif +{//OC01052013 + double a2c, a4c, a6c, a8c, a10c, a12c; + double a3s, a5s, a7s, a9s, a11s, a13s; + +protected: + + static long GoodNumbers[]; + static long LenGoodNumbers; + static long GoodNum100s[]; + 
static long LenGoodNum100s; + static long GoodNum1000s[]; + static long LenGoodNum1000s; + static long GoodNum10000s[]; + static long LenGoodNum10000s; + +public: + + double HalfPI, PI, TwoPI, ThreePIdTwo, One_dTwoPI; // Constants + + CGenMathFFT() + { + HalfPI = 1.5707963267949; + PI = 3.141592653590; + TwoPI = 6.2831853071796; + ThreePIdTwo = 4.7123889803847; + One_dTwoPI = 0.1591549430919; + a2c = -0.5; a4c = 0.041666666666667; a6c = -0.0013888888888889; a8c = 0.000024801587301587; a10c = -2.755731922E-07; + a3s = -0.16666666666667; a5s = 0.0083333333333333; a7s = -0.0001984126984127; a9s = 2.755731922E-06; a11s = -2.505210839E-08; + } + + void CosAndSin(double x, float& Cos, float& Sin) + { + x -= TwoPI*int(x*One_dTwoPI); + if(x < 0.) x += TwoPI; + + char ChangeSign=0; + if(x > ThreePIdTwo) x -= TwoPI; + else if(x > HalfPI) { x -= PI; ChangeSign = 1;} + + double xe2 = x*x; + Cos = float(1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c))))); + Sin = float(x*(1. + xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s)))))); + if(ChangeSign) { Cos = -Cos; Sin = -Sin;} + } + void CosAndSin(double x, double& Cos, double& Sin) //OC02022019 + { + //x -= TwoPI*int(x*One_dTwoPI); + x -= TwoPI*((long long)(x*One_dTwoPI)); + + if(x < 0.) x += TwoPI; + + char ChangeSign=0; + if(x > ThreePIdTwo) x -= TwoPI; + else if(x > HalfPI) { x -= PI; ChangeSign = 1;} + + double xe2 = x*x; + Cos = 1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c)))); + Sin = x*(1. 
+ xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s))))); + if(ChangeSign) { Cos = -Cos; Sin = -Sin;} + } + + //void NextCorrectNumberForFFT(long long&); //OC26042019 + void NextCorrectNumberForFFT(long&); +}; + +//************************************************************************* + +struct CGenMathFFT2DInfo { + float* pData; + double* pdData; //OC31012019 + + char Dir; // >0: forward; <0: backward + double xStep, yStep, xStart, yStart; + double xStepTr, yStepTr, xStartTr, yStartTr; + long Nx, Ny; + //long long Nx, Ny; + + long howMany; //OC151014 + long iStride, iDist; //OC151014 + //From FFTW 2.1.5 Tutorial + //iStride and iDist describe the input array(s). + //There are howMany multi-dimensional input arrays; the first one is pointed to by in (= pData), + //the second one is pointed to by in + iDist, and so on, up to in + (howMany - 1) * iDist. + //Each multi-dimensional input array consists of complex numbers (see Section Data Types), + //stored in row-major format (see Section Multi-dimensional Array Format), which are not necessarily contiguous in memory. + //Specifically, in[0] is the first element of the first array, in[istride] is the second element of the first array, and so on. + //In general, the i-th element of the j-th input array will be in position in[i * istride + j * idist]. + //Note that, here, i refers to an index into the row-major format for the multi-dimensional array, rather than an index in any particular dimension. + //In-place transforms: For plans created with the FFTW_IN_PLACE option, the transform is computed in-place--the output is returned in the in array, + //using the same strides, etcetera, as were used in the input. 
+ + char UseGivenStartTrValues; + double ExtraMult; //OC20112017 + + CGenMathFFT2DInfo() + { + howMany = 1; iStride = 1; iDist = 0; //OC151014 + UseGivenStartTrValues = 0; + ExtraMult = 1.; //OC20112017 + + pData = 0; //OC31012019 + pdData = 0; + } +}; + +//************************************************************************* + +class CGenMathFFT2D : public CGenMathFFT { + + long Nx, Ny; + long HalfNx, HalfNy; + //long long Nx, Ny; + //long long HalfNx, HalfNy; + char NeedsShiftBeforeX, NeedsShiftBeforeY, NeedsShiftAfterX, NeedsShiftAfterY; + //float *ArrayShiftX, *ArrayShiftY; + float *m_ArrayShiftX, *m_ArrayShiftY; //OC02022019 + double *m_dArrayShiftX, *m_dArrayShiftY; + +#ifdef _OFFLOAD_GPU + static long PlanNx, PlanNy, HowMany; + static long dPlanNx, dPlanNy, dHowMany; + static cufftHandle Plan2DFFT_cu; + static cufftHandle dPlan2DFFT_cu; +#endif + +public: + CGenMathFFT2D() + { + NeedsShiftBeforeX = NeedsShiftBeforeY = NeedsShiftAfterX = NeedsShiftAfterY = 0; +#ifdef _OFFLOAD_GPU + HowMany = PlanNx = PlanNy = dHowMany = dPlanNx = dPlanNy = 0; + Plan2DFFT_cu = dPlan2DFFT_cu = 0; +#endif + } + + //int Make2DFFT(CGenMathFFT2DInfo&); + //Modification by S.Yakubov for parallelizing SRW via OpenMP: +#ifdef _FFTW3 //28012019 + int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, void* pvGPU = 0); //OC05092023 + //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, gpuUsageArg *pGpuUsage = 0); //OC02022019 + //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0); +#else + int Make2DFFT(CGenMathFFT2DInfo&, fftwnd_plan* pPrecreatedPlan2DFFT=0); //OC27102018 +#endif + + int AuxDebug_TestFFT_Plans(); + + void SetupLimitsTr(CGenMathFFT2DInfo& FFT2DInfo) + {// Modify this if Make2DFFT is modified ! 
+ Nx = FFT2DInfo.Nx; Ny = FFT2DInfo.Ny; + HalfNx = (Nx >> 1); HalfNy = (Ny >> 1); + + double xStartTr = -0.5/FFT2DInfo.xStep; + FFT2DInfo.xStepTr = -xStartTr/HalfNx; + + double yStartTr = -0.5/FFT2DInfo.yStep; + FFT2DInfo.yStepTr = -yStartTr/HalfNy; + + if(!FFT2DInfo.UseGivenStartTrValues) + { + FFT2DInfo.xStartTr = xStartTr; + FFT2DInfo.yStartTr = yStartTr; + } + } + + template void FillArrayShift(char x_or_y, double t0, double tStep, T* arShift) //OC02022019 + //void FillArrayShift(char x_or_y, double t0, double tStep) + { + T* tArrayShift = arShift; + //float* tArrayShift; + //long N; + long N = (x_or_y == 'x')? Nx : Ny; + //if(x_or_y == 'x') { tArrayShift = m_ArrayShiftX; N = Nx;} + //else { tArrayShift = m_ArrayShiftY; N = Ny;} + + T *tp = tArrayShift + N; + //float *tp = tArrayShift + N; + *tp = 1.; *(tp+1) = 0.; tp += 2; + T *tm = tp - 4; + //float *tm = tp - 4; + + double t0TwoPI = t0*TwoPI; + double q = tStep; + long HalfN = N >> 1; + for(int i=0; i void RotateDataAfter2DFFT(T* pAfterFFT, long HowMany) + //void RotateDataAfter2DFFT(fftwf_complex* pAfterFFT) + {// Assumes Nx, Ny even ! + //OC281117: Make it work for odd Nx, Ny as well! + //OC281117: Consider combining RotateDataAfter2DFFT, RepairSignAfter2DFFT, NormalizeDataAfter2DFFT + //long HalfNyNx = HalfNy*Nx; + long long HalfNyNx = ((long long)HalfNy)*((long long)Nx); + + for(long iHowMany=0; iHowManyre *= s; (t++)->im *= s; s = -s; + } + sy0 = -sy0; + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + void NormalizeDataAfter2DFFT(fftwf_complex* pAfterFFT, double Mult, long HowMany) + {// Assumes Nx, Ny even ! + //OC281117: To make it work for odd Nx, Ny as well in the future! 
+ float fMult = (float)Mult; + long long NxNy = ((long long)Nx)*((long long)Ny); + for(long iHowMany=0; iHowManyre *= (FFTW_REAL)Mult; (t++)->im *= (FFTW_REAL)Mult; + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + void TreatShifts(fftwf_complex* pData, long HowMany) + { + fftwf_complex *t = pData; + char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; + char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; + + for(long iHowMany=0; iHowManyre*MultRe - t->im*MultIm; +// float NewIm = t->re*MultIm + t->im*MultRe; +// t->re = NewRe; +// (t++)->im = NewIm; +// #endif + } + } + } + } +#else + void TreatShifts(FFTW_COMPLEX* pData) + { + FFTW_COMPLEX *t = pData; + char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; + char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; + + float *tShiftY = m_ArrayShiftY; + float MultY_Re = 1., MultY_Im = 0., MultX_Re = 1., MultX_Im = 0.; + float MultRe, MultIm; + + for(long iy=0; iyre*MultRe - t->im*MultIm; + float NewIm = t->re*MultIm + t->im*MultRe; + t->re = NewRe; + (t++)->im = NewIm; + } + } + } +#endif +#ifdef _FFTW3 //OC02022019 + void TreatShifts(fftw_complex* pData, long HowMany) + { + fftw_complex *t = pData; + char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; + char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; + + for(long iHowMany=0; iHowMany0: forward; <0: backward + double xStep, xStart; + double xStepTr, xStartTr; + long Nx; + //long long Nx; + long HowMany; + //long long HowMany; + char UseGivenStartTrValue; + double MultExtra; + + char TreatSharpEdges; + double LeftSharpEdge, RightSharpEdge; + char ApplyAutoShiftAfter; + + CGenMathFFT1DInfo() + { + HowMany = 1; UseGivenStartTrValue = 0; + TreatSharpEdges = 0; + MultExtra = 1.; + ApplyAutoShiftAfter = 1; + + pInData = 0; //OC31012019 + pOutData = 0; + pdInData = 0; + pdOutData = 0; + } +}; + +//************************************************************************* + +struct CGenMathAuxDataForSharpEdgeCorr1D { + + float *ExpArrSt, 
*ExpArrFi; + double *dExpArrSt, *dExpArrFi; + + double dSt, dFi, d; + long iSt, iFi; + + char WasSetUp; + + CGenMathAuxDataForSharpEdgeCorr1D() + { + Initialize(); + } + + void Initialize() + { + ExpArrSt = ExpArrFi = 0; + dExpArrSt = dExpArrFi = 0; + + dSt = dFi = d = 0.; + iSt = iFi = 0; + WasSetUp = 0; + } + + void Dispose() + { + if(ExpArrSt != 0) delete[] ExpArrSt; + if(ExpArrFi != 0) delete[] ExpArrFi; + + if(dExpArrSt != 0) delete[] dExpArrSt; + if(dExpArrFi != 0) delete[] dExpArrFi; + + Initialize(); + } +}; + +//************************************************************************* + +class CGenMathFFT1D : public CGenMathFFT { + + long Nx; + long HalfNx; + //long long Nx; + //long long HalfNx; + char NeedsShiftBeforeX, NeedsShiftAfterX; + float *m_ArrayShiftX; + double *m_dArrayShiftX; //OC02022019 +#ifdef _OFFLOAD_GPU + static long PlanLen, HowMany; + static long dPlanLen, dHowMany; + static cufftHandle Plan1DFFT_cu; + static cufftHandle dPlan1DFFT_cu; +#endif + +public: + CGenMathFFT1D() + { + NeedsShiftBeforeX = NeedsShiftAfterX = 0; +#ifdef _OFFLOAD_GPU + PlanLen = dPlanLen = 0; + Plan1DFFT_cu = dPlan1DFFT_cu = 0; + HowMany = dHowMany = 0; +#endif + } + + int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 + int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 + +//#ifndef _OFFLOAD_GPU //OC05092023 +// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo); +// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo); +//#else +// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); +// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); +// //int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); //HG +// //int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); +//#endif + + void SetupLimitsTr(CGenMathFFT1DInfo& FFT1DInfo) + { // Modify this if Make1DFFT is modified ! 
+ Nx = FFT1DInfo.Nx; + HalfNx = (Nx >> 1); + + double xStartTr = -0.5/FFT1DInfo.xStep; + FFT1DInfo.xStepTr = -xStartTr/HalfNx; + + if(!FFT1DInfo.UseGivenStartTrValue) + { + FFT1DInfo.xStartTr = xStartTr; + } + } + + template void FillArrayShift(double t0, double tStep, T* arShiftX) //OC02022019 + //void FillArrayShift(double t0, double tStep) + { + //float *tArrayShift = m_ArrayShiftX; + T *tArrayShift = arShiftX; //OC02022019 + long N = Nx; + + //float *tp = tArrayShift + N; + T *tp = tArrayShift + N; //OC02022019 + *tp = 1.; *(tp+1) = 0.; tp += 2; + //float *tm = tp - 4; + T *tm = tp - 4; + + double t0TwoPI = t0*TwoPI; + double q = tStep; + long HalfN = N >> 1; + + for(int i=0; ire*MultX_Re - tMany->im*MultX_Im; + float NewIm = tMany->re*MultX_Im + tMany->im*MultX_Re; + tMany->re = NewRe; tMany->im = NewIm; + tMany += Nx; + } + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + template void RepairSignAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 + //void RepairSignAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) + {// Assumes Nx even ! - to be improved + //OC27102018 + //SY: optimized, adopt for OpenMP +#ifdef _WITH_OMP + #pragma omp parallel for +#endif + for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; + // tMany += Nx; + // } + // } + // t++; s = -s; + //} + //OC27102018 + //SY: optimized, adopt for OpenMP +#ifdef _WITH_OMP + #pragma omp parallel for +#endif + for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; + tMany += Nx; + } + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + template void RotateDataAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 + //void RotateDataAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) + {// Assumes Nx even ! 
+#ifndef _WITH_OMP //OC27102018 + //fftwf_complex *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx; + //fftwf_complex Buf; + T *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx, Buf; + for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; + tMany += Nx; + } + } +#else //OC27102018 + //SY: adopted for OpenMP + #pragma omp parallel for + for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; + tMany += Nx; + } + } +#endif + } +#endif + + int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&, char dataType='f'); //OC02022019 + //int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); + void MakeSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); + + template void SetupSharpEdgeExpCorrArray(T* pCmpData, long AmOfPt, double x, double qStart, double qStep) //OC02022019 + //void SetupSharpEdgeExpCorrArray(float* pCmpData, long AmOfPt, double x, double qStart, double qStep) + { + const double TwoPi = 6.28318530717959; + double TwoPiX = TwoPi*x; + double q = qStart; + //float *tCmpData = pCmpData; + T *tCmpData = pCmpData; + for(long i=0; i +#include +#include + +#define GMFFT_BLOCK_SIZE 256 + +template __global__ void RepairSignAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 4 + 2; //Nx range + + if (ix < Nx2) + { + for (long k = 0; k < HowMany; k++) + { + pAfterFFT[ix + k * Nx2] = -pAfterFFT[ix + k * Nx2]; + pAfterFFT[ix + k * Nx2 + 1] = -pAfterFFT[ix + k * Nx2 + 1]; + } + } +} + +template __global__ void RotateDataAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2, long Nx) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //HalfNx range + + if (ix < Nx) + { + for (long k = 0; k < HowMany; k++) + { + T t1_0 = pAfterFFT[ix + Nx2 * k]; + T t1_1 = pAfterFFT[ix + Nx2 * k + 1]; + + pAfterFFT[ix + Nx2 * k] = pAfterFFT[ix + Nx + Nx2 * k]; + pAfterFFT[ix + Nx2 * k + 1] = pAfterFFT[ix + Nx + Nx2 * k + 
1]; + pAfterFFT[ix + Nx + Nx2 * k] = t1_0; + pAfterFFT[ix + Nx + Nx2 * k + 1] = t1_1; + } + } +} + +template __global__ void RepairAndRotateAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx, float Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + + long HalfNx = Nx / 2; + long Nx2 = Nx * 2; + if (ix < HalfNx) + { + float sx0 = 1 - 2 * (ix % 2); + float sx1 = 1 - 2 * ((HalfNx + ix) % 2); + + float s1 = sx0 * Mult; + float s2 = sx1 * Mult; + + int idx = ix * 2; + for (long i = 0; i < HowMany; i++){ + T* t1 = pAfterFFT + i * Nx2, *t2 = pAfterFFT + (HalfNx) * 2 + i * Nx2; + + T buf_r = t1[idx] * s1; + T buf_im = t1[idx + 1] * s1; + + t1[idx] = t2[idx] * s2; + t1[idx + 1] = t2[idx + 1] * s2; + + t2[idx] = buf_r; + t2[idx + 1] = buf_im; + } + } +} + +template __global__ void NormalizeDataAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2, T Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2) + { + for (long i = 0; i < HowMany; i++) { + pAfterFFT[ix + i * Nx2] *= Mult; + pAfterFFT[ix + i * Nx2 + 1] *= Mult; + } + } +} + +template __global__ void FillArrayShift_Kernel(double t0, double tStep, long N, T* arShiftX) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + + double t0TwoPi = t0 * 2 * CUDART_PI; + double q = tStep * ix; + + if (ix < N) + { + if (ix == 0) { + arShiftX[N] = 1.0; + arShiftX[N + 1] = 0.0; + } + + ix *= 2; + if (ix < N - 2) + { + sincos(q * t0TwoPi, &arShiftX[N + 2 + 1 + ix], &arShiftX[N + 2 + ix]); + arShiftX[N - 2 - ix] = arShiftX[N + 2 + ix]; + arShiftX[N - 1 - ix] = -arShiftX[N + 2 + 1 + ix]; + } + + if (ix == N - 2) + { + sincos(-q * t0TwoPi, &arShiftX[1], &arShiftX[0]); + } + } +} + +template __global__ void TreatShift_Kernel(T* pData, long HowMany, long Nx2, T* tShiftX) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2) + { + T MultX_Re = tShiftX[ix]; + T MultX_Im = tShiftX[ix + 1]; + + for (long k = 0; k < 
HowMany; k++) + { + T buf_r = pData[ix + k * Nx2]; + T buf_im = pData[ix + k * Nx2 + 1]; + + T NewRe = buf_r * MultX_Re - buf_im * MultX_Im; + T NewIm = buf_r * MultX_Im + buf_im * MultX_Re; + pData[ix + k * Nx2] = NewRe; + pData[ix + k * Nx2 + 1] = NewIm; + } + } +} + +void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Nx); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult) +{ + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif + + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, (float)Mult); //OC06092023 + 
//NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + FillArrayShift_Kernel << > > (t0, tStep, Nx, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + TreatShift_Kernel << > > (pData, HowMany, Nx * 2, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Nx); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) 
+ (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, (float)Mult); //OC06092023 (check why it's not ..T Mult..) + //RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + FillArrayShift_Kernel << > > (t0, tStep, Nx, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + TreatShift_Kernel << > > (pData, HowMany, Nx * 2, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + + +template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, long Nx, long Ny, long Nx2Ny2, long howMany) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //Nx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range + + float sx0 = 1 - 2 * (ix % 2); + float sy0 = 1 - 2 * (iy % 2); + float s 
= sx0 * sy0; + + if (ix < Nx && iy < Ny) + { + for (long i=0; i __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range + + if (ix < HalfNx && iy < HalfNy) + { + int idx = (ix + iy * Nx) * 2; + for (long i=0; i __global__ void RepairSignAndRotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany, T2 Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range + + if (ix < HalfNx) + { + float sx0 = 1.f - 2.f * (ix % 2); + float sy0 = 1.f - 2.f * (iy % 2); + float sx1 = 1.f - 2.f * ((HalfNx + ix) % 2); + float sy1 = 1.f - 2.f * ((HalfNy + iy) % 2); + + float s1 = sx0 * sy0 * Mult; + float s2 = sx1 * sy1 * Mult; + float s3 = sx1 * sy0 * Mult; + float s4 = sx0 * sy1 * Mult; + + int idx = (ix + iy * Nx); + for (long i=0; i __global__ void NormalizeDataAfter2DFFT_Kernel(T* pAfterFFT, long Nx2Ny2, long howMany, long n, T Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2Ny2) + { + for (long i=0; i __global__ void TreatShift2D_Kernel(T* pData, long HowMany, long Nx2, long Ny, T* tShiftX, T* tShiftY) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range + + if (ix < Nx2) + { + T MultRe = 1; + T MultIm = 0; + + T MultX_Re = 1; + T MultX_Im = 0; + + T MultY_Re = 1; + T MultY_Im = 0; + + if (NeedsShiftY) + { + MultY_Re = tShiftY[iy * 2]; + MultY_Im = tShiftY[iy * 2 + 1]; + } + if (NeedsShiftX) + { + MultX_Re = tShiftX[ix]; + MultX_Im = tShiftX[ix + 1]; + + if (NeedsShiftY) + { + MultRe = MultX_Re * MultY_Re - MultX_Im * MultY_Im; + MultIm = MultX_Re * MultY_Im + MultX_Im * MultY_Re; + } + else + { + MultRe = MultX_Re; 
+ MultIm = MultX_Im; + } + } + else + { + MultRe = MultY_Re; + MultIm = MultY_Im; + } + + for (long k=0; k << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); +} + +void RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); +} + +void RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, float Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((float2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); +} + +void NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +{ + + dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, (float)Mult); //OC06092023 + //NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); +} + +void TreatShifts2D_GPU(float* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY) +{ + + dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); +} + +void RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +{ + + dim3 blocks(Nx 
/ GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); +} + +void RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); +} + +void RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((double2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); +} + +void NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +{ + + dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); +} + +void TreatShifts2D_GPU(double* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY) +{ + + dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); +} + +//OC06092023: looks like place if wrong here for this function, why all these functions are programmed without classes? 
+template __global__ void StokesAvgUpdateInterp_Kernel(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, T mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, T yStartMeshRes, T yStepMeshRes, T yStartWfr, T yStepWfr, T xStartMeshRes, T xStepMeshRes, T xStartWfr, T xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //xNpMeshRes range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //yNpMeshRes range + int ie = (blockIdx.z * blockDim.z + threadIdx.z); //eNpMeshRes range + + if (ix >= xNpMeshRes) + return; + if (iy >= yNpMeshRes) + return; + if (ie >= eNpMeshRes) + return; + + long ir = iSt * yNpMeshRes * xNpMeshRes * eNpMeshRes + iy * xNpMeshRes * eNpMeshRes + ix * eNpMeshRes + ie; + + auto yMeshRes = yStartMeshRes + iy * yStepMeshRes; + auto xMeshRes = xStartMeshRes + ix * xStepMeshRes; + T fInterp = 0; + int loc_ix_ofst = iOfstSt + ie; + auto nx_ix_per = xNpWfr * eNpWfr; + + switch (nOrder) + { + case 1: + { + int ix0 = (int)trunc((xMeshRes - xStartWfr) / xStepWfr + 1e-09); + if ((ix0 < 0) | (ix0 >= xNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); + return; + } + int ix1 = ix0 + 1; + auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; + int iy0 = (int)trunc((yMeshRes - yStartWfr) / yStepWfr + 1e-09); + if ((iy0 < 0) | (iy0 >= yNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); + return; + } + + + int iy1 = iy0 + 1; + auto ty = (yMeshRes - (yStartWfr + yStepWfr * iy0)) / yStepWfr; + auto iy0_nx_ix_per = iy0 * nx_ix_per; + auto iy1_nx_ix_per = iy1 * nx_ix_per; + auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; + auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; + auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto 
f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto a10 = f10 - a00; + auto a01 = f01 - a00; + auto a11 = a00 - f01 - f10 + f11; + fInterp = a00 + tx * (a10 + ty * a11) + ty * a01; + } + break; + case 2: + { + int ix0 = int(round((xMeshRes - xStartWfr) / xStepWfr)); + if ((ix0 < 0) || (ix0 >= xNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (float)(nIters + 1); + ir += 1; + return; + } + int ixm1 = ix0 - 1; + int ix1 = ix0 + 1; + auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; + int iy0 = int(round((yMeshRes - yStartWfr) / yStepWfr)); + if ((iy0 < 0) || (iy0 >= yNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); + ir += 1; + return; + } + int iym1 = iy0 - 1; + int iy1 = iy0 + 1; + auto ty = (yMeshRes - (yStartWfr + yStepWfr * iy0)) / yStepWfr; + auto iym1_nx_ix_per = iym1 * nx_ix_per; + auto iy0_nx_ix_per = iy0 * nx_ix_per; + auto iy1_nx_ix_per = iy1 * nx_ix_per; + auto ixm1_ix_per_p_ix_ofst = ixm1 * eNpWfr + loc_ix_ofst; + auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; + auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; + auto fm10 = pMoreStokesArS[iy0_nx_ix_per + ixm1_ix_per_p_ix_ofst]; + auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto f0m1 = pMoreStokesArS[iym1_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto a10 = 0.5 * (f10 - fm10); + auto a01 = 0.5 * (f01 - f0m1); + auto a11 = a00 - f01 - f10 + f11; + auto a20 = 0.5 * (f10 + fm10) - a00; + auto a02 = 0.5 * (f01 + f0m1) - a00; + fInterp = a00 + tx * (a10 + tx * a20 + ty * a11) + ty * (a01 + ty * a02); + } + break; + } + + if (sum) pStokesArS[ir] += mult * fInterp; + else pStokesArS[ir] = (pStokesArS[ir] * nIters + mult * fInterp) / (nIters + 1); + return; +} + +//OC06092023: looks like place if wrong here for this 
function, why all these functions are programmed without classes? +void StokesAvgUpdateInterp(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, double mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, double yStartMeshRes, double yStepMeshRes, double yStartWfr, double yStepWfr, double xStartMeshRes, double xStepMeshRes, double xStartWfr, double xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) +{ + const int bs = 8; + dim3 threads(xNpMeshRes / bs + ((xNpMeshRes & (bs - 1)) != 0), yNpMeshRes / bs + ((yNpMeshRes & (bs - 1)) != 0), eNpMeshRes); + dim3 blocks(bs, bs, 1); + //OC06092023 (check order of variables, loop over e) + StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, (float)mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, (float)yStartMeshRes, (float)yStepMeshRes, (float)yStartWfr, (float)yStepWfr, (float)xStartMeshRes, (float)xStepMeshRes, (float)xStartWfr, (float)xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); + //StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, yStartMeshRes, yStepMeshRes, yStartWfr, yStepWfr, xStartMeshRes, xStepMeshRes, xStartWfr, xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); +} +#endif \ No newline at end of file diff --git a/cpp/src/core/gmfft_gpu.h b/cpp/src/core/gmfft_gpu.h new file mode 100644 index 00000000..8eef9e2c --- /dev/null +++ b/cpp/src/core/gmfft_gpu.h @@ -0,0 +1,43 @@ +/************************************************************************//** + * File: gmfft_gpu.h + * Description: Auxiliary utilities to work with FFTW library (CUDA header) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifndef 
__GMFFTGPU0_H +#define __GMFFTGPU0_H + +void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx); +void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx); +void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult=1.f); +void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult); +void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX); +void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX); + +void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx); +void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx); +void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult=1.); +void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult); +void FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX); +void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX); + +void RepairSignAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany); +void RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany); +void RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, float Mult=1.f); //to check +void NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, double Mult); +void TreatShifts2D_GPU(float* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY); + +void RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany); +void RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany); +void RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult=1.); +void NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult); +void TreatShifts2D_GPU(double* pData, long Nx, long Ny, long 
howMany, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY); + +#endif // __GMFFTGPU0_H \ No newline at end of file diff --git a/cpp/src/core/srradmnp.cpp b/cpp/src/core/srradmnp.cpp index 711bc2b4..7522f78b 100644 --- a/cpp/src/core/srradmnp.cpp +++ b/cpp/src/core/srradmnp.cpp @@ -677,6 +677,7 @@ int srTRadGenManip::ExtractSingleElecIntensity1DvsZ(srTRadExtract& RadExtract) //************************************************************************* int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? { int PolCom = RadExtract.PolarizCompon; int Int_or_ReE = RadExtract.Int_or_Phase; @@ -690,6 +691,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) float *pI = 0, *pI1 = 0, *pI2 = 0, *pI3 = 0; //OC17042020 double *pId = 0, *pI1d = 0, *pI2d = 0, *pI3d = 0; long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz; + //long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; //Himanshu? 
//float *pI = 0; //DOUBLE *pId = 0; //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) @@ -720,6 +722,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long long PerZ = PerX*RadAccessData.nx; long long PerX = ((long long)ne) << 1; //OC18042020 long long PerZ = PerX*nx; + long long PerWfr = PerZ*nz; //long ie0=0, ie1=0; long long ie0=0, ie1=0; //OC26042019 @@ -754,174 +757,187 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 //long izPerZ = 0; - long long izPerZ = 0; long ix, ie; - for(long long iz=0; iz 0) //OC08052021 { - *(tInt++) = IntensityComponent(pEx_StAux, pEz_StAux, -3, Int_or_ReE); - pEx_StAux += 2; pEz_StAux += 2; + if(pI != 0) + { + float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); + *(pI++) = newI; + } + if(pId != 0) + { + double newI = ((*pId)*iter + resInt)*inv_iter_p_1; + *(pId++) = newI; + } + if(allStokesReq) + { + if(RadExtract.pExtractedData != 0) + { + float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); + float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); + float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); + *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; + } + else + { + double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; + double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; + double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; + *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; + } + } } - resInt2 = ConstPhotEnInteg*CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); - - tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; - for(ie=0; ie 0) //OC08052021 - { - if(pI != 0) - { - float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); - *(pI++) = newI; - } - if(pId != 0) - { - double newI = ((*pId)*iter + resInt)*inv_iter_p_1; - *(pId++) = newI; - } - 
if(allStokesReq) - { - if(RadExtract.pExtractedData != 0) - { - float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); - float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); - float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); - *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; - } - else - { - double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; - double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; - double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; - *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; - } + //ixPerX += PerX; + pEx_St += PerX; + pEz_St += PerX; + pEx_Fi += PerX; + pEz_Fi += PerX; } + izPerZ += PerZ; } - else //OC08052021 - { - if(pI != 0) *(pI++) += (float)resInt; - if(pId != 0) *(pId++) += resInt; - if(allStokesReq) - { - if(RadExtract.pExtractedData != 0) - { - *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; - } - else - { - *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; - } - } - } - - //ixPerX += PerX; - pEx_St += PerX; - pEz_St += PerX; - pEx_Fi += PerX; - pEz_Fi += PerX; - } - izPerZ += PerZ; - } + //iwfrPerWfr += PerWfr; + //} + //} if(arAuxInt != 0) delete[] arAuxInt; //OC150813 return 0; } @@ -1571,6 +1587,7 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsZ(srTRadExtract& RadExtrac //************************************************************************* int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? 
{//OC13122019 //This assumes "normal" data alignment in the complex "matrix" E(x,y)*E*(x',y') int res = 0; @@ -2107,156 +2124,138 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtra if(DontNeedInterp) { - for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) - //for(long long it=0; it<=(itEnd-itStart); it++) //OC03032021 (to enable partial update of MI/CSD) - //for(long long it=0; it 0) - { - //double iter_p_1 = iter + 1; //OC20012020 - //long long iter_p_1 = iter + 1; - pMI[0] = (float)((pMI[0]*iter + ReMI)*inv_iter_p_1); //OC08052021 - pMI[1] = (float)((pMI[1]*iter + ImMI)*inv_iter_p_1); - //pMI[0] = (float)((pMI[0]*iter + ReMI)/iter_p_1); - //pMI[1] = (float)((pMI[1]*iter + ImMI)/iter_p_1); - } - else + float *pMI = pMI0 + (it - itStart)*PerArg; //OC16042021 + //float *pMI = pMI0 + it*PerArg; + for(long long i=0; i<=it; i++) { - pMI[0] += (float)ReMI; - pMI[1] += (float)ImMI; - } + //if(res = MutualIntensityComponent(pEx, pExT, pEz, pEzT, PolCom, iter, pMI)) return res; - pEx += PerX; pEz += PerX; - pMI += 2; - } + double ExRe = 0., ExIm = 0., EzRe = 0., EzIm = 0.; + double ExReT = 0., ExImT = 0., EzReT = 0., EzImT = 0.; + if(EhOK) { ExRe = *pEx; ExIm = *(pEx + 1); ExReT = *pExT; ExImT = *(pExT + 1); } + if(EvOK) { EzRe = *pEz; EzIm = *(pEz + 1); EzReT = *pEzT; EzImT = *(pEzT + 1); } + double ReMI = 0., ImMI = 0.; - pEx = pExInit0; - pEz = pEzInit0; - pExT += PerX; pEzT += PerX; - } - if(iter == 0) //OC16102021 - {//Setting to 0 symmetrical part of MI data (to avoid having garbage there) + switch(PolCom) + { + case 0: // Lin. Hor. + { + ReMI = ExRe*ExReT + ExIm*ExImT; + ImMI = ExIm*ExReT - ExRe*ExImT; + break; + } + case 1: // Lin. Vert. + { + ReMI = EzRe*EzReT + EzIm*EzImT; + ImMI = EzIm*EzReT - EzRe*EzImT; + break; + } + case 2: // Linear 45 deg. 
+ { + double ExRe_p_EzRe = ExRe + EzRe, ExIm_p_EzIm = ExIm + EzIm; + double ExRe_p_EzReT = ExReT + EzReT, ExIm_p_EzImT = ExImT + EzImT; + ReMI = 0.5*(ExRe_p_EzRe*ExRe_p_EzReT + ExIm_p_EzIm*ExIm_p_EzImT); + ImMI = 0.5*(ExIm_p_EzIm*ExRe_p_EzReT - ExRe_p_EzRe*ExIm_p_EzImT); + break; + } + case 3: // Linear 135 deg. + { + double ExRe_mi_EzRe = ExRe - EzRe, ExIm_mi_EzIm = ExIm - EzIm; + double ExRe_mi_EzReT = ExReT - EzReT, ExIm_mi_EzImT = ExImT - EzImT; + ReMI = 0.5*(ExRe_mi_EzRe*ExRe_mi_EzReT + ExIm_mi_EzIm*ExIm_mi_EzImT); + ImMI = 0.5*(ExIm_mi_EzIm*ExRe_mi_EzReT - ExRe_mi_EzRe*ExIm_mi_EzImT); + break; + } + case 5: // Circ. Left //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 4: // Circ. Right + { + double ExRe_mi_EzIm = ExRe - EzIm, ExIm_p_EzRe = ExIm + EzRe; + double ExRe_mi_EzImT = ExReT - EzImT, ExIm_p_EzReT = ExImT + EzReT; + ReMI = 0.5*(ExRe_mi_EzIm*ExRe_mi_EzImT + ExIm_p_EzRe*ExIm_p_EzReT); + ImMI = 0.5*(ExIm_p_EzRe*ExRe_mi_EzImT - ExRe_mi_EzIm*ExIm_p_EzReT); + break; + } + case 4: // Circ. Right //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 5: // Circ. 
Left + { + double ExRe_p_EzIm = ExRe + EzIm, ExIm_mi_EzRe = ExIm - EzRe; + double ExRe_p_EzImT = ExReT + EzImT, ExIm_mi_EzReT = ExImT - EzReT; + ReMI = 0.5*(ExRe_p_EzIm*ExRe_p_EzImT + ExIm_mi_EzRe*ExIm_mi_EzReT); + ImMI = 0.5*(ExIm_mi_EzRe*ExRe_p_EzImT - ExRe_p_EzIm*ExIm_mi_EzReT); + break; + } + case -1: // s0 + { + ReMI = ExRe*ExReT + ExIm*ExImT + EzRe*EzReT + EzIm*EzImT; + ImMI = ExIm*ExReT - ExRe*ExImT + EzIm*EzReT - EzRe*EzImT; + break; + } + case -2: // s1 + { + ReMI = ExRe*ExReT + ExIm*ExImT - (EzRe*EzReT + EzIm*EzImT); + ImMI = ExIm*ExReT - ExRe*ExImT - (EzIm*EzReT - EzRe*EzImT); + break; + } + case -3: // s2 + { + ReMI = ExImT*EzIm + ExIm*EzImT + ExReT*EzRe + ExRe*EzReT; + ImMI = ExReT*EzIm - ExRe*EzImT - ExImT*EzRe + ExIm*EzReT; + break; + } + case -4: // s3 + { + ReMI = ExReT*EzIm + ExRe*EzImT - ExImT*EzRe - ExIm*EzReT; + ImMI = ExIm*EzImT - ExImT*EzIm - ExReT*EzRe + ExRe*EzReT; + break; + } + default: // total mutual intensity, same as s0 + { + ReMI = ExRe*ExReT + ExIm*ExImT + EzRe*EzReT + EzIm*EzImT; + ImMI = ExIm*ExReT - ExRe*ExImT + EzIm*EzReT - EzRe*EzImT; + break; + //return CAN_NOT_EXTRACT_MUT_INT; + } + } + if(iter == 0) + { + pMI[0] = (float)ReMI; + pMI[1] = (float)ImMI; + } + else if(iter > 0) + { + //double iter_p_1 = iter + 1; //OC20012020 + //long long iter_p_1 = iter + 1; + pMI[0] = (float)((pMI[0]*iter + ReMI)*inv_iter_p_1); //OC08052021 + pMI[1] = (float)((pMI[1]*iter + ImMI)*inv_iter_p_1); + //pMI[0] = (float)((pMI[0]*iter + ReMI)/iter_p_1); + //pMI[1] = (float)((pMI[1]*iter + ImMI)/iter_p_1); + } + else + { + pMI[0] += (float)ReMI; + pMI[1] += (float)ImMI; + } - for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) - { - float *pMI = pMI0 + (it - itStart)*(PerArg + 2) + 2; //OC29042022 (?) 
- //float *pMI = pMI0 + (it - itStart)*PerArg; - for(long long i=it+1; i<=itEnd; i++) - //for(long long i=0; i<=it; i++) - { - *(pMI++) = 0.; - *(pMI++) = 0.; + pEx += PerX; pEz += PerX; + pMI += 2; } + + pEx = pExInit0; + pEz = pEzInit0; + pExT += PerX; pEzT += PerX; } - } + //} } else { @@ -3536,8 +3535,7 @@ void srTRadGenManip::MutualIntSumPart(srTWaveAccessData* pwI1, srTWaveAccessData long long itStart = pwI2->itStart; if(itStart < 0) itStart = 0; long long itFin = pwI2->itFin; - if(itFin < 0) itFin = nxnz - 1; //OC04102021 - //if(itFin < 0) itFin = nxnz; + if(itFin < 0) itFin = nxnz; double aux; //OC27042021 @@ -3724,7 +3722,6 @@ void srTRadGenManip::MutualIntFillHalfHermit(srTWaveAccessData* pwI) *pMIt = -imMI; //Hermitian matrix property } } - //int aha = 1; } else if(pDataD != 0) { @@ -3793,22 +3790,18 @@ void srTRadGenManip::MutualIntTreatComQuadPhTerm(srTWaveAccessData* pwI, double* for(long long izt=0; izt +#include +#include +#include +#include +#include "srradmnp.h" +#include "gmmeth.h" + +template +__global__ void ExtractSingleElecIntensity2DvsXZ_Kernel(srTRadExtract RadExtract, srTSRWRadStructAccessData RadAccessData, srTRadGenManip *obj, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, int Int_or_ReE) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + int iwfr = (blockIdx.z * blockDim.z + threadIdx.z); //nwfr range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz && iwfr < RadAccessData.nwfr) + { + //int PolCom = RadExtract.PolarizCompon; + + //bool allStokesReq = (PolCom == -5); //OC18042020 + + float* pI = 0, * pI1 = 0, * pI2 = 0, * pI3 = 0; //OC17042020 + double* pId = 0, * pI1d = 0, * pI2d = 0, * pI3d = 0; + long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; + //float *pI = 0; + //DOUBLE *pId = 0; + //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + long long nxnz = 
((long long)nx) * ((long long)nz); + if (Int_or_ReE != 2) + { + pI = RadExtract.pExtractedData; + if (allStokesReq) //OC17042020 + { + pI1 = pI + nxnz; pI2 = pI1 + nxnz; pI3 = pI2 + nxnz; + } + } + else + { + pId = RadExtract.pExtractedDataD; + if (allStokesReq) //OC17042020 + { + pI1d = pId + nxnz; pI2d = pI1d + nxnz; pI3d = pI2d + nxnz; + } + } + + float* pEx0 = RadAccessData.pBaseRadX; + float* pEz0 = RadAccessData.pBaseRadZ; + + //long PerX = RadAccessData.ne << 1; + //long PerZ = PerX*RadAccessData.nx; + //long long PerX = RadAccessData.ne << 1; + //long long PerZ = PerX*RadAccessData.nx; + long long PerX = ((long long)ne) << 1; //OC18042020 + long long PerZ = PerX * nx; + long long PerWfr = PerZ * nz; + + //bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (ne > 1); //OC18042020 + double resInt, resInt1, resInt2, resInt3; + double ConstPhotEnInteg = 1.; + long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 + long ie; + + long offset = iwfr * PerWfr + iz * PerZ + ix * PerX; + long offsetDiv2 = offset >> 1; + + float* pEx_StartForX = pEx0 + offset; + float* pEz_StartForX = pEz0 + offset; + if (pI != 0) + { + pI += offsetDiv2; + if (allStokesReq) + { + pI1 += offsetDiv2; + pI2 += offsetDiv2; + pI3 += offsetDiv2; + } + } + + if (pId != 0) + { + pId += offsetDiv2; + if (allStokesReq) + { + pI1d += offsetDiv2; + pI2d += offsetDiv2; + pI3d += offsetDiv2; + } + } + + //long ixPerX = 0; + + float* pEx_St = pEx_StartForX + Two_ie0; + float* pEz_St = pEz_StartForX + Two_ie0; + float* pEx_Fi = pEx_StartForX + Two_ie1; + float* pEz_Fi = pEz_StartForX + Two_ie1; + + if (intOverEnIsRequired) //OC140813 + {//integrate over photon energy / time + double* tInt = arAuxInt; + float* pEx_StAux = pEx_St; + float* pEz_StAux = pEz_St; + + if (!allStokesReq) //OC17042020 + { + for (ie = 0; ie < ne; ie++) //OC18042020 + //for(int ie=0; ieIntensityComponent(pEx_StAux, pEz_StAux, PolCom, Int_or_ReE); + pEx_StAux += 2; + pEz_StAux += 2; + } + resInt = 
ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); //OC18042020 + //resInt = ConstPhotEnInteg*CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, RadAccessData.ne, RadAccessData.eStep); + } + else + { + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -1, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -2, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt1 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -3, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt2 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -4, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt3 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + } + } + else + { + if (!allStokesReq) //OC18042020 + { + resInt = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, PolCom, Int_or_ReE); + } + else //OC18042020 + { + resInt = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -1, Int_or_ReE); + resInt1 = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -2, Int_or_ReE); + resInt2 = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -3, Int_or_ReE); + resInt3 = 
obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -4, Int_or_ReE); + } + } + //OC140813 + if (pI != 0) *pI = (float)resInt; + if (pId != 0) *pId = resInt; //OC18042020 + //if(pId != 0) *(pId++) = (double)resInt; + if (allStokesReq) //OC18042020 + { + if (RadExtract.pExtractedData != 0) + { + *pI1 = (float)resInt1; *pI2 = (float)resInt2; *pI3 = (float)resInt3; + } + else + { + *pI1d = resInt1; *pI2d = resInt2; *pI3d = resInt3; + } + } + } +} + +template +static inline void ExtractSingleElecIntensity2DvsXZ_GPUSub(dim3 &blocks, dim3 &threads, srTRadExtract RadExtract, srTSRWRadStructAccessData RadAccessData, srTRadGenManip *local_copy, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, int Int_or_ReE) +{ + switch(RadExtract.PolarizCompon) + { + case 5: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 4: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 3: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 2: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 1: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 0: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -1: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -2: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 
-3: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -4: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + default: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + } +} + +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtract, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, gpuUsageArg *pGpuUsage) +{ + srTSRWRadStructAccessData& RadAccessData = *((srTSRWRadStructAccessData*)(hRadAccessData.ptr())); + + const int bs = 256; + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz, RadAccessData.nwfr); + dim3 threads(bs, 1); + + if (RadAccessData.pBaseRadX != NULL) + { + RadAccessData.pBaseRadX = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadX); + } + if (RadAccessData.pBaseRadZ != NULL) + { + RadAccessData.pBaseRadZ = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadZ); + } + + srTRadGenManip *local_copy = (srTRadGenManip*)AuxGpu::ToDevice(pGpuUsage, this, sizeof(srTRadGenManip)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, local_copy); + + arAuxInt = (double*)AuxGpu::ToDevice(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, arAuxInt); + + bool allStokesReq = (RadExtract.PolarizCompon == -5); + bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (RadAccessData.ne > 1); + + int Int_or_ReE = RadExtract.Int_or_Phase; + if 
(Int_or_ReE == 7) Int_or_ReE = 0; //OC150813: time/phot. energy integrated single-e intensity requires "normal" intensity here + + if (allStokesReq) + if (intOverEnIsRequired) + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + if (intOverEnIsRequired) + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + + AuxGpu::ToHostAndFree(pGpuUsage, local_copy, sizeof(srTRadGenManip), true); + AuxGpu::ToHostAndFree(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double), true); + AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadX, true, false); + AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadZ, true, false); + +#ifndef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadX); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadX, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadZ, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); 
+#endif + return 0; +} + +template +__global__ void ExtractSingleElecMutualIntensityVsXZ_Kernel(const float* __restrict__ pEx0, const float* __restrict__ pEz0, float* __restrict__ pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter0) +{ + //Calculate coordinates as the typical triangular matrix + int i0 = (blockIdx.x * blockDim.x + threadIdx.x); //<=nxnz range + int it0_0 = (blockIdx.y * blockDim.y + threadIdx.y); //nxnz/(2*itPerBlk) range + long iter = iter0; + + if (i0 > nxnz) return; + if (it0_0 > nxnz / 2) return; + + for (int it0 = it0_0 * itPerBlk; it0 < it0_0 * itPerBlk + itPerBlk; it0++) + { + long it = it0; + long i = i0; + if (i0 > it0) //If the coordinates are past the triangular bounds, switch to the lower half of the triangle + { + it = nxnz - it0 - 1; + i = i0 - (it0 + 1); + } + + if (it >= itEnd) { + return; + } + + //float* pMI = pMI0 + it0 * (nxnz << 1) + (i0 << 1); //Compact representation coordinates + float* pMI = pMI0 + (it - itStart) * (nxnz << 1) + (i << 1); //Full representation coordinates + const float* pEx = pEx0 + i * PerX; + const float* pEz = pEz0 + i * PerX; + const float* pExT = pEx0 + (it - itStart) * PerX; + const float* pEzT = pEz0 + (it - itStart) * PerX; + + float ExRe = 0., ExIm = 0., EzRe = 0., EzIm = 0.; + float ExReT = 0., ExImT = 0., EzReT = 0., EzImT = 0.; + + { + if (EhOK) + { + ExRe = *pEx; ExIm = *(pEx + 1); + if (i != (it - itStart)) { + ExReT = *pExT; ExImT = *(pExT + 1); + } + else { + ExReT = ExRe; + ExImT = ExIm; + } + } + if (EvOK) { + EzRe = *pEz; EzIm = *(pEz + 1); + if (i != (it - itStart)) { + EzReT = *pEzT; EzImT = *(pEzT + 1); + } + else { + EzReT = EzRe; + EzImT = EzIm; + } + } + } + float ReMI = 0., ImMI = 0.; + + switch (PolCom) + { + case 0: // Lin. Hor. + { + ReMI = ExRe * ExReT + ExIm * ExImT; + ImMI = ExIm * ExReT - ExRe * ExImT; + break; + } + case 1: // Lin. Vert. + { + ReMI = EzRe * EzReT + EzIm * EzImT; + ImMI = EzIm * EzReT - EzRe * EzImT; + break; + } + case 2: // Linear 45 deg. 
+ { + float ExRe_p_EzRe = ExRe + EzRe, ExIm_p_EzIm = ExIm + EzIm; + float ExRe_p_EzReT = ExReT + EzReT, ExIm_p_EzImT = ExImT + EzImT; + ReMI = 0.5f * (ExRe_p_EzRe * ExRe_p_EzReT + ExIm_p_EzIm * ExIm_p_EzImT); + ImMI = 0.5f * (ExIm_p_EzIm * ExRe_p_EzReT - ExRe_p_EzRe * ExIm_p_EzImT); + break; + } + case 3: // Linear 135 deg. + { + float ExRe_mi_EzRe = ExRe - EzRe, ExIm_mi_EzIm = ExIm - EzIm; + float ExRe_mi_EzReT = ExReT - EzReT, ExIm_mi_EzImT = ExImT - EzImT; + ReMI = 0.5f * (ExRe_mi_EzRe * ExRe_mi_EzReT + ExIm_mi_EzIm * ExIm_mi_EzImT); + ImMI = 0.5f * (ExIm_mi_EzIm * ExRe_mi_EzReT - ExRe_mi_EzRe * ExIm_mi_EzImT); + break; + } + case 5: // Circ. Left //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 4: // Circ. Right + { + float ExRe_mi_EzIm = ExRe - EzIm, ExIm_p_EzRe = ExIm + EzRe; + float ExRe_mi_EzImT = ExReT - EzImT, ExIm_p_EzReT = ExImT + EzReT; + ReMI = 0.5f * (ExRe_mi_EzIm * ExRe_mi_EzImT + ExIm_p_EzRe * ExIm_p_EzReT); + ImMI = 0.5f * (ExIm_p_EzRe * ExRe_mi_EzImT - ExRe_mi_EzIm * ExIm_p_EzReT); + break; + } + case 4: // Circ. Right //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 5: // Circ. 
Left + { + float ExRe_p_EzIm = ExRe + EzIm, ExIm_mi_EzRe = ExIm - EzRe; + float ExRe_p_EzImT = ExReT + EzImT, ExIm_mi_EzReT = ExImT - EzReT; + ReMI = 0.5f * (ExRe_p_EzIm * ExRe_p_EzImT + ExIm_mi_EzRe * ExIm_mi_EzReT); + ImMI = 0.5f * (ExIm_mi_EzRe * ExRe_p_EzImT - ExRe_p_EzIm * ExIm_mi_EzReT); + break; + } + case -1: // s0 + { + ReMI = ExRe * ExReT + ExIm * ExImT + EzRe * EzReT + EzIm * EzImT; + ImMI = ExIm * ExReT - ExRe * ExImT + EzIm * EzReT - EzRe * EzImT; + break; + } + case -2: // s1 + { + ReMI = ExRe * ExReT + ExIm * ExImT - (EzRe * EzReT + EzIm * EzImT); + ImMI = ExIm * ExReT - ExRe * ExImT - (EzIm * EzReT - EzRe * EzImT); + break; + } + case -3: // s2 + { + ReMI = ExImT * EzIm + ExIm * EzImT + ExReT * EzRe + ExRe * EzReT; + ImMI = ExReT * EzIm - ExRe * EzImT - ExImT * EzRe + ExIm * EzReT; + break; + } + case -4: // s3 + { + ReMI = ExReT * EzIm + ExRe * EzImT - ExImT * EzRe - ExIm * EzReT; + ImMI = ExIm * EzImT - ExImT * EzIm - ExReT * EzRe + ExRe * EzReT; + break; + } + default: // total mutual intensity, same as s0 + { + ReMI = ExRe * ExReT + ExIm * ExImT + EzRe * EzReT + EzIm * EzImT; + ImMI = ExIm * ExReT - ExRe * ExImT + EzIm * EzReT - EzRe * EzImT; + break; + //return CAN_NOT_EXTRACT_MUT_INT; + } + } + + if (gt1_iter > 0) + { + pMI[0] = (pMI[0] * iter + (float)ReMI) / (float)(iter + 1.); + pMI[1] = (pMI[1] * iter + (float)ImMI) / (float)(iter + 1.); + } + else if (gt1_iter == 0) + { + pMI[0] = (float)ReMI; + pMI[1] = (float)ImMI; + } + else + { + pMI[0] += (float)ReMI; + pMI[1] += (float)ImMI; + } + } +} + +template +int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +{ + const int itPerBlk = 1; + dim3 threads = dim3(48, 16, 1); + dim3 grid = dim3((nxnz + 1) / threads.x + (threads.x > 1), (nxnz / 2) / (threads.y * itPerBlk) + (threads.y > 1), 1); + + pEx = (float*)AuxGpu::ToDevice(pGpuUsage, pEx, nxnz * 2 * 
sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEx); + + pEz = (float*)AuxGpu::ToDevice(pGpuUsage, pEz, nxnz * 2 * sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEz); + + pMI0 = (float*)AuxGpu::ToDevice(pGpuUsage, pMI0, (itEnd - itStart) * nxnz * 2 * sizeof(float)); + + if (EhOK) + { + if (EvOK) ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + } + else + { + if (EvOK) ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + } + + pEx = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEx, nxnz * 2 * sizeof(float), true); + pEz = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEz, nxnz * 2 * sizeof(float), true); + + AuxGpu::MarkUpdated(pGpuUsage, pMI0, true, false); + +#ifdef _DEBUG + if (pMI0 != NULL) + pMI0 = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pMI0, (itEnd - itStart) * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * 2 * sizeof(float)); + + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif + return 0; +} + +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ_GPU(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, int PolCom, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +{ + if (iter > 0) + { + switch (PolCom) + { + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, 
EhOK, EvOK, pGpuUsage); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + } + } + else if (iter == 0) + { + switch (PolCom) + { + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 
5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + } + } +} + +#endif \ No newline at end of file diff --git a/cpp/src/core/srradstr.h b/cpp/src/core/srradstr.h index f9b78ba8..def0ffb0 100644 --- a/cpp/src/core/srradstr.h +++ b/cpp/src/core/srradstr.h @@ -32,6 +32,10 @@ #include "srigorre.h" #endif +#ifdef _OFFLOAD_GPU //OC28072023 +#include "auxgpu.h" //HG +#endif + #include "srobject.h" //************************************************************************* @@ -72,8 +76,8 @@ class srTSRWRadStructAccessData : public CGenObject { waveHndl wRad, wRadX, wRadZ; int hStateRadX, hStateRadZ; double eStep, eStart, xStep, xStart, zStep, zStart; - long ne, nx, nz; - //long long ne, nx, nz; //OC26042019 + long ne, nx, nz; //OC03082023 (rolled back) + //long long ne, nx, nz; //HG //OC26042019 double xStartTr, zStartTr; bool UseStartTrToShiftAtChangingRepresToCoord; @@ -242,7 +246,16 @@ class srTSRWRadStructAccessData : public CGenObject { void CheckAndSubtractPhaseTermsLin(double newXc, double newZc); void CheckAndResetPhaseTermsLin(); void EstimateOversamplingFactors(double& estimOverSampX, double& estimOverSampZ); - void MirrorFieldData(int sx, int sz); + + void MirrorFieldData(int sx, int sz, void* pvGPU=0); //OC28072023 + //void 
MirrorFieldData(int sx, int sz); + +#ifdef _OFFLOAD_GPU + void MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg* pGPU); //OC03082023 + //void MirrorFieldData_GPU(int sx, int sz, void* pGpuUsage); //HG28072023 + void MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, TGPUUsageArg* pGPU); //OC03082023 + //void MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, void* pGpuUsage); //HG28072023 +#endif int SetupWfrEdgeCorrData(float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr); void MakeWfrEdgeCorrection(float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs); @@ -491,12 +504,32 @@ class srTSRWRadStructAccessData : public CGenObject { } } - void MultiplyElFieldByPhaseLin(double xMult, double zMult) + void MultiplyElFieldByPhaseLin(double xMult, double zMult, void* pvGPU=0) //OC28072023 + //void MultiplyElFieldByPhaseLin(double xMult, double zMult) { bool RadXisDefined = (pBaseRadX != 0); bool RadZisDefined = (pBaseRadZ != 0); if((!RadXisDefined) && (!RadZisDefined)) return; +#ifdef _OFFLOAD_GPU //OC28072023 + //TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + //GPU_COND(pvGPU, + //{ + // MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + // //MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + // return; + //} + + if(pvGPU != 0) + { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if(CAuxGPU::GPUEnabled(pGPU)) + { + MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + } + } +#endif + float *tEx = pBaseRadX; float *tEz = pBaseRadZ; diff --git a/cpp/src/core/srradstr_gpu.cu b/cpp/src/core/srradstr_gpu.cu new file mode 100644 index 00000000..5890cbf8 --- /dev/null +++ b/cpp/src/core/srradstr_gpu.cu @@ -0,0 +1,330 @@ +/************************************************************************//** + * File: srradstr_gpu.cu + * Description: Auxiliary structures for various SR calculation methods (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven 
National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" +#include +#include +#include +#include "srradstr.h" + +__global__ void MultiplyElFieldByPhaseLin_Kernel(double xMult, double zMult, float* pBaseRadX, float* pBaseRadZ, int nz, int nx, int ne, float zStart, float zStep, float xStart, float xStep) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < nx && iz < nz) + { + bool RadXisDefined = (pBaseRadX != 0); + bool RadZisDefined = (pBaseRadZ != 0); + + double z = zStart + iz * zStep; + double x = xStart + ix * xStep; + double dPhZ = zMult * z; + double dPh = dPhZ + xMult * x; + double cosPh, sinPh; + sincos(dPh, &sinPh, &cosPh); + + long long offset = iz * nx * ne * 2 + ix * ne * 2; + float* tEx = pBaseRadX + offset; + float* tEz = pBaseRadZ + offset; + for (int ie = 0; ie < ne; ie++) + { + if (RadXisDefined) + { + //*(tEx++) *= a; *(tEx++) *= a; + double newReEx = (*tEx) * cosPh - (*(tEx + 1)) * sinPh; + double newImEx = (*tEx) * sinPh + (*(tEx + 1)) * cosPh; + *(tEx++) = (float)newReEx; *(tEx++) = (float)newImEx; + } + if (RadZisDefined) + { + //*(tEz++) *= a; *(tEz++) *= a; + double newReEz = (*tEz) * cosPh - (*(tEz + 1)) * sinPh; + double newImEz = (*tEz) * sinPh + (*(tEz + 1)) * cosPh; + *(tEz++) = (float)newReEz; *(tEz++) = (float)newImEz; + } + } + } +} + +void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, TGPUUsageArg* pGPU) //OC03082023 +//void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, void* pGpuUsage) +{ + //TGPUUsageArg *pGpuUsage_ = (TGPUUsageArg*)pGpuUsage; //OC03082023 (commented-out) + if (pBaseRadX != NULL) + { + pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, 
pBaseRadX, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadX = (float*)CAuxGPU::ToDevice(pGpuUsage_, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBaseRadX); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pBaseRadX); + } + if (pBaseRadZ != NULL) + { + pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadZ = (float*)CAuxGPU::ToDevice(pGpuUsage_, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBaseRadZ); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pBaseRadZ); + } + + const int bs = 256; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); + dim3 threads(bs, 1); + MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, (float)zStart, (float)zStep, (float)xStart, (float)xStep); + //MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, zStart, zStep, xStart, xStep); + + if (pBaseRadX != NULL) + CAuxGPU::MarkUpdated(pGPU, pBaseRadX, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pBaseRadX, true, false); + if (pBaseRadZ != NULL) + CAuxGPU::MarkUpdated(pGPU, pBaseRadZ, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pBaseRadZ, true, false); + +#ifdef _DEBUG + if (pBaseRadX != NULL) + pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); + if (pBaseRadZ != NULL) + pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); + cudaStreamSynchronize(0); + //auto err = cudaGetLastError(); + //printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +template __global__ void 
MirrorFieldData_Kernel(long ne, long nx, long nz, float* pEX0, float* pEZ0) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < nx && iz < nz) + { + long long PerX = ne << 1; + long long PerZ = PerX * nx; + float buf; + + if (mode == 0) + { + if (ix >= (nx >> 1)) + return; + + long long nx_mi_1 = nx - 1; //OC26042019 + for (long long ie = 0; ie < ne; ie++) + { + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; //OC26042019 + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + else if (mode == 1) + { + if (iz >= (nz >> 1)) + return; + + long long nz_mi_1 = nz - 1; //OC26042019 + for (long long ie = 0; ie < ne; ie++) + { + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long rev_izPerZ = (nz_mi_1 - iz)*PerZ; + long long rev_izPerZ = (nz_mi_1 - iz) * PerZ; + float* rev_pEX_StartForX = pEX0 + rev_izPerZ; + float* rev_pEZ_StartForX = pEZ0 + rev_izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + 
float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + float* rev_pEX = rev_pEX_StartForX + ixPerX_p_Two_ie; + float* rev_pEZ = rev_pEZ_StartForX + ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + else if (mode == 2) + { + if (iz >= (nz >> 1)) + return; + + long long nx_mi_1 = nx - 1; //OC26042019 + long long nz_mi_1 = nz - 1; + for (long long ie = 0; ie < ne; ie++) //OC26042019 + //for(long ie=0; ie> 1); iz++) + long long Two_ie = ie << 1; //OC26042019 + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long rev_izPerZ = (nz_mi_1 - iz)*PerZ; + long long rev_izPerZ = (nz_mi_1 - iz) * PerZ; + float* rev_pEX_StartForX = pEX0 + rev_izPerZ; + float* rev_pEZ_StartForX = pEZ0 + rev_izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = rev_pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = rev_pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + + if (((nz >> 1) << 1) != nz) + { + //long izPerZ = ((nz >> 1) + 1)*PerZ; + long long izPerZ = ((nz >> 1) + 1) * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long ixPerX_p_Two_ie = 
ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + } + } +} + +void srTSRWRadStructAccessData::MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg* pGPU) //OC03082023 +//void srTSRWRadStructAccessData::MirrorFieldData_GPU(int sx, int sz, void* pGpuUsage) +{ + //TGPUUsageArg *pGpuUsage_ = (TGPUUsageArg*)pGpuUsage; //OC03082023 (commented-out) + float *pEX0 = pBaseRadX; + float *pEZ0 = pBaseRadZ; + + if (pEX0 != NULL) + { + pEX0 = (float*)CAuxGPU::ToDevice(pGPU, pEX0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEX0 = (float*)CAuxGPU::ToDevice(pGpuUsage_, pEX0, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEX0); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pEX0); + } + if (pEZ0 != NULL) + { + pEZ0 = (float*)CAuxGPU::ToDevice(pGPU, pEZ0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEZ0 = (float*)CAuxGPU::ToDevice(pGpuUsage_, pEZ0, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEZ0); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pEZ0); + } + + const int bs = 256; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); + dim3 threads(bs, 1); + + if ((sx > 0) && (sz > 0)) + return; + else if ((sx < 0) && (sz > 0)) + MirrorFieldData_Kernel<0> <<>>(ne, nx, nz, pEX0, pEZ0); + else if ((sx > 0) && (sz < 0)) + MirrorFieldData_Kernel<1> <<>> (ne, nx, nz, pEX0, pEZ0); 
+ else + MirrorFieldData_Kernel<2> <<>> (ne, nx, nz, pEX0, pEZ0); + + if (pEX0 != NULL) + CAuxGPU::MarkUpdated(pGPU, pEX0, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pEX0, true, false); + if (pEZ0 != NULL) + CAuxGPU::MarkUpdated(pGPU, pEZ0, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pEZ0, true, false); + +#ifdef _DEBUG + if (pEX0 != NULL) + pEX0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pEX0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEX0 = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pEX0, nz * nx * ne * 2 * sizeof(float)); + if (pEZ0 != NULL) + pEZ0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pEZ0, nz * nx * ne * 2 * sizeof(float)); + //pEZ0 = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pEZ0, nz * nx * ne * 2 * sizeof(float)); + cudaStreamSynchronize(0); + //auto err = cudaGetLastError(); + //printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +#endif \ No newline at end of file diff --git a/cpp/src/lib/auxgpu.cpp b/cpp/src/lib/auxgpu.cpp new file mode 100644 index 00000000..02972cd3 --- /dev/null +++ b/cpp/src/lib/auxgpu.cpp @@ -0,0 +1,368 @@ +/************************************************************************//** + * File: auxgpu.cpp + * Description: Auxiliary utilities to manage GPU usage + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#include +#include +#include + +#ifdef _OFFLOAD_GPU +#include +#endif + +#include "auxgpu.h" + +static bool isGPUAvailable = false; +static bool isGPUEnabled = false; +static bool GPUAvailabilityTested = false; +static bool deviceOffloadInitialized = false; +static int deviceCount = 0; + +#ifdef _OFFLOAD_GPU +typedef struct +{ + void *devicePtr; + void *hostPtr; + size_t size; + bool HostToDevUpdated; + bool DevToHostUpdated; + cudaEvent_t h2d_event; + cudaEvent_t 
d2h_event; +} memAllocInfo_t; +static std::map gpuMap; +static cudaStream_t memcpy_stream; +static bool memcpy_stream_initialized = false; +static int current_device = -1; +#endif + +static void CheckGPUAvailability() +{ +#ifdef _OFFLOAD_GPU + if (!GPUAvailabilityTested) + { + isGPUAvailable = false; + GPUAvailabilityTested = true; + int deviceCount = 0; + if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) + return; + + if (deviceCount < 1) + return; + + isGPUAvailable = true; + } +#else + isGPUAvailable = false; + isGPUEnabled = false; + GPUAvailabilityTested = true; +#endif +} + +bool CAuxGPU::GPUAvailable() +{ + CheckGPUAvailability(); + return isGPUAvailable; +} + +bool CAuxGPU::GPUEnabled(TGPUUsageArg *arg) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return false; + if (arg->deviceIndex > 0) { + if (arg->deviceIndex <= deviceCount) + { + if (memcpy_stream_initialized && current_device != arg->deviceIndex) + { + cudaStreamDestroy(memcpy_stream); + memcpy_stream_initialized = false; + } + cudaSetDevice(arg->deviceIndex - 1); + if (!memcpy_stream_initialized) + cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking); + current_device = arg->deviceIndex; + memcpy_stream_initialized = true; + } + //TODO: Add warning that GPU isn't available + return GPUAvailable(); + } +#endif + return false; +} + +void CAuxGPU::SetGPUStatus(bool enabled) +{ + isGPUEnabled = enabled && GPUAvailable(); +} + +int CAuxGPU::GetDevice(TGPUUsageArg* arg) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return cudaCpuDeviceId; + + int curDevice = 0; + cudaGetDevice(&curDevice); + return curDevice; +#else + return 0; +#endif +} + +void* CAuxGPU::ToDevice(TGPUUsageArg* arg, void* hostPtr, size_t size, bool dontCopy) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return hostPtr; + if (arg->deviceIndex == 0) + return hostPtr; + if (hostPtr == NULL) + return hostPtr; + if (size == 0) + return hostPtr; + if (!GPUEnabled(arg)) + return hostPtr; + if (gpuMap.find(hostPtr) != gpuMap.end()){ 
+ memAllocInfo_t info = gpuMap[hostPtr]; + void* devPtr = info.devicePtr; + hostPtr = info.hostPtr; + if (gpuMap[devPtr].HostToDevUpdated && !dontCopy){ + cudaMemcpyAsync(devPtr, hostPtr, size, cudaMemcpyHostToDevice, memcpy_stream); + cudaEventRecord(gpuMap[devPtr].h2d_event, memcpy_stream); + } +//#if _DEBUG +// printf("ToDevice: %p -> %p, %d, D2H: %d, H2D: %d\n", hostPtr, devPtr, size, gpuMap[devPtr].DevToHostUpdated, gpuMap[devPtr].HostToDevUpdated); //HG28072023 +//#endif + gpuMap[devPtr].HostToDevUpdated = false; + return devPtr; + } + + void *devicePtr = NULL; + cudaError_t err = cudaMalloc(&devicePtr, size); + if (err != cudaSuccess) // Try again after freeing up some memory HG24072023 + { + cudaStreamSynchronize(0); + err = cudaMalloc(&devicePtr, size); + } + if (err != cudaSuccess) + return NULL; +//#if _DEBUG +// printf("ToDevice: %p -> %p, %d\n", hostPtr, devicePtr, size); //HG28072023 +//#endif + memAllocInfo_t info; + info.devicePtr = devicePtr; + info.hostPtr = hostPtr; + info.DevToHostUpdated = false; + info.HostToDevUpdated = false; + cudaEventCreateWithFlags(&info.h2d_event, cudaEventDisableTiming); + cudaEventCreateWithFlags(&info.d2h_event, cudaEventDisableTiming); + if (!dontCopy){ + cudaMemcpyAsync(devicePtr, hostPtr, size, cudaMemcpyHostToDevice, memcpy_stream); + cudaEventRecord(info.h2d_event, memcpy_stream); + } + info.size = size; + gpuMap[hostPtr] = info; + gpuMap[devicePtr] = info; + return devicePtr; +#else + return hostPtr; +#endif +} + +void CAuxGPU::EnsureDeviceMemoryReady(TGPUUsageArg* arg, void* hostPtr) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return; + if (arg->deviceIndex == 0) + return; + if (hostPtr == NULL) + return; + if (!GPUEnabled(arg)) + return; + if (gpuMap.find(hostPtr) != gpuMap.end()){ + void* devPtr = gpuMap[hostPtr].devicePtr; + if (gpuMap[devPtr].HostToDevUpdated){ + cudaStreamWaitEvent(0, gpuMap[devPtr].h2d_event); + } +//#if _DEBUG +// printf("EnsureDeviceMemoryReady: %p -> %p, %d, D2H: %d, H2D: %d\n", 
hostPtr, devPtr, gpuMap[devPtr].size, gpuMap[devPtr].DevToHostUpdated, gpuMap[devPtr].HostToDevUpdated); //HG28072023 +//#endif + } +#endif +} + +void* CAuxGPU::GetHostPtr(TGPUUsageArg* arg, void* devicePtr) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return devicePtr; + if (arg->deviceIndex == 0) + return devicePtr; + if (devicePtr == NULL) + return devicePtr; + if (!GPUEnabled(arg)) + return devicePtr; + memAllocInfo_t info; + if (gpuMap.find(devicePtr) == gpuMap.end()) + return devicePtr; + info = gpuMap[devicePtr]; +//#if _DEBUG +// printf("GetHostPtr: %p -> %p\n", devicePtr, info.hostPtr); //HG28072023 +//#endif + return info.hostPtr; +#else + return devicePtr; +#endif +} + +void* CAuxGPU::ToHostAndFree(TGPUUsageArg* arg, void* devicePtr, size_t size, bool dontCopy) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return devicePtr; + if (arg->deviceIndex == 0) + return devicePtr; + if (devicePtr == NULL) + return devicePtr; + if (size == 0) + return devicePtr; + if (!GPUEnabled(arg)) + return devicePtr; + memAllocInfo_t info; + if (gpuMap.find(devicePtr) == gpuMap.end()) + return devicePtr; + info = gpuMap[devicePtr]; + devicePtr = info.devicePtr; + void *hostPtr = info.hostPtr; + if (!dontCopy && info.DevToHostUpdated) + { + cudaStreamWaitEvent(memcpy_stream, info.d2h_event, 0); + cudaMemcpyAsync(hostPtr, devicePtr, size, cudaMemcpyDeviceToHost, memcpy_stream); + cudaEventRecord(info.d2h_event); + cudaEventSynchronize(info.d2h_event); // we can't treat host memory as valid until the copy is complete + } +//#if _DEBUG +// printf("ToHostAndFree: %p -> %p, %d\n", devicePtr, hostPtr, size); //HG28072023 +//#endif + cudaStreamWaitEvent(0, info.h2d_event); + cudaStreamWaitEvent(0, info.d2h_event); + cudaFreeAsync(devicePtr, 0); + cudaEventDestroy(info.h2d_event); + cudaEventDestroy(info.d2h_event); + gpuMap.erase(devicePtr); + gpuMap.erase(hostPtr); + return hostPtr; +#else + return devicePtr; +#endif +} + +void CAuxGPU::FreeHost(void* ptr) +{ +#ifdef _OFFLOAD_GPU + 
if (ptr == NULL) + return; + if (gpuMap.find(ptr) == gpuMap.end()) + return; + memAllocInfo_t info = gpuMap[ptr]; + void *hostPtr = info.hostPtr; + void *devicePtr = info.devicePtr; +//#if _DEBUG +// printf("FreeHost: %p, %p\n", devicePtr, hostPtr); +//#endif + //cudaStreamWaitEvent(0, info.h2d_event); + //cudaStreamWaitEvent(0, info.d2h_event); + cudaFreeAsync(devicePtr, 0); + //cudaEventDestroy(info.h2d_event); + //cudaEventDestroy(info.d2h_event); + std::free(hostPtr); //OC02082023 + //CAuxGPU::free(hostPtr); + gpuMap.erase(devicePtr); + gpuMap.erase(hostPtr); +#endif + return; +} + +void CAuxGPU::MarkUpdated(TGPUUsageArg* arg, void* ptr, bool devToHost, bool hostToDev) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return; + if (arg->deviceIndex == 0) + return; + if (ptr == NULL) + return; + if (!GPUEnabled(arg)) + return; + if (gpuMap.find(ptr) == gpuMap.end()) + return; + void* devPtr = gpuMap[ptr].devicePtr; + void* hostPtr = gpuMap[ptr].hostPtr; + gpuMap[devPtr].DevToHostUpdated = devToHost; + gpuMap[devPtr].HostToDevUpdated = hostToDev; + gpuMap[hostPtr].DevToHostUpdated = devToHost; + gpuMap[hostPtr].HostToDevUpdated = hostToDev; + if (devToHost) + cudaEventRecord(gpuMap[devPtr].d2h_event, 0); +//#if _DEBUG +// printf("MarkUpdated: %p -> %p, D2H: %d, H2D: %d\n", ptr, devPtr, devToHost, hostToDev); +//#endif +#endif +} + +void CAuxGPU::Init() { + deviceOffloadInitialized = true; +#ifdef _OFFLOAD_GPU + cudaGetDeviceCount(&deviceCount); + cudaDeviceSynchronize(); +#endif +} + +void CAuxGPU::Fini() { +#ifdef _OFFLOAD_GPU + // Copy back all updated data + bool updated = false; + bool freed = false; + for (std::map::const_iterator it = gpuMap.cbegin(); it != gpuMap.cend(); it++) + { + if (it->second.DevToHostUpdated){ + cudaStreamWaitEvent(memcpy_stream, it->second.d2h_event, 0); + cudaMemcpyAsync(it->second.hostPtr, it->second.devicePtr, it->second.size, cudaMemcpyDeviceToHost, memcpy_stream); +//#if _DEBUG +// printf("Fini: %p -> %p, %d\n", 
it->second.devicePtr, it->second.hostPtr, it->second.size); +//#endif + updated = true; + gpuMap[it->second.hostPtr].DevToHostUpdated = false; + gpuMap[it->second.devicePtr].DevToHostUpdated = false; + } + } + for (std::map::const_iterator it = gpuMap.cbegin(); it != gpuMap.cend(); it++) + { + if (it->first == it->second.devicePtr) + { + cudaStreamWaitEvent(0, it->second.h2d_event); + cudaStreamWaitEvent(0, it->second.d2h_event); + cudaFreeAsync(it->second.devicePtr, 0); + freed = true; + cudaEventDestroy(it->second.h2d_event); + cudaEventDestroy(it->second.d2h_event); + } + } + if (updated | freed) + cudaStreamSynchronize(0); + gpuMap.clear(); +//#if _DEBUG +// printf("Fini: %d\n", gpuMap.size()); +//#endif +#endif +} \ No newline at end of file diff --git a/cpp/src/lib/auxgpu.h b/cpp/src/lib/auxgpu.h new file mode 100644 index 00000000..9d64d450 --- /dev/null +++ b/cpp/src/lib/auxgpu.h @@ -0,0 +1,62 @@ +/************************************************************************//** + * File: auxgpu.h + * Description: Auxiliary utilities to manage GPU usage + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifndef __UTIGPU_H +#define __UTIGPU_H + +#include +#include + +#ifdef _OFFLOAD_GPU +#include +#include +//#if CUDART_VERSION < 11020 +//#error CUDA version too low, need at least 11.2 +//#endif +#endif + +typedef struct +{ + int deviceIndex; // -1 means no device, TODO +} TGPUUsageArg; + +#ifdef _OFFLOAD_GPU +#define GPU_COND(arg, code) if (arg && CAuxGPU::GPUEnabled((TGPUUsageArg*)arg)) { code } +//#define GPU_COND(arg, code) if (arg && CAuxGPU::GPUEnabled(arg)) { code } +#define GPU_PORTABLE __device__ __host__ +#else +#define GPU_COND(arg, code) if(0) { } +#define GPU_PORTABLE +#endif + + 
//************************************************************************* +class CAuxGPU +{ +private: +public: + static void Init(); + static void Fini(); + static bool GPUAvailable(); //CheckGPUAvailable etc + static bool GPUEnabled(TGPUUsageArg *arg); + static void SetGPUStatus(bool enabled); + static int GetDevice(TGPUUsageArg* arg); + static void* ToDevice(TGPUUsageArg* arg, void* hostPtr, size_t size, bool dontCopy = false); + static void* GetHostPtr(TGPUUsageArg* arg, void* devicePtr); + static void* ToHostAndFree(TGPUUsageArg* arg, void* devicePtr, size_t size, bool dontCopy = false); + static void EnsureDeviceMemoryReady(TGPUUsageArg* arg, void* devicePtr); + static void FreeHost(void* ptr); + static void MarkUpdated(TGPUUsageArg* arg, void* ptr, bool devToHost, bool hostToDev); +}; + +//************************************************************************* +#endif \ No newline at end of file diff --git a/cpp/src/lib/srwlib.cpp b/cpp/src/lib/srwlib.cpp index c36043c7..5bc9a324 100644 --- a/cpp/src/lib/srwlib.cpp +++ b/cpp/src/lib/srwlib.cpp @@ -29,6 +29,9 @@ #include "srisosrc.h" #include "srmatsta.h" +#ifdef _OFFLOAD_GPU +#include "auxgpu.h" //OC27072023 +#endif //#include //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP //------------------------------------------------------------------------- @@ -751,7 +754,8 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr //------------------------------------------------------------------------- -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj, void* pGPU) //OC26072023 +//EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double *pMeth) //OC16122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, int *pMeth) //OC13122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y) @@ -994,7 +998,8 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr) //------------------------------------------------------------------------- -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pGPU) //OC26072023 (from HG) +//EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt) { if((pWfr == 0) || (pOpt == 0)) return 
SRWL_INCORRECT_PARAM_FOR_WFR_PROP; @@ -1047,7 +1052,8 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* //------------------------------------------------------------------------- -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU) //OC26072023 +//EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) { if((pcData == 0) || (arMesh == 0) || ((typeData != 'f') && (typeData != 'd')) || (nMesh < 3) || (dir == 0)) return SRWL_INCORRECT_PARAM_FOR_FFT; //OC31012019 @@ -1538,6 +1544,53 @@ EXP int CALL srwlPropagRadMultiE(SRWLStokes* pStokes, SRWLWfr* pWfr0, SRWLOptC* return 0; } +//------------------------------------------------------------------------- +#ifdef _OFFLOAD_GPU //OC30102023 + +EXP bool CALL srwlUtiGPUAvailable() //OC27072023 +//EXP bool CALL srwlAuxGpuAvailable() //HG +{ + return CAuxGPU::GPUAvailable(); //OC05092023 + //return AuxGpu::GPUAvailable(); +} + +//------------------------------------------------------------------------- + +EXP bool CALL srwlUtiGPUEnabled() //OC27072023 +//EXP bool CALL srwlAuxGpuEnabled() //HG +{ + return CAuxGPU::GPUEnabled(nullptr); //OC05092023 + //return AuxGpu::GPUEnabled(nullptr); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUSetStatus(bool enable) //OC27072023 +//EXP void CALL srwlAuxGpuSetStatus(bool enable) //HG +{ + CAuxGPU::SetGPUStatus(enable); //OC05092023 + //AuxGpu::SetGPUStatus(enable); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUInit() //OC27072023 +//EXP void CALL srwlAuxGpuInit() //HG +{ + CAuxGPU::Init(); //OC05092023 (why void?) 
+ //AuxGpu::Init(); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUFini() //OC27072023 +//EXP void CALL srwlAuxGpuFini() //HG +{ + CAuxGPU::Fini(); //OC05092023 (why void?) + //AuxGpu::Fini(); +} + +#endif //------------------------------------------------------------------------- //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: /* diff --git a/cpp/src/lib/srwlib.h b/cpp/src/lib/srwlib.h index aa448d31..ff81e0ff 100644 --- a/cpp/src/lib/srwlib.h +++ b/cpp/src/lib/srwlib.h @@ -732,7 +732,8 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0, void* pGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y); @@ -802,7 +803,8 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr); * @return integer error (>0) or warnig (<0) code * @see ... 
*/ -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt); /** TEST @@ -849,7 +851,8 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); /** * Convolves real data with 1D or 2D Gaussian (depending on arguments) @@ -964,6 +967,43 @@ EXP int CALL srwlUtiUndFromMagFldTab(SRWLMagFldC* pUndCnt, SRWLMagFldC* pMagCnt, */ EXP int CALL srwlUtiUndFindMagFldInterpInds(int* arResInds, int* pnResInds, double* arGaps, double* arPhases, int nVals, double arPrecPar[5]); +/** + * Checks if GPU offloading is available + * @return true if available + * @see ... + */ +EXP bool CALL srwlUtiGPUAvailable(); //OC26072023 +//EXP bool CALL srwlAuxGpuAvailable(); //HG + +/** + * Checks if GPU offloading is enabled + * @return true if enabled + * @see ... + */ +EXP bool CALL srwlUtiGPUEnabled(); //OC26072023 +//EXP bool CALL srwlAuxGpuEnabled(); //HG + +/** + * Enable/Disable GPU offloading + * @see ... + */ +EXP void CALL srwlUtiGPUSetStatus(bool enable); +//EXP void CALL srwlAuxGpuSetStatus(bool enable); //HG + +/** + * Initialize device offloading + * @see ... 
+ */ +EXP void CALL srwlUtiGPUInit(); //OC26072023 +//EXP void CALL srwlAuxGpuInit(); //HG + +/** + * Finalize device offloading + * @see ... + */ +EXP void CALL srwlUtiGPUFini(); //OC26072023 +//EXP void CALL srwlAuxGpuFini(); //HG + /** * These functions were added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP EXP void CALL srwlPrintTime(const char* str, double* start); From 965482631b424261a7dad5866e24d9cb25830d61 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Mon, 4 Dec 2023 02:04:11 -0500 Subject: [PATCH 2/9] Port over most previous GPU related changes. --- cpp/gcc/Makefile | 41 + cpp/py/setup.py | 6 +- cpp/src/core/gmfft.cpp | 1555 -------------------- cpp/src/core/gmfft.h | 1042 ------------- cpp/src/core/gmfft_gpu.h | 43 - cpp/src/core/sroptang.h | 12 +- cpp/src/core/sroptapt.h | 24 +- cpp/src/core/sroptcnt.cpp | 62 +- cpp/src/core/sroptcnt.h | 9 +- cpp/src/core/sroptcryst.h | 9 +- cpp/src/core/sroptdrf.cpp | 84 +- cpp/src/core/sroptdrf.h | 88 +- cpp/src/core/sroptdrf_gpu.cu | 29 + cpp/src/core/sroptel2.cpp | 3 +- cpp/src/core/sroptelm.cpp | 380 +++-- cpp/src/core/sroptelm.h | 74 +- cpp/src/core/sroptelm_gpu.cu | 587 ++++++++ cpp/src/core/sroptelm_gpu.h | 123 ++ cpp/src/core/sroptgtr.cpp | 4 +- cpp/src/core/sroptgtr.h | 203 ++- cpp/src/core/sroptgtr_gpu.cu | 32 + cpp/src/core/srradmnp.cpp | 323 ++-- cpp/src/core/srradmnp.h | 66 +- cpp/src/core/srradmnp_gpu.cu | 112 +- cpp/src/core/srradstr.cpp | 11 +- cpp/src/core/srradstr.h | 9 +- cpp/src/core/srradstr_gpu.cu | 12 +- cpp/src/core/srstraux.h | 23 + cpp/src/ext/genmath/gmfft.cpp | 1204 +++++++++++---- cpp/src/ext/genmath/gmfft.h | 75 +- cpp/src/{core => ext/genmath}/gmfft_gpu.cu | 307 ++-- cpp/src/ext/genmath/gmmeth.h | 8 + cpp/src/ext/utils/utidev.cpp | 97 -- cpp/src/ext/utils/utidev.h | 71 - cpp/src/lib/auxgpu.cpp | 2 + cpp/src/lib/srwlib.cpp | 46 +- cpp/src/lib/srwlib.h | 42 +- cpp/vc/SRW.sln | 50 +- cpp/vc/SRWLClientPython.vcxproj | 2 +- cpp/vc/SRWLIB.vcxproj | 213 ++- 
cpp/vc/SRWLIB.vcxproj.filters | 32 +- 41 files changed, 3182 insertions(+), 3933 deletions(-) delete mode 100644 cpp/src/core/gmfft.cpp delete mode 100644 cpp/src/core/gmfft.h delete mode 100644 cpp/src/core/gmfft_gpu.h create mode 100644 cpp/src/core/sroptdrf_gpu.cu create mode 100644 cpp/src/core/sroptelm_gpu.cu create mode 100644 cpp/src/core/sroptelm_gpu.h create mode 100644 cpp/src/core/sroptgtr_gpu.cu rename cpp/src/{core => ext/genmath}/gmfft_gpu.cu (54%) delete mode 100644 cpp/src/ext/utils/utidev.cpp delete mode 100644 cpp/src/ext/utils/utidev.h diff --git a/cpp/gcc/Makefile b/cpp/gcc/Makefile index 69864471..23797d0f 100644 --- a/cpp/gcc/Makefile +++ b/cpp/gcc/Makefile @@ -27,6 +27,12 @@ else endif endif +# HG30112023 +CUDA_PATH ?= /usr/local/cuda +CUDA_MATHLIBS_PATH ?= /usr/local/cuda +NVCC = $(CUDA_PATH)/bin/nvcc +NVCXX = $(CUDA_PATH)/bin/nvc++ + SRW_SRC_DEF= -D_GNU_SOURCE -D__USE_XOPEN2K8 -DFFTW_ENABLE_FLOAT -D_GM_WITHOUT_BASE -DSRWLIB_STATIC -DNO_TIMER -DANSI_DECLARATORS -DTRILIBRARY $(OSFLAG) SRW_INCLUDES= -I$(SRW_SRC_GEN_DIR) -I$(SRW_SRC_LIB_DIR) -I$(SH_SRC_PARSE_DIR) -I$(SH_SRC_GEN_MATH_DIR) $(SRW_SRC_DEF) SRW_CFLAGS= -O3 -fPIC @@ -35,6 +41,17 @@ LDFLAGS=-L$(LIB_DIR) -lm ifeq ($(MODE), omp) SRW_CFLAGS+= -D_WITH_OMP -fopenmp -Wno-write-strings LDFLAGS+= -lfftw +else #HG30112023 +ifeq ($(MODE), cuda) +CUDA_INCLUDES = -I$(CUDA_PATH)/include -I$(CUDA_MATHLIBS_PATH)/include +CUDA_LIBS = -L$(CUDA_PATH)/lib64 -L$(CUDA_MATHLIBS_PATH)/lib64 + +SRW_SRC_DEF += -D_OFFLOAD_GPU -DUSE_CUDA -D_FFTW3 +SRW_INCLUDES += $(CUDA_INCLUDES) +SRW_CFLAGS += -std=c++17 +LDFLAGS += $(CUDA_LIBS) -lcudart_static -lcudadevrt -lcufft -lrt +NVCFLAGS = -O3 -arch=sm_80 -dlto -rdc=true +CUDA_OBJ=gmfft_gpu.o srradstr_gpu.o sroptelm_gpu.o sroptdrf_gpu.o sroptgtr_gpu.o srradmnp_gpu.o else ifeq ($(MODE), 0) SRW_CFLAGS+= -D_FFTW3 @@ -43,6 +60,7 @@ else $(error Unknown SRW compilation option) endif endif +endif PYFLAGS=-I$(shell python -c "from __future__ import print_function; from 
sysconfig import get_paths as gp; print(gp()['include'])") PYFLAGS+=-L$(shell python -c "from __future__ import print_function; from sysconfig import get_paths as gp; import os; print(os.path.join(gp()['stdlib'], '../libs'))") @@ -71,6 +89,10 @@ OBJ += timerec.o track.o srerror.o # src/lib OBJ += srwlib.o +# HG30112023 +ifeq ($(MODE), cuda) +OBJ += auxgpu.o +endif PRG= libsrw.a @@ -89,6 +111,24 @@ PRG= libsrw.a %.o: $(SRW_SRC_GENESIS_DIR)/%.c $(CC) $(CFLAGS) -c $< +# HG30112023 +ifeq ($(MODE), cuda) +lib: $(CUDA_OBJ) $(OBJ) + $(NVCC) $(NVCFLAGS) -Xcompiler="$(SRW_CFLAGS)" -dlink -o srwl_link.o *.o $(LDFLAGS) + ar -cvq $(PRG) *.o + #cp $(PRG) $(LIB_DIR)/ + rm -f *.o + +%.o: $(SRW_SRC_LIB_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +%.o: $(SH_SRC_GEN_MATH_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +%.o: $(SRW_SRC_GEN_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +else lib: $(OBJ) ar -cvq $(PRG) *.o #cp $(PRG) $(LIB_DIR)/ @@ -102,6 +142,7 @@ lib: $(OBJ) %.o: $(SRW_SRC_GEN_DIR)/%.cu $(NVCC) -x=c++ -Xcompiler="$(CFLAGS)" -c $< +endif pylib: $(CXX) -shared $(CFLAGS) $(PYFLAGS) -o srwlpy.so $(SRW_SRC_DIR)/clients/python/srwlpy.cpp libsrw.a $(LDFLAGS) diff --git a/cpp/py/setup.py b/cpp/py/setup.py index 013e8824..075520e1 100644 --- a/cpp/py/setup.py +++ b/cpp/py/setup.py @@ -20,7 +20,11 @@ if 'MODE' in os.environ: sMode = str(os.environ['MODE']) - if sMode == 'omp': + if sMode == 'cuda': # HG30112023 + ext_kwargs.update({'libraries': ['srw', 'm', 'cudart_static', 'cudadevrt', 'cufft', 'fftw3f', 'fftw3', 'rt'], 'extra_compile_args': ['-O3', '-mavx2', '-fno-math-errno']}) + ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_PATH'])) + ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_MATHLIBS_PATH'])) + elif sMode == 'omp': #ext_kwargs.update({'extra_link_args': ['-fopenmp'], 
ext_kwargs.update({'libraries': ['srw', 'm', 'fftw'], #OC07022019 'extra_link_args': ['-fopenmp'], diff --git a/cpp/src/core/gmfft.cpp b/cpp/src/core/gmfft.cpp deleted file mode 100644 index 6e59db8a..00000000 --- a/cpp/src/core/gmfft.cpp +++ /dev/null @@ -1,1555 +0,0 @@ -/************************************************************************//** - * File: gmfft.cpp - * Description: Auxiliary utilities to work with FFTW library - * Project: - * First release: 2000 - * - * Copyright (C) European Synchrotron Radiation Facility, Grenoble, France - * All Rights Reserved - * - * @author O.Chubar, P.Elleaume - * @author S. Yakubov (E-XFEL) - noticed issue and suggested fix in FFT1D - * @version 1.1 - ***************************************************************************/ - -#include "gmfft.h" - -#ifdef _OFFLOAD_GPU -#include "gmfft_gpu.h" -#endif - -//#include "srwlib.h" //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP - -#ifdef _WITH_OMP //OC27102018 -//SY: adopted for OpenMP -#include "omp.h" -#endif - -//************************************************************************* - -long CGenMathFFT::GoodNumbers[] = { - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 36, 40, 42, 44, - 48, 50, 52, 54, 56, 60, 64, 66, 70, 72, 78, 80, 84, 88, 90, 96, 98, 100, 104, - 108, 110, 112, 120, 126, 128, 130, 132, 140, 144, 150, 154, 156, 160, 162, - 168, 176, 180, 182, 192, 196, 198, 200, 208, 210, 216, 220, 224, 234, 240, - 250, 252, 256, 260, 264, 270, 280, 286, 288, 294, 300, 308, 312, 320, 324, - 330, 336, 350, 352, 360, 364, 378, 384, 390, 392, 396, 400, 416, 420, 432, - 440, 448, 450, 462, 468, 480, 486, 490, 500, 504, 512, 520, 528, 540, 546, - 550, 560, 572, 576, 588, 594, 600, 616, 624, 630, 640, 648, 650, 660, 672, - 686, 700, 702, 704, 720, 728, 750, 756, 768, 770, 780, 784, 792, 800, 810, - 832, 840, 858, 864, 880, 882, 896, 900, 910, 924, 936, 960, 972, 980, 990, - 1000, 1008, 1024, 1040, 1050, 1056, 1078, 1080, 1092, 1100, 1120, 
1134, 1144, - 1152, 1170, 1176, 1188, 1200, 1232, 1248, 1250, 1260, 1274, 1280, 1296, 1300, - 1320, 1344, 1350, 1372, 1386, 1400, 1404, 1408, 1430, 1440, 1456, 1458, 1470, - 1500, 1512, 1536, 1540, 1560, 1568, 1584, 1600, 1620, 1638, 1650, 1664, 1680, - 1716, 1728, 1750, 1760, 1764, 1782, 1792, 1800, 1820, 1848, 1872, 1890, 1920, - 1944, 1950, 1960, 1980, 2000, 2002, 2016, 2048, 2058, 2080, 2100, 2106, 2112, - 2156, 2160, 2184, 2200, 2240, 2250, 2268, 2288, 2304, 2310, 2340, 2352, 2376, - 2400, 2430, 2450, 2464, 2496, 2500, 2520, 2548, 2560, 2574, 2592, 2600, 2640, - 2646, 2688, 2700, 2730, 2744, 2750, 2772, 2800, 2808, 2816, 2860, 2880, 2912, - 2916, 2940, 2970, 3000, 3024, 3072, 3080, 3120, 3136, 3150, 3168, 3200, 3234, - 3240, 3250, 3276, 3300, 3328, 3360, 3402, 3430, 3432, 3456, 3500, 3510, 3520, - 3528, 3564, 3584, 3600, 3640, 3696, 3744, 3750, 3780, 3822, 3840, 3850, 3888, - 3900, 3920, 3960, 4000, 4004, 4032, 4050, 4096, 4116, 4158, 4160, 4200, 4212, - 4224, 4290, 4312, 4320, 4368, 4374, 4400, 4410, 4480, 4500, 4536, 4550, 4576, - 4608, 4620, 4680, 4704, 4752, 4800, 4802, 4860, 4900, 4914, 4928, 4950, 4992, - 5000, 5040, 5096, 5120, 5148, 5184, 5200, 5250, 5280, 5292, 5346, 5376, - 5390, 5400, 5460, 5488, 5500, 5544, 5600, 5616, 5632, 5670, 5720, 5760, 5824, - 5832, 5850, 5880, 5940, 6000, 6006, 6048, 6144, 6160, 6174, 6240, 6250, 6272, - 6300, 6318, 6336, 6370, 6400, 6468, 6480, 6500, 6552, 6600, 6656, 6720, 6750, - 6804, 6860, 6864, 6912, 6930, 7000, 7020, 7040, 7056, 7128, 7150, 7168, 7200, - 7280, 7290, 7350, 7392, 7488, 7500, 7546, 7560, 7644, 7680, 7700, 7722, 7776, - 7800, 7840, 7920, 7938, 8000, 8008, 8064, 8100, 8190, 8192, 8232, 8250, 8316, - 8320, 8400, 8424, 8448, 8580, 8624, 8640, 8736, 8748, 8750, 8800, 8820, 8910, - 8918, 8960, 9000, 9072, 9100, 9152, 9216, 9240, 9360, 9408, 9450, 9504, 9600, - 9604, 9702, 9720, 9750, 9800, 9828, 9856, 9900, 9984, 10000, 10010, 10080, - 10192, 10206, 10240, 10290, 10296, 10368, 10400, 10500, 10530, 10560, 
10584, - 10692, 10752, 10780, 10800, 10920, 10976, 11000, 11088, 11200, 11232, 11250, - 11264, 11340, 11440, 11466, 11520, 11550, 11648, 11664, 11700, 11760, 11880, - 12000, 12012, 12096, 12150, 12250, 12288, 12320, 12348, 12474, 12480, 12500, - 12544, 12600, 12636, 12672, 12740, 12800, 12870, 12936, 12960, 13000, 13104, - 13122, 13200, 13230, 13312, 13440, 13500, 13608, 13650, 13720, 13728, 13750, - 13824, 13860, 14000, 14014, 14040, 14080, 14112, 14256, 14300, 14336, 14400, - 14406, 14560, 14580, 14700, 14742, 14784, 14850, 14976, 15000, 15092, 15120, - 15288, 15360, 15400, 15444, 15552, 15600, 15680, 15750, 15840, 15876, 16000, - 16016, 16038, 16128, 16170, 16200, 16250, 16380, 16384, 16464, 16500, 16632, - 16640, 16800, 16848, 16896, 17010, 17150, 17160, 17248, 17280, 17472, 17496, - 17500, 17550, 17600, 17640, 17820, 17836, 17920, 18000, 18018, 18144, 18200, - 18304, 18432, 18480, 18522, 18720, 18750, 18816, 18900, 18954, 19008, 19110, - 19200, 19208, 19250, 19404, 19440, 19500, 19600, 19656, 19712, 19800, 19968, - 20000, 20020, 20160, 20250, 20384, 20412, 20480, 20580, 20592, 20736, 20790, - 20800, 21000, 21060, 21120, 21168, 21384, 21450, 21504, 21560, 21600, 21840, - 21870, 21952, 22000, 22050, 22176, 22400, 22464, 22500, 22528, 22638, 22680, - 22750, 22880, 22932, 23040, 23100, 23166, 23296, 23328, 23400, 23520, 23760, - 23814, 24000, 24010, 24024, 24192, 24300, 24500, 24570, 24576, 24640, 24696, - 24750, 24948, 24960, 25000, 25088, 25200, 25272, 25344, 25480, 25600, 25740, - 25872, 25920, 26000, 26208, 26244, 26250, 26400, 26460, 26624, 26730, 26754, - 26880, 26950, 27000, 27216, 27300, 27440, 27456, 27500, 27648, 27720, 28000, - 28028, 28080, 28160, 28224, 28350, 28512, 28600, 28672, 28800, 28812, 29106, - 29120, 29160, 29250, 29400, 29484, 29568, 29700, 29952, 30000, 30030, 30184, - 30240, 30576, 30618, 30720, 30800, 30870, 30888, 31104, 31200, 31250, 31360, - 31500, 31590, 31680, 31752, 31850, 32000, 32032, 32076, 32256, 32340, 32400, - 32500, 32760, 
32768, 32928, 33000, 33264, 33280, 33600, 33614, 33696, 33750, - 33792, 34020, 34300, 34320, 34398, 34496, 34560, 34650, 34944, 34992, 35000, - 35100, 35200, 35280, 35640, 35672, 35750, 35840, 36000, 36036, 36288, 36400, - 36450, 36608, 36750, 36864, 36960, 37044, 37422, 37440, 37500, 37632, 37730, - 37800, 37908, 38016, 38220, 38400, 38416, 38500, 38610, 38808, 38880, 39000, - 39200, 39312, 39366, 39424, 39600, 39690, 39936, 40000, 40040, 40320, 40500, - 40768, 40824, 40950, 40960, 41160, 41184, 41250, 41472, 41580, 41600, 42000, - 42042, 42120, 42240, 42336, 42768, 42900, 43008, 43120, 43200, 43218, 43680, - 43740, 43750, 43904, 44000, 44100, 44226, 44352, 44550, 44590, 44800, 44928, - 45000, 45056, 45276, 45360, 45500, 45760, 45864, 46080, 46200, 46332, 46592, - 46656, 46800, 47040, 47250, 47520, 47628, 48000, 48020, 48048, 48114, 48384, - 48510, 48600, 48750, 49000, 49140, 49152, 49280, 49392, 49500, 49896, 49920, - 50000, 50050, 50176, 50400, 50544, 50688, 50960, 51030, 51200, 51450, 51480, - 51744, 51840, 52000, 52416, 52488, 52500, 52650, 52800, 52822, 52920, 53248, - 53460, 53508, 53760, 53900, 54000, 54054, 54432, 54600, 54880, 54912, 55000, - 55296, 55440, 55566, 56000, 56056, 56160, 56250, 56320, 56448, 56700, 56862, - 57024, 57200, 57330, 57344, 57600, 57624, 57750, 58212, 58240, 58320, 58500, - 58800, 58968, 59136, 59400, 59904, 60000, 60060, 60368, 60480, 60750, 61152, - 61236, 61250, 61440, 61600, 61740, 61776, 62208, 62370, 62400, 62426, 62500, - 62720, 63000, 63180, 63360, 63504, 63700, 64000, 64064, 64152, 64350, 64512, - 64680, 64800, 65000, 65520, 65536, 65610, 65856, 66000, 66150, 66528, 66560, - 67200, 67228, 67392, 67500, 67584, 67914, 68040, 68250, 68600, 68640, 68750, - 68796, 68992, 69120, 69300, 69498, 69888, 69984, 70000, 70070, 70200, 70400, - 70560, 71280, 71344, 71442, 71500, 71680, 72000, 72030, 72072, 72576, 72800, - 72900, 73216, 73500, 73710, 73728, 73920, 74088, 74250, 74844, 74880, 75000, - 75264, 75460, 75600, 75816, 76032, 
76440, 76800, 76832, 77000, 77220, 77616, - 77760, 78000, 78400, 78624, 78732, 78750, 78848, 79200, 79380, 79872, 80000, - 80080, 80190, 80262, 80640, 80850, 81000, 81250, 81536, 81648, 81900, 81920, - 82320, 82368, 82500, 82944, 83160, 83200, 84000, 84084, 84240, 84480, 84672, - 85050, 85536, 85750, 85800, 86016, 86240, 86400, 86436, 87318, 87360, 87480, - 87500, 87750, 87808, 88000, 88200, 88452, 88704, 89100, 89180, 89600, 89856, - 90000, 90090, 90112, 90552, 90720, 91000, 91520, 91728, 91854, 92160, 92400, - 92610, 92664, 93184, 93312, 93600, 93750, 94080, 94500, 94770, 95040, 95256, - 95550, 96000, 96040, 96096, 96228, 96250, 96768, 97020, 97200, 97500, 98000, - 98098, 98280, 98304, 98560, 98784, 99000, 99792, 99840, 100000 -}; -long CGenMathFFT::LenGoodNumbers = 1151; //637; - -long CGenMathFFT::GoodNum100s[] = { 0,37,61,79,95,107,120,130,142,151,159 }; -long CGenMathFFT::LenGoodNum100s = 11; - -long CGenMathFFT::GoodNum1000s[] = { 0,159,228,279,318,354,383,410,435,459,479 }; -long CGenMathFFT::LenGoodNum1000s = 11; - -long CGenMathFFT::GoodNum10000s[] = { 0,479,636,743,830,900,960,1017,1064,1109,1150 }; -long CGenMathFFT::LenGoodNum10000s = 11; - -#ifdef _OFFLOAD_GPU -long CGenMathFFT1D::PlanLen; -long CGenMathFFT1D::dPlanLen; -long CGenMathFFT1D::HowMany; -long CGenMathFFT1D::dHowMany; -cufftHandle CGenMathFFT1D::Plan1DFFT_cu; -cufftHandle CGenMathFFT1D::dPlan1DFFT_cu; -#endif - -#ifdef _OFFLOAD_GPU -long CGenMathFFT2D::PlanNx; -long CGenMathFFT2D::PlanNy; -long CGenMathFFT2D::HowMany; -long CGenMathFFT2D::dPlanNx; -long CGenMathFFT2D::dPlanNy; -long CGenMathFFT2D::dHowMany; -cufftHandle CGenMathFFT2D::Plan2DFFT_cu; -cufftHandle CGenMathFFT2D::dPlan2DFFT_cu; -#endif -//************************************************************************* - -void CGenMathFFT::NextCorrectNumberForFFT(long& n) -//void CGenMathFFT::NextCorrectNumberForFFT(long long& n) //OC26042019 -{ - if(n < 4) - { - n = 4; return; - } - if(n < 100001) - { - long *pGoodPrev, *pGoodNext; - 
- long n_d_10000 = long(n*0.0001); - if(n_d_10000 > 0) pGoodPrev = GoodNumbers + GoodNum10000s[n_d_10000] - 1; - else - { - long n_d_1000 = long(n*0.001); - if(n_d_1000 > 0) pGoodPrev = GoodNumbers + GoodNum1000s[n_d_1000] - 1; - else - { - long n_d_100 = long(n*0.01); - if(n_d_100 > 0) pGoodPrev = GoodNumbers + GoodNum100s[n_d_100] - 1; - else pGoodPrev = GoodNumbers; - } - } - pGoodNext = pGoodPrev + 1; - for(;;) - { - if((n > *(pGoodPrev++)) && (n <= *pGoodNext)) - { - n = *pGoodNext; return; - } - pGoodNext++; - } - } - else - { - //OC23072020: sorted multiplies by ratios of power of first prime numbers bw 1 and 2 - const double arTestMults[] = {10./9., 9./8., 6./5., 5./4., 4./3., 3./2., 8./5., 5./3., 16./9., 15./8.}; - const int nTestMults = 10; - - //long k = 16384; - //long k = 65536; - long k = 99000; //OC23072020 (make sure this number is < 100001, and divides by 9,8,5) - - for(int j=0; j<100; j++) - { - //OC23072020 (added tests of intermed numbers obtained by multiplying k by a factor bw 1 and 2) - bool intermedNumFound = false; - for(int m=0; m= 0.5) kTest++; - if(n <= kTest) - { - n = kTest; - intermedNumFound = true; - break; - } - } - if(intermedNumFound) break; - - k <<= 1; - if(n <= k) - { - n = k; break; - } - } - } -} - -//************************************************************************* -//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) -//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG18072022 -int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 -{ -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, - { - //HG03082022 GPU can do an inplace fft without being given a temporary buffer - FFT1DInfo.pOutData = FFT1DInfo.pInData; - int result; - if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 - //if (result = 
Make1DFFT(FFT1DInfo, pGpuUsage)) return result; - }) - else -#endif - { - //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; - long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); - float* AuxDataCont = new float[TotAmOfPo]; - if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; - FFT1DInfo.pOutData = AuxDataCont; - - int result; - if(result = Make1DFFT(FFT1DInfo)) return result; - - float *tOut = FFT1DInfo.pInData, *t = AuxDataCont; - for(int ix=0; ix RelShiftTol*xStepNx); - NeedsShiftAfterY = (::fabs(y0_After) > RelShiftTol*yStepNy); - - double xStartTr = -0.5/FFT2DInfo.xStep; - double yStartTr = -0.5/FFT2DInfo.yStep; - - NeedsShiftBeforeX = NeedsShiftBeforeY = 0; - double x0_Before = 0., y0_Before = 0.; - if(FFT2DInfo.UseGivenStartTrValues) - { - x0_Before = (FFT2DInfo.xStartTr - xStartTr); // Sign should be probably reversed here: check!!! - y0_Before = (FFT2DInfo.yStartTr - yStartTr); // Sign should be probably reversed here: check!!! - - NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); - NeedsShiftBeforeY = (::fabs(y0_Before) > RelShiftTol*(::fabs(yStartTr))); - } - - //ArrayShiftX = 0; ArrayShiftY = 0; - m_ArrayShiftX = 0; m_ArrayShiftY = 0; //OC02022019 - m_dArrayShiftX = 0; m_dArrayShiftY = 0; - if (FFT2DInfo.pData != 0) - { - if (NeedsShiftBeforeX || NeedsShiftAfterX) - { - //ArrayShiftX = new float[Nx << 1]; - //if(ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - } - if (NeedsShiftBeforeY || NeedsShiftAfterY) - { - //ArrayShiftY = new float[Ny << 1]; - //if(ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; - m_ArrayShiftY = new float[Ny << 1]; - if (m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; - } - } - else if (FFT2DInfo.pdData != 0) - { - if (NeedsShiftBeforeX || NeedsShiftAfterX) - { - m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return 
MEMORY_ALLOCATION_FAILURE; - } - if (NeedsShiftBeforeY || NeedsShiftAfterY) - { - m_dArrayShiftY = new double[Ny << 1]; - if (m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; - } - } - -#ifdef _FFTW3 - fftwf_plan Plan2DFFT; - fftw_plan dPlan2DFFT; - fftwf_complex* DataToFFT = 0; - fftw_complex* dDataToFFT = 0; -#endif - -//HG18072022 -//#ifdef _DEBUG -// if (pGpuUsage != NULL) -// printf ("GPU: Make2DFFT\n"); -//#endif - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG02112021 - { - if(FFT2DInfo.pData != 0) - { - DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(float)); //OC06092023 - //DataToFFT = (fftwf_complex*)AuxGPU::ToDevice(pGpuUsage, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(float)); - } - else if(FFT2DInfo.pdData != 0) - { - dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); //OC06092023 - //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); - } - }) - else -#endif - { -#if _FFTW3 //OC28012019 - if (FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); - else if (FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 - -#else - fftwnd_plan Plan2DFFT; - FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT2DInfo.pData); -#endif - } - - char t0SignMult = (FFT2DInfo.Dir > 0)? 
-1 : 1; - - //if(NeedsShiftBeforeX) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep); - //if(NeedsShiftBeforeY) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep); - if(NeedsShiftBeforeX) - {//OC02022019 - if(m_ArrayShiftX != 0) - FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) - FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); - } - if(NeedsShiftBeforeY) - {//OC02022019 - if(m_ArrayShiftY != 0) - FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) - FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); - //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); - //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); -#endif - - if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, { //OC06092023 - //GPU_COND(pGpuUsage, { - TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { - m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); - //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); - //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, 
m_ArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); - TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_ArrayShiftX, m_ArrayShiftY); - m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); - //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - } - else if (dDataToFFT != 0) { - m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); - //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); - //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); - TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_dArrayShiftX, m_dArrayShiftY); - m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); - //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - } - }) - else -#endif - { - 
if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 -#endif - } - } - - bool alreadyNormalized = false; //HG17032022 - //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; - double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 - if (FFT2DInfo.Dir > 0) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG02112021 - { - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) - { - if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 - //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) - { - if (Plan2DFFT_cu != NULL) - { - cufftDestroy(Plan2DFFT_cu); - Plan2DFFT_cu = NULL; - } - - PlanNx = Nx; - PlanNy = Ny; - HowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); - //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); - } - } - else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; - - auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); -// if (res != CUFFT_SUCCESS) -// printf("CUFFT Error: %d\r\n", res); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) - { - if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 - //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) - { - if (dPlan2DFFT_cu != NULL) - { - cufftDestroy(dPlan2DFFT_cu); - dPlan2DFFT_cu = NULL; - } - - dPlanNx = Nx; - dPlanNy = Ny; - HowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - 
cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); - //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); - } - } - else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; - - cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); - } - }) - else -#endif - { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#if _FFTW3 //OC28012019 - - for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) - { - long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - - fftwf_execute(Plan2DFFT); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; - - fftw_execute(dPlan2DFFT); - } - } - -#else - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); -#endif - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - if (DataToFFT != 0) - { - //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, 
FFT2DInfo.howMany, (float)Mult); //OC06092023 - } - else if (dDataToFFT != 0) - { - //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - } - alreadyNormalized = true; - }) - else -#endif - { - if (DataToFFT != 0) - { - RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - } - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) - { - RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - } -#endif - } - } - else - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) { - if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 - //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) - { - if (Plan2DFFT_cu != NULL){ - cufftDestroy(Plan2DFFT_cu); - Plan2DFFT_cu = NULL; - } - - PlanNx = Nx; - PlanNy = Ny; - HowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); - //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); - } - } - else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) { - if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && 
dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 - //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) - { - if (dPlan2DFFT_cu != NULL){ - cufftDestroy(dPlan2DFFT_cu); - dPlan2DFFT_cu = NULL; - } - - dPlanNx = Nx; - dPlanNy = Ny; - dHowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); - //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); - } - } - else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_INVERSE); - } - }) - else -#endif - { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) - { - long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - fftwf_execute(Plan2DFFT); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - fftw_execute(dPlan2DFFT); - } - } -#else 
- if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(DataToFFT); - RepairSignAfter2DFFT(DataToFFT); - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); -#endif - } - } - - if (!alreadyNormalized){ -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - if (DataToFFT != 0) - NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - else if (dDataToFFT != 0) - NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - }) - else -#endif - { - if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult, FFT2DInfo.howMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult, FFT2DInfo.howMany); -#endif - } - } - - //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); - //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); - - if (NeedsShiftAfterX) - {//OC02022019 - if (m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); - } - if (NeedsShiftAfterY) - {//OC02022019 - if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); - else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); - } - if (NeedsShiftAfterX || NeedsShiftAfterY) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { - m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * 
sizeof(float), false); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); - //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); - //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); - TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); - m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); - //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - } - else if (dDataToFFT != 0) { - m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); - //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); - //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); - TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, 
m_dArrayShiftX, m_dArrayShiftY); - m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); - //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - } - }) - else -#endif - { - if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 -#endif - } - } - - //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") - //fftwnd_destroy_plan(Plan2DFFT); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG02112021 - { - if (FFT2DInfo.pData != 0) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, DataToFFT, true, false); - } - else if (FFT2DInfo.pdData != 0) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, dDataToFFT, true, false); - } - }) - else -#endif - { -#if _FFTW3 //OC28012019 - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); - } - else if (dDataToFFT != 0) //OC03022019 - { - if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); - } -#else - if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); -#endif - } - - //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} - //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} - if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} - if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} - if 
(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 - if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} - - return 0; -} - -//************************************************************************* -//Forward FFT: Int f(x)*exp(-i*2*Pi*qx*x)dx -//Backward FFT: Int f(qx)*exp(i*2*Pi*qx*x)dqx -//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) -//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG20012022 -int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC05092023 -{// Assumes Nx, Ny even ! - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //double start; - //get_walltime (&start); - - const double RelShiftTol = 1.E-06; - - SetupLimitsTr(FFT1DInfo); - - double xStepNx = FFT1DInfo.Nx*FFT1DInfo.xStep; - double x0_After = FFT1DInfo.xStart + 0.5*xStepNx; - NeedsShiftAfterX = FFT1DInfo.ApplyAutoShiftAfter && (::fabs(x0_After) > RelShiftTol*xStepNx); - - double xStartTr = -0.5/FFT1DInfo.xStep; - - NeedsShiftBeforeX = 0; - double x0_Before = 0.; - - if(FFT1DInfo.UseGivenStartTrValue) - { - x0_Before = (FFT1DInfo.xStartTr - xStartTr); - NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); - } - - m_ArrayShiftX = 0; - m_dArrayShiftX = 0; - if (NeedsShiftBeforeX || NeedsShiftAfterX) - { - if (FFT1DInfo.pInData != 0) - { - m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - -#ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) 
- m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); - //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 -#endif - } - else if (FFT1DInfo.pdInData != 0) - { - m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - -#ifdef _OFFLOAD_GPU //OC05092023 - m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); - //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //HG20012022 -#endif - } - } - -#ifdef _FFTW3 //OC28012019 - fftwf_plan Plan1DFFT; - fftwf_complex* DataToFFT = 0, * OutDataFFT = 0; //, *pOutDataFFT=0; - - fftw_plan dPlan1DFFT; - fftw_complex* dDataToFFT = 0, * dOutDataFFT = 0; //, *pdOutDataFFT=0; -#endif - -//HG20012022 -//#ifdef _DEBUG -// if (pGpuUsage != NULL) -// printf ("GPU: Make1DFFT\n"); -//#endif -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG20012022 - { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 - OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); - //DataToFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); - //OutDataFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); - } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) - { - dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * 
FFT1DInfo.HowMany * 2 * sizeof(double)); //OC06092023 - dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); - //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); - //dOutDataFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); - } - }) - else -#endif - { -#ifdef _FFTW3 //OC28012019 - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); - OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); - //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call - } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) - { - dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); - dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); - //pdOutDataFFT = dOutDataFFT; - } -#else - fftw_plan Plan1DFFT; - FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); - FFTW_COMPLEX* OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); - FFTW_COMPLEX* pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call - /** - Pointed-out by Sergey Yakubov (E-XFEL). - From FFTW 2.1.5 docs: - void fftw(fftw_plan plan, int howmany, - fftw_complex *in, int istride, int idist, - fftw_complex *out, int ostride, int odist); - ... - out, ostride and odist describe the output array(s). The format is the same as for the input array. - In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. - If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, - that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. 
- In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). - **/ -#endif - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); - //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); - //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); -#endif - - char t0SignMult = (FFT1DInfo.Dir > 0) ? -1 : 1; - if (NeedsShiftBeforeX) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); - - if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); - }) - else -#endif - { - //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); - - if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); -#endif - } - } - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : before fft",&start); - - int flags = FFTW_ESTIMATE; //OC30012019 - bool alreadyNormalized = false; //HG17032022 - //double Mult = FFT1DInfo.xStep; - double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; - - if (FFT1DInfo.Dir > 0) //HG17112021 - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, - { - int arN[] = { (int)Nx }; //OC14052020 - if (DataToFFT != 0) - { - if (PlanLen != Nx) { - PlanLen = Nx; - if (Plan1DFFT_cu != NULL) - { - cufftDestroy(Plan1DFFT_cu); - Plan1DFFT_cu = NULL; - } - cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); - } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; - cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); - } - else if (dDataToFFT != 0) //OC02022019 - { - if (dPlanLen != Nx) { - if (dPlan1DFFT_cu != NULL) - { - cufftDestroy(dPlan1DFFT_cu); - dPlan1DFFT_cu = NULL; - } - dPlanLen = Nx; - cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); - } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; - cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); - } - }) - else -#endif - { - //int flags = FFTW_ESTIMATE; -#ifdef _FFTW3 //OC28012019 -#ifdef _WITH_OMP - //Still needs to be tested! 
- if (DataToFFT != 0) - { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if (dDataToFFT != 0) //OC02022019 - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } -#endif //ifndef _WITH_OMP - int arN[] = { (int)Nx }; //OC14052020 - //int arN[] = {Nx}; - if (DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if (dDataToFFT != 0) //OC02022019 - { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; - fftw_execute(dPlan1DFFT); - } - -#else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); - -#ifndef _WITH_OMP //OC27102018 - //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 -#else //OC27102018 - //SY: split one call into many (for OpenMP) -#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for (int i = 0; i < FFT1DInfo.HowMany; i++) - { - //SY: do not use OutDataFFT as scratch space if in-place - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); - else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); - } -#endif -#endif - } - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft dir>0",&start); - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - if (OutDataFFT != 0) - { - RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 - //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); - //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); - } - else if (dOutDataFFT != 0) - { - RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); - //RotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); - } - alreadyNormalized = true; - }) - else -#endif - { - if (OutDataFFT != 0) - { - RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - } -#ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) - { - RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(dOutDataFFT, 
FFT1DInfo.HowMany); - } -#endif - } - } - else - { - //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - int arN[] = { (int)Nx }; //OC14052020 - //int arN[] = {Nx}; - if (DataToFFT != 0) - { - if (PlanLen != Nx) { - PlanLen = Nx; - HowMany = FFT1DInfo.HowMany; - if (Plan1DFFT_cu != NULL) - { - cufftDestroy(Plan1DFFT_cu); - Plan1DFFT_cu = NULL; - } - cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); - } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); - RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); - cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); - } - else if (dDataToFFT != 0) //OC02022019 - { - if (dPlanLen != Nx) - { - dPlanLen = Nx; - dHowMany = FFT1DInfo.HowMany; - if (dPlan1DFFT_cu != NULL) - { - cufftDestroy(dPlan1DFFT_cu); - dPlan1DFFT_cu = NULL; - } - cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); - } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); - RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); - cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_INVERSE); - } - }) - else -#endif - { -#ifdef _FFTW3 //OC28012019 -#ifdef _WITH_OMP - - //Still needs to be tested! 
- if (DataToFFT != 0) - { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if (dDataToFFT != 0) - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } - -#endif - int arN[] = { (int)Nx }; //OC14052020 - //int arN[] = {Nx}; - if (DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - - fftwf_execute(Plan1DFFT); - } - else if (dDataToFFT != 0) //OC02022019 - { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); - fftw_execute(dPlan1DFFT); - } -#else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); - - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); - - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : repair dir<0",&start); - -#ifndef _WITH_OMP //OC27102018 - //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 -#else //OC27102018 - //SY: split one call into many (for OpenMP) -#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for (int i = 0; i < FFT1DInfo.HowMany; i++) - { - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); - else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); - } -#endif -#endif //_FFTW3 - } - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft dir<0",&start); - } - - if (!alreadyNormalized) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, - { - if (OutDataFFT != 0) { - NormalizeDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - } - else if (dOutDataFFT != 0) - NormalizeDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - }) - else -#endif - { - if (OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); -#ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); -#endif - } - } - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : NormalizeDataAfter1DFFT",&start); - - if (NeedsShiftAfterX) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, - { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); - - if (OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); - }) - else -#endif - { - //FillArrayShift(t0SignMult*x0_After, FFT1DInfo.xStepTr); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); - - if (OutDataFFT != 0) TreatShift(OutDataFFT, FFT1DInfo.HowMany); -#ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); -#endif - } - } - - if(FFT1DInfo.TreatSharpEdges) - { - int result = ProcessSharpEdges(FFT1DInfo); - if(result) return result; - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, OutDataFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, OutDataFFT, true, false); - } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dOutDataFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, dOutDataFFT, true, false); - } - }) - else -#endif - { - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : ProcessSharpEdges",&start); - - //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") - //OC27102018: thread safety issue? -#ifdef _FFTW3 //OC29012019 - - if(DataToFFT != 0) fftwf_destroy_plan(Plan1DFFT); - else if(dDataToFFT != 0) fftw_destroy_plan(dPlan1DFFT); - -#ifdef _WITH_OMP - - if(DataToFFT != 0) fftwf_cleanup_threads(); //?? - else if(dDataToFFT != 0) fftw_cleanup_threads(); - -#endif -#else //ifndef _FFTW3 - - fftw_destroy_plan(Plan1DFFT); - -#endif - } - - if (m_ArrayShiftX != 0) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 - //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); -#endif - delete[] m_ArrayShiftX; - } - if (m_dArrayShiftX != 0) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 - //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); -#endif - delete[] m_dArrayShiftX; - } - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : after fft ",&start); - return 0; -} - -//************************************************************************* - -int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr, char dataType) -//int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr) -{ - double Step = FFT1DInfo.xStep, Start = FFT1DInfo.xStart; - double AbsTol = 0.05*Step; - - double EdgeMinOffsetFromStart = FFT1DInfo.LeftSharpEdge - Start; - long iEdgeMinLower = long(EdgeMinOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! - double EdgeMinLowerMisfit = EdgeMinOffsetFromStart - iEdgeMinLower*Step; - - double EdgeMaxOffsetFromStart = FFT1DInfo.RightSharpEdge - Start; - long iEdgeMaxLower = long(EdgeMaxOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! 
- double EdgeMaxLowerMisfit = EdgeMaxOffsetFromStart - iEdgeMaxLower*Step; - - char EdgeMinIsBetweenMeshPoints = (EdgeMinLowerMisfit > AbsTol); - char EdgeMaxIsBetweenMeshPoints = (EdgeMaxLowerMisfit > AbsTol); - char EdgeMaxIsSmallerThanDataEnd = (::fabs((Start + FFT1DInfo.Nx*Step) - FFT1DInfo.RightSharpEdge) > AbsTol); - char EdgeCorrNeeded = (EdgeMinIsBetweenMeshPoints || EdgeMaxIsBetweenMeshPoints || EdgeMaxIsSmallerThanDataEnd); - - //float dSt = 0.; - //if(EdgeMinIsBetweenMeshPoints) dSt = (float)(Step - EdgeMinLowerMisfit); - //float dFi = 0.; - //if(EdgeMaxIsBetweenMeshPoints) dFi = (float)(Step - EdgeMaxLowerMisfit); - //else if(EdgeMaxIsSmallerThanDataEnd) dFi = (float)(0.5*Step); - - //OC02022019 - double dSt = 0.; - if(EdgeMinIsBetweenMeshPoints) dSt = Step - EdgeMinLowerMisfit; - double dFi = 0.; - if(EdgeMaxIsBetweenMeshPoints) dFi = Step - EdgeMaxLowerMisfit; - else if(EdgeMaxIsSmallerThanDataEnd) dFi = 0.5*Step; - - CGenMathFFT1DInfo FFT1DInfoLoc = FFT1DInfo; - FFT1DInfoLoc.UseGivenStartTrValue = 0; - CGenMathFFT1D FFT1D; - FFT1D.SetupLimitsTr(FFT1DInfoLoc); - - if(EdgeCorrNeeded) - { - AuxDataForSharpEdgeCorr.d = Step; - long TwoN = FFT1DInfo.Nx << 1; - - if(dSt != 0.) - { - if(dataType == 'f') - { - AuxDataForSharpEdgeCorr.ExpArrSt = new float[TwoN]; - if(AuxDataForSharpEdgeCorr.ExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; - } - else if(dataType == 'd') //OC02022019 - { - AuxDataForSharpEdgeCorr.dExpArrSt = new double[TwoN]; - if(AuxDataForSharpEdgeCorr.dExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; - } - - AuxDataForSharpEdgeCorr.dSt = dSt; - long jSt = iEdgeMinLower + 1; - AuxDataForSharpEdgeCorr.iSt = jSt; - - double ArgjSt = Start + jSt*Step; - SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrSt, FFT1DInfoLoc.Nx, ArgjSt, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); - } - if(dFi != 0.) 
- { - if(dataType == 'f') - { - AuxDataForSharpEdgeCorr.ExpArrFi = new float[TwoN]; - if(AuxDataForSharpEdgeCorr.ExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; - } - else if(dataType == 'd') - { - AuxDataForSharpEdgeCorr.dExpArrFi = new double[TwoN]; - if(AuxDataForSharpEdgeCorr.dExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; - } - - AuxDataForSharpEdgeCorr.dFi = dFi; - double ArgjFi = Start + iEdgeMaxLower*Step; - AuxDataForSharpEdgeCorr.iFi = iEdgeMaxLower; - - SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrFi, FFT1DInfoLoc.Nx, ArgjFi, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); - } - AuxDataForSharpEdgeCorr.WasSetUp = 1; - } - return 0; -} - -//************************************************************************* - -void CGenMathFFT1D::MakeSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxData) -{ - double fSRe, fSIm, fFRe, fFIm; - double ExpStRe, ExpStIm, ExpFiRe, ExpFiIm, Re, Im; - long Two_i, Two_i_p_1; - - if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - float *t = FFT1DInfo.pOutData; - float *tSt = FFT1DInfo.pInData + (AuxData.iSt << 1); - float *tFi = FFT1DInfo.pInData + (AuxData.iFi << 1); - fSRe = *tSt, fSIm = *(tSt + 1); - fFRe = *tFi, fFIm = *(tFi + 1); - - for(long i=0; i -#include - -#ifndef _GM_WITHOUT_BASE -#include "gmobj.h" -#endif - -#ifdef _WITH_OMP //OC31102018: Pre-processor definition for compiling SRW with OpenMP library -#include "omp.h" -#endif - -#ifndef MEMORY_ALLOCATION_FAILURE -#define MEMORY_ALLOCATION_FAILURE 8 + 10000 //in line with SRW -#endif -#ifndef ERROR_IN_FFT -#define ERROR_IN_FFT 40 + 10000 -#endif - -//************************************************************************* - -class CGenMathFFT //{ -#ifndef _GM_WITHOUT_BASE - : public CGenMathObj -#endif -{//OC01052013 - double a2c, a4c, a6c, a8c, a10c, a12c; - double a3s, a5s, a7s, a9s, a11s, a13s; - -protected: - - static long GoodNumbers[]; - static long LenGoodNumbers; - static long GoodNum100s[]; - 
static long LenGoodNum100s; - static long GoodNum1000s[]; - static long LenGoodNum1000s; - static long GoodNum10000s[]; - static long LenGoodNum10000s; - -public: - - double HalfPI, PI, TwoPI, ThreePIdTwo, One_dTwoPI; // Constants - - CGenMathFFT() - { - HalfPI = 1.5707963267949; - PI = 3.141592653590; - TwoPI = 6.2831853071796; - ThreePIdTwo = 4.7123889803847; - One_dTwoPI = 0.1591549430919; - a2c = -0.5; a4c = 0.041666666666667; a6c = -0.0013888888888889; a8c = 0.000024801587301587; a10c = -2.755731922E-07; - a3s = -0.16666666666667; a5s = 0.0083333333333333; a7s = -0.0001984126984127; a9s = 2.755731922E-06; a11s = -2.505210839E-08; - } - - void CosAndSin(double x, float& Cos, float& Sin) - { - x -= TwoPI*int(x*One_dTwoPI); - if(x < 0.) x += TwoPI; - - char ChangeSign=0; - if(x > ThreePIdTwo) x -= TwoPI; - else if(x > HalfPI) { x -= PI; ChangeSign = 1;} - - double xe2 = x*x; - Cos = float(1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c))))); - Sin = float(x*(1. + xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s)))))); - if(ChangeSign) { Cos = -Cos; Sin = -Sin;} - } - void CosAndSin(double x, double& Cos, double& Sin) //OC02022019 - { - //x -= TwoPI*int(x*One_dTwoPI); - x -= TwoPI*((long long)(x*One_dTwoPI)); - - if(x < 0.) x += TwoPI; - - char ChangeSign=0; - if(x > ThreePIdTwo) x -= TwoPI; - else if(x > HalfPI) { x -= PI; ChangeSign = 1;} - - double xe2 = x*x; - Cos = 1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c)))); - Sin = x*(1. 
+ xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s))))); - if(ChangeSign) { Cos = -Cos; Sin = -Sin;} - } - - //void NextCorrectNumberForFFT(long long&); //OC26042019 - void NextCorrectNumberForFFT(long&); -}; - -//************************************************************************* - -struct CGenMathFFT2DInfo { - float* pData; - double* pdData; //OC31012019 - - char Dir; // >0: forward; <0: backward - double xStep, yStep, xStart, yStart; - double xStepTr, yStepTr, xStartTr, yStartTr; - long Nx, Ny; - //long long Nx, Ny; - - long howMany; //OC151014 - long iStride, iDist; //OC151014 - //From FFTW 2.1.5 Tutorial - //iStride and iDist describe the input array(s). - //There are howMany multi-dimensional input arrays; the first one is pointed to by in (= pData), - //the second one is pointed to by in + iDist, and so on, up to in + (howMany - 1) * iDist. - //Each multi-dimensional input array consists of complex numbers (see Section Data Types), - //stored in row-major format (see Section Multi-dimensional Array Format), which are not necessarily contiguous in memory. - //Specifically, in[0] is the first element of the first array, in[istride] is the second element of the first array, and so on. - //In general, the i-th element of the j-th input array will be in position in[i * istride + j * idist]. - //Note that, here, i refers to an index into the row-major format for the multi-dimensional array, rather than an index in any particular dimension. - //In-place transforms: For plans created with the FFTW_IN_PLACE option, the transform is computed in-place--the output is returned in the in array, - //using the same strides, etcetera, as were used in the input. 
- - char UseGivenStartTrValues; - double ExtraMult; //OC20112017 - - CGenMathFFT2DInfo() - { - howMany = 1; iStride = 1; iDist = 0; //OC151014 - UseGivenStartTrValues = 0; - ExtraMult = 1.; //OC20112017 - - pData = 0; //OC31012019 - pdData = 0; - } -}; - -//************************************************************************* - -class CGenMathFFT2D : public CGenMathFFT { - - long Nx, Ny; - long HalfNx, HalfNy; - //long long Nx, Ny; - //long long HalfNx, HalfNy; - char NeedsShiftBeforeX, NeedsShiftBeforeY, NeedsShiftAfterX, NeedsShiftAfterY; - //float *ArrayShiftX, *ArrayShiftY; - float *m_ArrayShiftX, *m_ArrayShiftY; //OC02022019 - double *m_dArrayShiftX, *m_dArrayShiftY; - -#ifdef _OFFLOAD_GPU - static long PlanNx, PlanNy, HowMany; - static long dPlanNx, dPlanNy, dHowMany; - static cufftHandle Plan2DFFT_cu; - static cufftHandle dPlan2DFFT_cu; -#endif - -public: - CGenMathFFT2D() - { - NeedsShiftBeforeX = NeedsShiftBeforeY = NeedsShiftAfterX = NeedsShiftAfterY = 0; -#ifdef _OFFLOAD_GPU - HowMany = PlanNx = PlanNy = dHowMany = dPlanNx = dPlanNy = 0; - Plan2DFFT_cu = dPlan2DFFT_cu = 0; -#endif - } - - //int Make2DFFT(CGenMathFFT2DInfo&); - //Modification by S.Yakubov for parallelizing SRW via OpenMP: -#ifdef _FFTW3 //28012019 - int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, void* pvGPU = 0); //OC05092023 - //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, gpuUsageArg *pGpuUsage = 0); //OC02022019 - //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0); -#else - int Make2DFFT(CGenMathFFT2DInfo&, fftwnd_plan* pPrecreatedPlan2DFFT=0); //OC27102018 -#endif - - int AuxDebug_TestFFT_Plans(); - - void SetupLimitsTr(CGenMathFFT2DInfo& FFT2DInfo) - {// Modify this if Make2DFFT is modified ! 
- Nx = FFT2DInfo.Nx; Ny = FFT2DInfo.Ny; - HalfNx = (Nx >> 1); HalfNy = (Ny >> 1); - - double xStartTr = -0.5/FFT2DInfo.xStep; - FFT2DInfo.xStepTr = -xStartTr/HalfNx; - - double yStartTr = -0.5/FFT2DInfo.yStep; - FFT2DInfo.yStepTr = -yStartTr/HalfNy; - - if(!FFT2DInfo.UseGivenStartTrValues) - { - FFT2DInfo.xStartTr = xStartTr; - FFT2DInfo.yStartTr = yStartTr; - } - } - - template void FillArrayShift(char x_or_y, double t0, double tStep, T* arShift) //OC02022019 - //void FillArrayShift(char x_or_y, double t0, double tStep) - { - T* tArrayShift = arShift; - //float* tArrayShift; - //long N; - long N = (x_or_y == 'x')? Nx : Ny; - //if(x_or_y == 'x') { tArrayShift = m_ArrayShiftX; N = Nx;} - //else { tArrayShift = m_ArrayShiftY; N = Ny;} - - T *tp = tArrayShift + N; - //float *tp = tArrayShift + N; - *tp = 1.; *(tp+1) = 0.; tp += 2; - T *tm = tp - 4; - //float *tm = tp - 4; - - double t0TwoPI = t0*TwoPI; - double q = tStep; - long HalfN = N >> 1; - for(int i=0; i void RotateDataAfter2DFFT(T* pAfterFFT, long HowMany) - //void RotateDataAfter2DFFT(fftwf_complex* pAfterFFT) - {// Assumes Nx, Ny even ! - //OC281117: Make it work for odd Nx, Ny as well! - //OC281117: Consider combining RotateDataAfter2DFFT, RepairSignAfter2DFFT, NormalizeDataAfter2DFFT - //long HalfNyNx = HalfNy*Nx; - long long HalfNyNx = ((long long)HalfNy)*((long long)Nx); - - for(long iHowMany=0; iHowManyre *= s; (t++)->im *= s; s = -s; - } - sy0 = -sy0; - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - void NormalizeDataAfter2DFFT(fftwf_complex* pAfterFFT, double Mult, long HowMany) - {// Assumes Nx, Ny even ! - //OC281117: To make it work for odd Nx, Ny as well in the future! 
- float fMult = (float)Mult; - long long NxNy = ((long long)Nx)*((long long)Ny); - for(long iHowMany=0; iHowManyre *= (FFTW_REAL)Mult; (t++)->im *= (FFTW_REAL)Mult; - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - void TreatShifts(fftwf_complex* pData, long HowMany) - { - fftwf_complex *t = pData; - char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; - char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; - - for(long iHowMany=0; iHowManyre*MultRe - t->im*MultIm; -// float NewIm = t->re*MultIm + t->im*MultRe; -// t->re = NewRe; -// (t++)->im = NewIm; -// #endif - } - } - } - } -#else - void TreatShifts(FFTW_COMPLEX* pData) - { - FFTW_COMPLEX *t = pData; - char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; - char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; - - float *tShiftY = m_ArrayShiftY; - float MultY_Re = 1., MultY_Im = 0., MultX_Re = 1., MultX_Im = 0.; - float MultRe, MultIm; - - for(long iy=0; iyre*MultRe - t->im*MultIm; - float NewIm = t->re*MultIm + t->im*MultRe; - t->re = NewRe; - (t++)->im = NewIm; - } - } - } -#endif -#ifdef _FFTW3 //OC02022019 - void TreatShifts(fftw_complex* pData, long HowMany) - { - fftw_complex *t = pData; - char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; - char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; - - for(long iHowMany=0; iHowMany0: forward; <0: backward - double xStep, xStart; - double xStepTr, xStartTr; - long Nx; - //long long Nx; - long HowMany; - //long long HowMany; - char UseGivenStartTrValue; - double MultExtra; - - char TreatSharpEdges; - double LeftSharpEdge, RightSharpEdge; - char ApplyAutoShiftAfter; - - CGenMathFFT1DInfo() - { - HowMany = 1; UseGivenStartTrValue = 0; - TreatSharpEdges = 0; - MultExtra = 1.; - ApplyAutoShiftAfter = 1; - - pInData = 0; //OC31012019 - pOutData = 0; - pdInData = 0; - pdOutData = 0; - } -}; - -//************************************************************************* - -struct CGenMathAuxDataForSharpEdgeCorr1D { - - float *ExpArrSt, 
*ExpArrFi; - double *dExpArrSt, *dExpArrFi; - - double dSt, dFi, d; - long iSt, iFi; - - char WasSetUp; - - CGenMathAuxDataForSharpEdgeCorr1D() - { - Initialize(); - } - - void Initialize() - { - ExpArrSt = ExpArrFi = 0; - dExpArrSt = dExpArrFi = 0; - - dSt = dFi = d = 0.; - iSt = iFi = 0; - WasSetUp = 0; - } - - void Dispose() - { - if(ExpArrSt != 0) delete[] ExpArrSt; - if(ExpArrFi != 0) delete[] ExpArrFi; - - if(dExpArrSt != 0) delete[] dExpArrSt; - if(dExpArrFi != 0) delete[] dExpArrFi; - - Initialize(); - } -}; - -//************************************************************************* - -class CGenMathFFT1D : public CGenMathFFT { - - long Nx; - long HalfNx; - //long long Nx; - //long long HalfNx; - char NeedsShiftBeforeX, NeedsShiftAfterX; - float *m_ArrayShiftX; - double *m_dArrayShiftX; //OC02022019 -#ifdef _OFFLOAD_GPU - static long PlanLen, HowMany; - static long dPlanLen, dHowMany; - static cufftHandle Plan1DFFT_cu; - static cufftHandle dPlan1DFFT_cu; -#endif - -public: - CGenMathFFT1D() - { - NeedsShiftBeforeX = NeedsShiftAfterX = 0; -#ifdef _OFFLOAD_GPU - PlanLen = dPlanLen = 0; - Plan1DFFT_cu = dPlan1DFFT_cu = 0; - HowMany = dHowMany = 0; -#endif - } - - int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 - int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 - -//#ifndef _OFFLOAD_GPU //OC05092023 -// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo); -// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo); -//#else -// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); -// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); -// //int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); //HG -// //int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); -//#endif - - void SetupLimitsTr(CGenMathFFT1DInfo& FFT1DInfo) - { // Modify this if Make1DFFT is modified ! 
- Nx = FFT1DInfo.Nx; - HalfNx = (Nx >> 1); - - double xStartTr = -0.5/FFT1DInfo.xStep; - FFT1DInfo.xStepTr = -xStartTr/HalfNx; - - if(!FFT1DInfo.UseGivenStartTrValue) - { - FFT1DInfo.xStartTr = xStartTr; - } - } - - template void FillArrayShift(double t0, double tStep, T* arShiftX) //OC02022019 - //void FillArrayShift(double t0, double tStep) - { - //float *tArrayShift = m_ArrayShiftX; - T *tArrayShift = arShiftX; //OC02022019 - long N = Nx; - - //float *tp = tArrayShift + N; - T *tp = tArrayShift + N; //OC02022019 - *tp = 1.; *(tp+1) = 0.; tp += 2; - //float *tm = tp - 4; - T *tm = tp - 4; - - double t0TwoPI = t0*TwoPI; - double q = tStep; - long HalfN = N >> 1; - - for(int i=0; ire*MultX_Re - tMany->im*MultX_Im; - float NewIm = tMany->re*MultX_Im + tMany->im*MultX_Re; - tMany->re = NewRe; tMany->im = NewIm; - tMany += Nx; - } - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - template void RepairSignAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 - //void RepairSignAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) - {// Assumes Nx even ! - to be improved - //OC27102018 - //SY: optimized, adopt for OpenMP -#ifdef _WITH_OMP - #pragma omp parallel for -#endif - for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; - // tMany += Nx; - // } - // } - // t++; s = -s; - //} - //OC27102018 - //SY: optimized, adopt for OpenMP -#ifdef _WITH_OMP - #pragma omp parallel for -#endif - for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; - tMany += Nx; - } - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - template void RotateDataAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 - //void RotateDataAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) - {// Assumes Nx even ! 
-#ifndef _WITH_OMP //OC27102018 - //fftwf_complex *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx; - //fftwf_complex Buf; - T *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx, Buf; - for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; - tMany += Nx; - } - } -#else //OC27102018 - //SY: adopted for OpenMP - #pragma omp parallel for - for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; - tMany += Nx; - } - } -#endif - } -#endif - - int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&, char dataType='f'); //OC02022019 - //int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); - void MakeSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); - - template void SetupSharpEdgeExpCorrArray(T* pCmpData, long AmOfPt, double x, double qStart, double qStep) //OC02022019 - //void SetupSharpEdgeExpCorrArray(float* pCmpData, long AmOfPt, double x, double qStart, double qStep) - { - const double TwoPi = 6.28318530717959; - double TwoPiX = TwoPi*x; - double q = qStart; - //float *tCmpData = pCmpData; - T *tCmpData = pCmpData; - for(long i=0; iPres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 //consider programming Angle on angular side by simple change of limits //however note potential problems for many photon energies! 
diff --git a/cpp/src/core/sroptapt.h b/cpp/src/core/sroptapt.h index 7d1032dc..e5f22dac 100644 --- a/cpp/src/core/sroptapt.h +++ b/cpp/src/core/sroptapt.h @@ -33,11 +33,13 @@ class srTAperture : public srTShapedOptElem { srTAperture () {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU) //HG30112023 { char &MethNo = ParPrecWfrPropag.MethNo; - if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG30112023 else if(MethNo == 1) return PropagateRadiationMeth_1(pRadAccessData); //else if(MethNo == 2) return PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); else if(MethNo == 2) return PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); @@ -47,11 +49,14 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf = 0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int 
PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG30112023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 if(result = PropagateRadMoments(pRadAccessData, 0)) return result; SetNewNonZeroWfrLimits(pRadAccessData); @@ -76,11 +81,14 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 SetNewNonZeroWfrLimits(pRadAccessData); return 0; diff --git a/cpp/src/core/sroptcnt.cpp b/cpp/src/core/sroptcnt.cpp index a8bb4278..9b6072b2 100644 --- a/cpp/src/core/sroptcnt.cpp +++ b/cpp/src/core/sroptcnt.cpp @@ -251,7 +251,8 @@ int 
srTCompositeOptElem::PropagateRadiationTest(srTSRWRadStructAccessData* pInRa //************************************************************************* -int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +//int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pvGPU) //HG30112023 //int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr) { //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -265,6 +266,9 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr int res = 0, elemCount = 0; bool propIntIsNeeded = (nInt != 0) && (arID != 0) && (arI != 0); //OC27082018 +#ifdef _OFFLOAD_GPU //HG30112023 + bool dataOnDevice = false; +#endif for(srTGenOptElemHndlList::iterator it = GenOptElemList.begin(); it != GenOptElemList.end(); ++it) { @@ -308,7 +312,16 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr if((::fabs(curPropResizeInst.pxd - 1.) > tolRes) || (::fabs(curPropResizeInst.pxm - 1.) > tolRes) || //(::fabs(curPropResizeInst.pzd - 1.) > tolRes) || (::fabs(curPropResizeInst.pzm - 1.) > tolRes)) (::fabs(curPropResizeInst.pzd - 1.) > tolRes) || (::fabs(curPropResizeInst.pzm - 1.) > tolRes) || (curPropResizeInst.ShiftTypeBeforeRes > 0)) //OC11072019 - if(res = RadResizeGen(wfr, curPropResizeInst)) return res; + { + //if(res = RadResizeGen(wfr, curPropResizeInst)) return res; + if(res = RadResizeGen(wfr, curPropResizeInst, pvGPU)) return res; //HG30112023 + +#ifdef _OFFLOAD_GPU //HG30112023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) { + dataOnDevice = true; + } +#endif + } //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: RadResizeGen",&start); @@ -325,14 +338,55 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: precParWfrPropag",&start); +#ifdef _OFFLOAD_GPU //HG30112023 + TGPUUsageArg* pGPU = (TGPUUsageArg*)pvGPU; + if (CAuxGPU::GPUEnabled(pGPU)) { + if (dataOnDevice && (((srTGenOptElem*)it->rep)->SupportedFeatures() & 1) == 0) + { +//#if DEBUG +// printf("Element does not support GPU, transferring to CPU.\r\n"); +//#endif + if (wfr.pBaseRadX != NULL) + wfr.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadX, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + if (wfr.pBaseRadZ != NULL) + wfr.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadZ, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + dataOnDevice = false; + } + else if (!dataOnDevice && (((srTGenOptElem*)it->rep)->SupportedFeatures() & 1) == 1) + { + dataOnDevice = true; +//#if DEBUG +// printf("Element supports GPU, transferring...\r\n"); +//#endif + } + } +#endif + srTRadResizeVect auxResizeVect; - if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect)) return res; + //if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect)) return res; + if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect, pvGPU)) return res; //HG30112023 //maybe to use "PropagateRadiationGuided" for srTCompositeOptElem? //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: PropagateRadiation",&start); - if(propIntIsNeeded) ExtractPropagatedIntensity(wfr, nInt, arID, arIM, arI, elemCount); + if(propIntIsNeeded) + { +#ifdef _OFFLOAD_GPU //HG09112022 If the data is on the GPU, transfer it to CPU and synchronize before extracting the intensity + TGPUUsageArg* pGPU = (TGPUUsageArg*)pvGPU; + if (CAuxGPU::GPUEnabled(pGPU)) { + if (dataOnDevice) + { + if (wfr.pBaseRadX != NULL) + wfr.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadX, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + if (wfr.pBaseRadZ != NULL) + wfr.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadZ, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + dataOnDevice = false; + } + } +#endif + ExtractPropagatedIntensity(wfr, nInt, arID, arIM, arI, elemCount); + } elemCount++; diff --git a/cpp/src/core/sroptcnt.h b/cpp/src/core/sroptcnt.h index 59095ccc..84f7e7d5 100644 --- a/cpp/src/core/sroptcnt.h +++ b/cpp/src/core/sroptcnt.h @@ -34,7 +34,8 @@ class srTCompositeOptElem : public srTGenOptElem { srTCompositeOptElem() {} int PropagateRadiationTest(srTSRWRadStructAccessData*, srTSRWRadStructAccessData*); - int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 + int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pvGPU=0); //HG01122023 + //int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr); int ExtractPropagatedIntensity(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, int elCnt, int indIntSartSearch=0); //27082018 @@ -47,7 +48,8 @@ class srTCompositeOptElem : public srTGenOptElem { GenOptElemList.push_back(OptElemHndl); } - int 
PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG01122023 { int AmOfElem = (int)GenOptElemList.size(); //OC110104 int ElemCount = 0; //OC110104 @@ -65,7 +67,8 @@ class srTCompositeOptElem : public srTGenOptElem { } //if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, MethNo, ResizeBeforeAndAfterVect)) return result; - if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect)) return result; + //if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect)) return result; + if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect, pvGPU)) return result; //HG01122023 } ParPrecWfrPropag.UseResAfter = GenUseResAfter; //OC110104 return 0; diff --git a/cpp/src/core/sroptcryst.h b/cpp/src/core/sroptcryst.h index fd25308e..af6f535f 100644 --- a/cpp/src/core/sroptcryst.h +++ b/cpp/src/core/sroptcryst.h @@ -943,7 +943,8 @@ class srTOptCryst : public srTGenOptElem { return 0; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& 
ResBeforeAndAfterVect, void* pvGPU) //virtual in srTGenOptElem //HG01122023 { m_eStartAux = pRadAccessData->eStart; m_eStepAux = pRadAccessData->eStep; m_ne = pRadAccessData->ne; //required for RadPointModifier @@ -967,7 +968,8 @@ class srTOptCryst : public srTGenOptElem { } //return PropagateRadiationMeth_0(pRadAccessData); - return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + //return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pvGPU); //HG01122023 } //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem @@ -977,7 +979,8 @@ class srTOptCryst : public srTGenOptElem { //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG01122023 {//It works for many photon energies too (as in the case of Drift) //The "in-place" processing involving FFT for many photon energies greatly improves efficiency of the code for Time-/Frequency-Dependent simulations for FEL and pulsed lasers. 
int result; diff --git a/cpp/src/core/sroptdrf.cpp b/cpp/src/core/sroptdrf.cpp index e4018c6f..27246d8b 100644 --- a/cpp/src/core/sroptdrf.cpp +++ b/cpp/src/core/sroptdrf.cpp @@ -352,7 +352,8 @@ int srTDriftSpace::PropagateRadiationMeth_1(srTSRWRadStructAccessData* pRadAcces //int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {// e in eV; Length in m !!! int result; @@ -365,7 +366,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat SetupPropBufVars_PropToWaist(pRadAccessData, &BufVars); //SetupPropBufVars_PropToWaist(pRadAccessData); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //pBufVars->PassNo = 1; //OC06092019 //OC01102019 (restored) @@ -373,7 +375,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //PropBufVars.PassNo = 1; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC29082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC29082019 //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC240114 (commented-out) @@ 
-402,7 +405,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //To remove this? srTDataPtrsForWfrEdgeCorr DataPtrsForWfrEdgeCorr; - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG30112023 + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; #if !defined(_FFTW3) && defined(_WITH_OMP) //OC29082019 //OC04062020 @@ -423,9 +427,11 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat #else //OCTEST01102019: commented-out the above (to see if this will fix problem of TD calcs) FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //FFT2DInfo.pData = pRadAccessData->pBaseRadX; @@ -436,7 +442,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //To remove this? 
if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU); //HG30112023 DataPtrsForWfrEdgeCorr.DisposeData(); } @@ -455,7 +462,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //PropBufVars.PassNo = 2; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC19032022 @@ -479,7 +487,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat } //************************************************************************* -int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData) //OC10112019 +//int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData) //OC10112019 +int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData, void *pvGPU) //HG30112023 {// e in eV; Length in m !!! 
int result = 0; @@ -488,7 +497,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru srTDriftPropBufVars BufVars; SetupPropBufVars_PropToWaistBeyondParax(pRadAccessData, &BufVars); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 pRadAccessData->TreatQuadPhaseTerm('r'); //OC17122019 //pRadAccessData->TreatQuadPhaseTermTerm('r'); @@ -509,7 +519,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru //pRadAccessData->xStart = (pRadAccessData->xStart)*InvLambdaM_d_Rx; //pRadAccessData->zStart = (pRadAccessData->zStart)*InvLambdaM_d_Rz; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 CGenMathFFT2DInfo FFT2DInfo; FFT2DInfo.xStep = pRadAccessData->xStep; @@ -547,9 +558,11 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru #else //OCTEST01102019: commented-out the above (to see if this will fix problem of TD calcs) FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //To remove this? 
@@ -597,7 +610,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru //int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {//Should be very similar to PropagateRadiationSimple_PropToWaist, consider merging int result = 0; @@ -607,7 +621,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //OC01102019 (restored) SetupPropBufVars_PropFromWaist(pRadAccessData, &BufVars); //SetupPropBufVars_PropFromWaist(pRadAccessData); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //OC30082019: commented-out: not needed here, since it is set in ChooseLocalPropMode(...); is it thread-safe? //LocalPropMode = 2; // prop. 
from waist @@ -616,7 +631,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC01102019 (restored) BufVars.PassNo = 1; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //OC06092019 //pBufVars->PassNo = 1; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; @@ -638,7 +654,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //OCTEST (commented-out "edge correction") //OC01102019 (uncommented) srTDataPtrsForWfrEdgeCorr DataPtrsForWfrEdgeCorr; - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG30112023 CGenMathFFT2D FFT2D; @@ -666,16 +683,19 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //} #else FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //OCTEST (commented-out "edge correction") //OC01102019 (uncommented) 
if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU); //HG30112023 DataPtrsForWfrEdgeCorr.DisposeData(); } @@ -689,7 +709,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC01102019 (restored) BufVars.PassNo = 2; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //OC06092019 //pBufVars->PassNo = 2; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; @@ -701,7 +722,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {// e in eV; Length in m !!! int result = 0; @@ -720,7 +742,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetupPropBufVars_AnalytTreatQuadPhaseTerm",&start); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 1",&start); @@ -731,7 +754,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 1; //Remove quadratic term from the Phase in coord. repres. //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -750,7 +774,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt pRadAccessData->WfrEdgeCorrShouldBeDone = 0; - if(result = SetRadRepres(pRadAccessData, 1)) return result; //To angular repres. + //if(result = SetRadRepres(pRadAccessData, 1)) return result; //To angular repres. + if(result = SetRadRepres(pRadAccessData, 1, 0, 0, pvGPU)) return result; //To angular repres. //HG30112023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 2",&start); @@ -761,7 +786,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 2; //Loop in angular repres. //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -773,7 +799,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt pRadAccessData->zStartTr += zShift; } - if(result = SetRadRepres(pRadAccessData, 0)) return result; //Back to coord. repres. + //if(result = SetRadRepres(pRadAccessData, 0)) return result; //Back to coord. repres. + if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //Back to coord. repres. //HG30112023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 3",&start); @@ -816,7 +843,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 3; //Add new quadratic term to the Phase in coord. repres. 
//if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: diff --git a/cpp/src/core/sroptdrf.h b/cpp/src/core/sroptdrf.h index 01b03722..a6a16d20 100644 --- a/cpp/src/core/sroptdrf.h +++ b/cpp/src/core/sroptdrf.h @@ -90,6 +90,7 @@ class srTDriftSpace : public srTGenOptElem { double Length; //OC06092019 (commented-out) //srTDriftPropBufVars PropBufVars; + int SupportedFeatures() override { return 1; } //HG01122023 Returns 1 if the element supports GPU propagation srTDriftSpace(double InLength =0., char InTreatPath =0) { @@ -109,7 +110,8 @@ class srTDriftSpace : public srTGenOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResizeBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG01122023 { //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //double start; @@ -159,7 +161,8 @@ class srTDriftSpace : public srTGenOptElem { //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, &BufVars); //OC06092019 //OC01102019 (restored) - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG01122023 else if(MethNo == 1) result = PropagateRadiationMeth_1(pRadAccessData); else if(MethNo == 2) result = PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect); @@ -175,12 +178,14 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG01122023 {//it works for many photon energies too! 
int result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 //OC01102019 (restored) - if(result = PropagateRadiationSimple(pRadAccessData)) return result; + //if(result = PropagateRadiationSimple(pRadAccessData)) return result; + if(result = PropagateRadiationSimple(pRadAccessData, pvGPU)) return result; //HG01122023 if(result = PropagateRadMoments(pRadAccessData, 0)) return result; if(result = PropagateWaveFrontRadius(pRadAccessData)) return result; if(result = Propagate4x4PropMatr(pRadAccessData)) return result; @@ -189,7 +194,8 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf) //OC06092019 //OC01102019 (restored) - int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem + //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem + int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //virtual in srTGenOptElem //HG01122023 {//because for the Drift, the following works for many photon energies too! 
//return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); //OC251214 @@ -198,7 +204,8 @@ class srTDriftSpace : public srTGenOptElem { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //if((pBufVars->LocalPropMode == 0) || (pBufVars->LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pBuf); //OC06092019 //OC01102019 (restored) - if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + //if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pvGPU); //HG01122023 else { pRadAccessData->SetNonZeroWavefrontLimitsToFullRange(); @@ -304,20 +311,26 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //char LocalPropMode = pBufVars->LocalPropMode; //OC06092019 //OC01102019 (commented-out / restored) - if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData); + //if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData); + if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData, pvGPU); //HG01122023 //OC01102019 (restored) - else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData); + //else if(LocalPropMode == 1) return 
PropagateRadiationSimple_PropToWaist(pRadAccessData); + else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData, pvGPU); //HG01122023 - else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData); //OC10112019 + //else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData); //OC10112019 + else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData, pvGPU); //OC10112019 //HG01122023 - else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData); //OC240114 (added) - else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData); + //else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData); //OC240114 (added) + else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData, pvGPU); //OC240114 (added) //HG01122023 + //else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData); + else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData, pvGPU); //HG01122023 //OC06092019 //else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData, pBufVars); //else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData, pBufVars); //OC240114 (added) @@ -329,7 +342,8 @@ class srTDriftSpace : public srTGenOptElem { else return 0; } - int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 { //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //double start; @@ -351,13 +365,15 @@ class srTDriftSpace : public srTGenOptElem { if(pRadAccessData->Pres != 1) { - if(result = SetRadRepres(pRadAccessData, 1)) return result; + //if(result = SetRadRepres(pRadAccessData, 1)) return result; + if(result = SetRadRepres(pRadAccessData, 1, 0, 0, pvGPU)) return result; //HG01122023 } //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:SetRadRepres 1",&start); - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG01122023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:TraverseRadZXE",&start); @@ -368,7 +384,8 @@ class srTDriftSpace : public srTGenOptElem { pRadAccessData->zStartTr += zShift; } - if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG01122023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:SetRadRepres 2",&start); @@ -390,11 +407,15 @@ class srTDriftSpace : public srTGenOptElem { } //OC01102019 (restored) - int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData); - int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData); //OC10112019 - - int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData); - int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData); + //int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 + //int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData); //OC10112019 + int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //OC10112019 //HG01122023 + + //int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 + //int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 //OC06092019 //int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars=0); //int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars=0); @@ -553,6 +574,16 @@ class srTDriftSpace : public srTGenOptElem { void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //OC29082019 //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) + { + RadPointModifierPortable(EXZ, EPtrs, 
pBuf); //HG01122023 + } + +#ifdef _OFFLOAD_GPU //HG01122023 + int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pGPU=0) override; + + GPU_PORTABLE +#endif + void RadPointModifierPortable(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //HG01122023 { srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //char LocalPropMode = pBufVars->LocalPropMode; @@ -568,6 +599,9 @@ class srTDriftSpace : public srTGenOptElem { //else if(LocalPropMode == 3) { RadPointModifier_AnalytTreatQuadPhaseTerm(EXZ, EPtrs); return;} } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_AngRepres(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {// e in eV; Length in m !!! // Operates on Angles side !!! @@ -599,6 +633,9 @@ class srTDriftSpace : public srTGenOptElem { *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropToWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC29082019 //void RadPointModifier_PropToWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) { @@ -661,6 +698,9 @@ class srTDriftSpace : public srTGenOptElem { } } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropToWaistBeyondParax(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC10112019 { double rx = EXZ.x, rz = EXZ.z; @@ -695,6 +735,9 @@ class srTDriftSpace : public srTGenOptElem { *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropFromWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC30082019 //void RadPointModifier_PropFromWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) { @@ -742,6 +785,9 @@ class srTDriftSpace : public srTGenOptElem { } } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_AnalytTreatQuadPhaseTerm(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, 
srTDriftPropBufVars* pBufVars) //OC30082019 //void RadPointModifier_AnalytTreatQuadPhaseTerm(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {//don't use RobsX, RobsZ directly here! diff --git a/cpp/src/core/sroptdrf_gpu.cu b/cpp/src/core/sroptdrf_gpu.cu new file mode 100644 index 00000000..7d98fa8a --- /dev/null +++ b/cpp/src/core/sroptdrf_gpu.cu @@ -0,0 +1,29 @@ +/************************************************************************//** + * File: sroptdrf_gpu.cu + * Description: Optical element: Drift space (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" + +#include +#include +#include +#include "sroptdrf.h" + +//Implementation of the RadPointModifier's GPU function for the srTDriftSpace class +int srTDriftSpace::RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, TGPUUsageArg *pGpu) +{ + return RadPointModifierParallelImpl(pRadAccessData, pBufVars, pBufVarsSz, this, pGpu); +} //HG03092022 +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptel2.cpp b/cpp/src/core/sroptel2.cpp index 0c426d5d..73d11027 100644 --- a/cpp/src/core/sroptel2.cpp +++ b/cpp/src/core/sroptel2.cpp @@ -37,7 +37,8 @@ double srTGenOptElem::CheckMemoryAvailable() //int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf) //OC06092019 //OC01102019 (restored) -int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) +//int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) +int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void *pvGPU) //HG30112023 
{//Moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 //This propagation method doesn't allow for true wavefront "resizing/resampling" //(which results in changing numbers of points) in "slices" vs photon energy. diff --git a/cpp/src/core/sroptelm.cpp b/cpp/src/core/sroptelm.cpp index 341d0bae..bde700d1 100644 --- a/cpp/src/core/sroptelm.cpp +++ b/cpp/src/core/sroptelm.cpp @@ -30,6 +30,10 @@ #include "sropthck.h" #include "sroptgrat.h" +#ifdef _OFFLOAD_GPU //HG01122023 +#include "auxgpu.h" +#endif + #ifdef _WITH_OMP //Pre-processor definition for compiling with OpenMP library #include "omp.h" #endif @@ -146,7 +150,8 @@ int srTGenOptElem::ExtraDataExpected(const char* sElemID) //OC01062020 //************************************************************************* -int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars) //OC29082019 +//int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars) //OC29082019 +int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, void* pvGPU) //OC29082019 //HG01122023 //int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData) { float *pEx0 = pRadAccessData->pBaseRadX; @@ -156,6 +161,15 @@ int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, voi long long PerX = pRadAccessData->ne << 1; long long PerZ = PerX*pRadAccessData->nx; +#ifdef _OFFLOAD_GPU //HG01122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + if (RadPointModifierParallel(pRadAccessData, pBufVars, pBufVarsSz, (TGPUUsageArg*)pvGPU) == -1) //Try to call the GPU version, if it fails, call the CPU version + return TraverseRadZXE(pRadAccessData, pBufVars, pBufVarsSz, NULL); + return 0; + } +#endif + #ifndef _WITH_OMP //OC28102018 srTEFieldPtrs EFieldPtrs; @@ -731,7 +745,8 @@ int srTGenOptElem::RemoveSliceConstE_FromGenRadStruct(srTSRWRadStructAccessData* 
//************************************************************************* -int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr) +//int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr) +int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr, void* pvGPU) //HG01122023 { int result; @@ -849,7 +864,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->zStart; FFT1DInfo.Nx = pRadAccessData->nz; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dxFi != 0.) { @@ -889,7 +905,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->zStart; FFT1DInfo.Nx = pRadAccessData->nz; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dzSt != 0.) { @@ -913,7 +930,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->xStart; FFT1DInfo.Nx = pRadAccessData->nx; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dzFi != 0.) 
{ @@ -936,7 +954,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->xStart; FFT1DInfo.Nx = pRadAccessData->nx; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } DataPtrsForWfrEdgeCorr.WasSetup = 1; } @@ -1015,8 +1034,18 @@ int srTGenOptElem::SetupWfrEdgeCorrData1D(srTRadSect1D* pRadSect1D, float* pData //************************************************************************* -void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs) +//void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs) +void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, void* pvGPU) //HG01122023 { + //HG23082022 Use GPU if requested +#ifdef _OFFLOAD_GPU + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + MakeWfrEdgeCorrection_GPU(pRadAccessData, pDataEx, pDataEz, DataPtrs, (TGPUUsageArg*)pvGPU); + return; + } +#endif + float *tEx = pDataEx, *tEz = pDataEz; double dxSt_dzSt = DataPtrs.dxSt*DataPtrs.dzSt; @@ -1204,7 +1233,8 @@ void srTGenOptElem::MakeWfrEdgeCorrection1D(srTRadSect1D* pRadSect1D, float* pDa //************************************************************************* //int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng) -int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, double* ar_zStartInSlicesE) +//int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, 
double* ar_zStartInSlicesE) +int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, double* ar_zStartInSlicesE, void* pvGPU) //HG01122023 {// 0- to coord.; 1- to ang. int result; @@ -1247,7 +1277,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(CoordOrAng == 1) { - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG01122023 } } @@ -1255,9 +1286,11 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char if(ar_zStartInSlicesE != 0) FFT2DInfo.yStart = *ar_zStartInSlicesE; FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 if(WfrEdgeCorrShouldBeTreated) { @@ -1265,7 +1298,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, 
DataPtrsForWfrEdgeCorr, pvGPU); //HG01122023 DataPtrsForWfrEdgeCorr.DisposeData(); } } @@ -1309,7 +1343,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(CoordOrAng == 1) { - if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG01122023 } } @@ -1318,9 +1353,11 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char if(ar_zStartInSlicesE != 0) FFT2DInfo.yStart = ar_zStartInSlicesE[ie]; FFT2DInfo.pData = AuxEx; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 FFT2DInfo.pData = AuxEz; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 if(WfrEdgeCorrShouldBeTreated) { @@ -1328,7 +1365,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr, pvGPU); //HG01122023 DataPtrsForWfrEdgeCorr.DisposeData(); } } @@ -2182,7 +2220,8 @@ void srTGenOptElem::FindMinMaxRatio(double* Arr1, double* Arr2, int n, double& M //************************************************************************* -int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct) +//int 
srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct) +int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct, void* pvGPU) //HG01122023 { //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //double start; @@ -2257,7 +2296,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat SRWRadStructAccessData.zWfrMin += zShift; SRWRadStructAccessData.zWfrMax += zShift; } - if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres)) return result; + //if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres)) return result; + if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres, 0, 0, pvGPU)) return result; //HG01122023 double pxmNew = RadResizeStruct.pxd, pxdNew = RadResizeStruct.pxm; double pzmNew = RadResizeStruct.pzd, pzdNew = RadResizeStruct.pzm; @@ -2537,7 +2577,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: copydata",&start); - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 0, pvGPU)) return result; //HG01122023 if(OldRadXCopy != 0) delete[] OldRadXCopy; if(OldRadZCopy != 0) delete[] OldRadZCopy; @@ -2602,7 +2643,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //Added by SY (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: TreatPolarizSepar-PrepareStructs",&start); - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 0, pvGPU)) return result; //HG01122023 //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: RadResizeCore 2",&start); @@ -2662,7 +2704,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat *(tBaseRadX++) = 0.; } SRWRadStructAccessData.pBaseRadX = OldRadXCopy; - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x')) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x')) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x', pvGPU)) return result; //HG01122023 if(OldRadXCopy != 0) delete[] OldRadXCopy; } //Added by SY (for profiling?) at parallelizing SRW via OpenMP: @@ -2698,7 +2741,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat *(tBaseRadZ++) = 0.; } SRWRadStructAccessData.pBaseRadZ = OldRadZCopy; - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z')) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z')) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z', pvGPU)) return result; //HG01122023 if(OldRadZCopy != 0) delete[] OldRadZCopy; } //Added by SY (for profiling?) 
at parallelizing SRW via OpenMP: @@ -2734,7 +2778,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //for(long j=0; j NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; - } + //SY: do we need this (always returns 0, updates some clock) + //if(result = srYield.Check()) return result; - int izcOld = int((zAbs - OldRadAccessData.zStart)*zStepInvOld + 1.E-06); + double zAbs = NewRadAccessData.zStart + iz*NewRadAccessData.zStep; - double zRel = zAbs - (OldRadAccessData.zStart + izcOld*OldRadAccessData.zStep); + char FieldShouldBeZeroedDueToZ = 0; + if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if((zAbs < NewRadAccessData.zWfrMin - DistAbsTol) || (zAbs > NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; + } - if(izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2.*OldRadAccessData.zStep;} - else if(izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep;} - else if(izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep;} - else izStOld = izcOld - 1; + int izcOld = int((zAbs - OldRadAccessData.zStart)*zStepInvOld + 1.E-06); - zRel *= zStepInvOld; + double zRel = zAbs - (OldRadAccessData.zStart + izcOld*OldRadAccessData.zStep); - int izcOld_mi_izStOld = izcOld - izStOld; - //long izPerZ_New = iz*PerZ_New; - long long izPerZ_New = iz*PerZ_New; + if(izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2.*OldRadAccessData.zStep;} + else if(izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep;} + else if(izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep;} + else izStOld = izcOld - 1; - float *pEX_StartForX_New = 0, *pEZ_StartForX_New = 0; - if(TreatPolCompX) pEX_StartForX_New = pEX0_New + izPerZ_New; - if(TreatPolCompZ) pEZ_StartForX_New = pEZ0_New + izPerZ_New; + zRel *= zStepInvOld; - for(int ix=ixStart; ix<=ixEnd; ix++) - { - //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; - long long ixPerX_New_p_Two_ie = ix*PerX_New 
+ Two_ie; - float *pEX_New = 0, *pEZ_New = 0; - if(TreatPolCompX) pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; - if(TreatPolCompZ) pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; + int izcOld_mi_izStOld = izcOld - izStOld; + //long izPerZ_New = iz*PerZ_New; + long long izPerZ_New = iz*PerZ_New; - double xAbs = NewRadAccessData.xStart + ix*NewRadAccessData.xStep; + float *pEX_StartForX_New = 0, *pEZ_StartForX_New = 0; + if(TreatPolCompX) pEX_StartForX_New = pEX0_New + izPerZ_New; + if(TreatPolCompZ) pEZ_StartForX_New = pEZ0_New + izPerZ_New; - char FieldShouldBeZeroedDueToX = 0; - if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + for(int ix=ixStart; ix<=ixEnd; ix++) { - if((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; - } - char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || FieldShouldBeZeroedDueToZ); + //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + long long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + float *pEX_New = 0, *pEZ_New = 0; + if(TreatPolCompX) pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; + if(TreatPolCompZ) pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; - int ixcOld = int((xAbs - OldRadAccessData.xStart)*xStepInvOld + 1.E-06); - double xRel = xAbs - (OldRadAccessData.xStart + ixcOld*OldRadAccessData.xStep); + double xAbs = NewRadAccessData.xStart + ix*NewRadAccessData.xStep; - if(ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2.*OldRadAccessData.xStep;} - else if(ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep;} - else if(ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep;} - else ixStOld = ixcOld - 1; + char FieldShouldBeZeroedDueToX = 0; + if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; + } + char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || 
FieldShouldBeZeroedDueToZ); - xRel *= xStepInvOld; + int ixcOld = int((xAbs - OldRadAccessData.xStart)*xStepInvOld + 1.E-06); + double xRel = xAbs - (OldRadAccessData.xStart + ixcOld*OldRadAccessData.xStep); - int ixcOld_mi_ixStOld = ixcOld - ixStOld; + if(ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2.*OldRadAccessData.xStep;} + else if(ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep;} + else if(ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep;} + else ixStOld = ixcOld - 1; - if((izStOld != izStOldPrev) || (ixStOld != ixStOldPrev)) - { - UseLowOrderInterp_PolCompX = 0; UseLowOrderInterp_PolCompZ = 0; + xRel *= xStepInvOld; - //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; - long long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + int ixcOld_mi_ixStOld = ixcOld - ixStOld; - if(TreatPolCompX) + if((izStOld != izStOldPrev) || (ixStOld != ixStOldPrev)) { - float* pExSt_Old = OldRadAccessData.pBaseRadX + TotOffsetOld; - GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + UseLowOrderInterp_PolCompX = 0; UseLowOrderInterp_PolCompZ = 0; - SetupCellDataI(AuxF, AuxFI); - UseLowOrderInterp_PolCompX = CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + long long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; - if(!UseLowOrderInterp_PolCompX) + if(TreatPolCompX) { - for(int i=0; i<2; i++) + float* pExSt_Old = OldRadAccessData.pBaseRadX + TotOffsetOld; + GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + + SetupCellDataI(AuxF, AuxFI); + UseLowOrderInterp_PolCompX = CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + + if(!UseLowOrderInterp_PolCompX) { - SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + for(int i=0; i<2; i++) + 
{ + SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + } + SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); } - SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); } - } - if(TreatPolCompZ) - { - float* pEzSt_Old = OldRadAccessData.pBaseRadZ + TotOffsetOld; - GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF+2); + if(TreatPolCompZ) + { + float* pEzSt_Old = OldRadAccessData.pBaseRadZ + TotOffsetOld; + GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF+2); - SetupCellDataI(AuxF+2, AuxFI+1); - UseLowOrderInterp_PolCompZ = CheckForLowOrderInterp(AuxF+2, AuxFI+1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02+2, InterpolAux02I+1); + SetupCellDataI(AuxF+2, AuxFI+1); + UseLowOrderInterp_PolCompZ = CheckForLowOrderInterp(AuxF+2, AuxFI+1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02+2, InterpolAux02I+1); - if(!UseLowOrderInterp_PolCompZ) - { - for(int i=0; i<2; i++) + if(!UseLowOrderInterp_PolCompZ) { - SetupInterpolAux02(AuxF+2+i, &InterpolAux01, InterpolAux02+2+i); + for(int i=0; i<2; i++) + { + SetupInterpolAux02(AuxF+2+i, &InterpolAux01, InterpolAux02+2+i); + } + SetupInterpolAux02(AuxFI+1, &InterpolAux01, InterpolAux02I+1); } - SetupInterpolAux02(AuxFI+1, &InterpolAux01, InterpolAux02I+1); } - } - ixStOldPrev = ixStOld; izStOldPrev = izStOld; - } - - if(TreatPolCompX) - { - if(UseLowOrderInterp_PolCompX) - { - InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); - InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + ixStOldPrev = ixStOld; izStOldPrev = izStOld; } - else + + if(TreatPolCompX) { - InterpolF(InterpolAux02, xRel, zRel, BufF, 0); - InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); - } + if(UseLowOrderInterp_PolCompX) + { + InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); + InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + } + else + { + InterpolF(InterpolAux02, xRel, zRel, BufF, 0); + InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); + 
} - (*BufFI) *= AuxFI->fNorm; - ImproveReAndIm(BufF, BufFI); + (*BufFI) *= AuxFI->fNorm; + ImproveReAndIm(BufF, BufFI); - if(FieldShouldBeZeroed) - { - *BufF = 0.; *(BufF+1) = 0.; - } + if(FieldShouldBeZeroed) + { + *BufF = 0.; *(BufF+1) = 0.; + } - *pEX_New = *BufF; - *(pEX_New+1) = *(BufF+1); - } - if(TreatPolCompZ) - { - if(UseLowOrderInterp_PolCompZ) - { - InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); - InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + *pEX_New = *BufF; + *(pEX_New+1) = *(BufF+1); } - else + if(TreatPolCompZ) { - InterpolF(InterpolAux02, xRel, zRel, BufF, 2); - InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); - } + if(UseLowOrderInterp_PolCompZ) + { + InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); + InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + } + else + { + InterpolF(InterpolAux02, xRel, zRel, BufF, 2); + InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); + } - (*(BufFI+1)) *= (AuxFI+1)->fNorm; - ImproveReAndIm(BufF+2, BufFI+1); + (*(BufFI+1)) *= (AuxFI+1)->fNorm; + ImproveReAndIm(BufF+2, BufFI+1); - if(FieldShouldBeZeroed) - { - *(BufF+2) = 0.; *(BufF+3) = 0.; - } + if(FieldShouldBeZeroed) + { + *(BufF+2) = 0.; *(BufF+3) = 0.; + } - *pEZ_New = *(BufF+2); - *(pEZ_New+1) = *(BufF+3); + *pEZ_New = *(BufF+2); + *(pEZ_New+1) = *(BufF+3); + } } } } @@ -3089,7 +3147,8 @@ int srTGenOptElem::RadResizeCore(srTSRWRadStructAccessData& OldRadAccessData, sr //sprintf(str,"%s %d",":RadResizeCore: cycles:",NewRadAccessData.ne); //srwlPrintTime(str,&start); - if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp); + //if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp); + if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp, -1, pvGPU); //HG01122023 //OC31102018: added by SY (for profiling?) 
at parallelizing SRW via OpenMP //srwlPrintTime(":RadResizeCore: TreatStronglyOscillatingTerm 2",&start); @@ -4503,7 +4562,8 @@ char srTGenOptElem::WaveFrontTermCanBeTreated(srTSRWRadStructAccessData& RadAcce //************************************************************************* -void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly) +//void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly) +void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly, void* pvGPU) //HG01122023 { //Later treat X and Z coordinates separately here!!! @@ -4634,6 +4694,14 @@ void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadA ieStart = ieOnly; ieBefEnd = ieOnly + 1; } +#ifdef _OFFLOAD_GPU //HG01122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + TreatStronglyOscillatingTerm_GPU(RadAccessData, TreatPolCompX, TreatPolCompZ, ConstRx, ConstRz, ieStart, ieBefEnd, (TGPUUsageArg*)pvGPU); + return; + } +#endif + #ifdef _WITH_OMP //OC31102018: added by SY at parallelizing SRW via OpenMP #pragma omp parallel for #endif diff --git a/cpp/src/core/sroptelm.h b/cpp/src/core/sroptelm.h index 4e5c9445..a4ea6919 100644 --- a/cpp/src/core/sroptelm.h +++ b/cpp/src/core/sroptelm.h @@ -17,6 +17,7 @@ #include //required by some (buggy?) version of GCC #include //required? 
+ #include "gmtrans.h" #include "gmvect.h" @@ -43,6 +44,11 @@ #endif #endif +#ifdef _OFFLOAD_GPU +#include "auxgpu.h" +#include "sroptelm_gpu.h" +#endif + //************************************************************************* extern srTIntVect gVectWarnNos; @@ -119,7 +125,10 @@ class srTGenOptElem : public CGenObject { #endif } - virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&) { return 0;} + virtual int SupportedFeatures() { return 0; } //HG01122023 0=CPU only, 1=GPU supported + + //virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&) { return 0;} + virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&, void* pvGPU=0) { return 0;} //HG01122023 virtual int PropagateRadMoments(srTSRWRadStructAccessData*, srTMomentsRatios*) { return 0;} virtual int PropagateWaveFrontRadius(srTSRWRadStructAccessData*) { return 0;} @@ -128,16 +137,21 @@ class srTGenOptElem : public CGenObject { //virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*, void* pBuf=0) { return 0;} //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*) { return 0;} + //virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*) { return 0;} + virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*, void* pvGPU=0) { return 0;} //HG01122023 virtual int PropagateRadiationSimple1D(srTRadSect1D*) { return 0;} //virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData, void* pBuf=0) { return 0;} //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData) { return 0;} + //virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData) { return 0;} + virtual 
int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData, void* pvGPU=0) { return 0;} //HG01122023 virtual int RangeShouldBeAdjustedAtPropag() { return 1;} virtual int ResolutionShouldBeAdjustedAtPropag() { return 1;} +#ifdef _OFFLOAD_GPU //HG01122023 + virtual int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pvGPU=0) { return -1; } +#endif virtual void RadPointModifier(srTEXZ&, srTEFieldPtrs&, void* pBufVars=0) {} //OC29082019 //virtual void RadPointModifier(srTEXZ&, srTEFieldPtrs&) {} virtual void RadPointModifier1D(srTEXZ&, srTEFieldPtrs&, void* pBufVars=0) {}//OC06092019 @@ -182,7 +196,8 @@ class srTGenOptElem : public CGenObject { //virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0); //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 + //virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 + virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 //HG01122023 void FindWidestWfrMeshParam(vector& vRadSlices, srTSRWRadStructAccessData* pRad, bool keepConstNumPoints); int ReInterpolateWfrDataOnNewTransvMesh(vector& vRadSlices, srTSRWRadStructAccessData* pAuxRadSingleE, srTSRWRadStructAccessData* pRadRes); @@ -236,7 +251,8 @@ class srTGenOptElem : public CGenObject { int FillOutRadFromInRad(srTSRWRadStructAccessData*, srTSRWRadStructAccessData*); - int TraverseRadZXE(srTSRWRadStructAccessData*, void* pBufVars=0); //OC29082019 + int TraverseRadZXE(srTSRWRadStructAccessData*, void* 
pBufVars=0, long pBufVarsSz=0, void* pvGPU=0); //OC29082019 //HG01122023 + //int TraverseRadZXE(srTSRWRadStructAccessData*, void* pBufVars=0); //OC29082019 //int TraverseRadZXE(srTSRWRadStructAccessData*); int TraverseRad1D(srTRadSect1D*, void* pBufVars=0); //OC29082019 //int TraverseRad1D(srTRadSect1D*); @@ -258,41 +274,73 @@ class srTGenOptElem : public CGenObject { int RemoveSliceConstE_FromGenRadStruct(srTSRWRadStructAccessData*, long); //int SetRadRepres(srTSRWRadStructAccessData*, char); - int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0); + //int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0); + int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0, void* pvGPU=0); //HG01122023 int SetRadRepres1D(srTRadSect1D*, char); - int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); + int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023 + //int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); //inline void SetupExpCorrArray(float*, long, double, double, double); inline void SetupExpCorrArray(float*, long long, double, double, double); - void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); + void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023 + //void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); +#ifdef _OFFLOAD_GPU //HG01122023 + void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU); +#endif int SetupWfrEdgeCorrData1D(srTRadSect1D*, float*, 
float*, srTDataPtrsForWfrEdgeCorr1D&); void MakeWfrEdgeCorrection1D(srTRadSect1D*, float*, float*, srTDataPtrsForWfrEdgeCorr1D&); int ComputeRadMoments(srTSRWRadStructAccessData*); - int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&); + int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&, void* pvGPU=0); //HG01122023 + //int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&); int RadResizeGenE(srTSRWRadStructAccessData&, srTRadResize&); - int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); + int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0, void* =0); //HG01122023 + //int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); +#ifdef _OFFLOAD_GPU //HG01122023 + int RadResizeCore_GPU(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, char =0, TGPUUsageArg* =0); +#endif int RadResizeCoreE(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); int RadResizeCore_OnlyLargerRange(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, srTRadResize& RadResizeStruct, char PolComp); int RadResizeCore_OnlyLargerRangeE(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, srTRadResize& RadResizeStruct, char PolComp); //inline void GetCellDataForInterpol(float*, long long , long long, srTInterpolAuxF*); +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif inline static void GetCellDataForInterpol(float*, long long, long long, srTInterpolAuxF*); //OC02022020 //inline void SetupCellDataI(srTInterpolAuxF*, srTInterpolAuxF*); +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif inline static void SetupCellDataI(srTInterpolAuxF*, srTInterpolAuxF*); //OC02022020 //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&); //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=true); //OC06012017 
(uncommented after some fixes in bool srTSRWRadStructAccessData::CheckIfQuadTermTreatIsBenefit(char, char)) //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=false); //OC05012017 (changed to checkBenefit=false to resolve problem of resizing in near field at strong under-sampling) char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=false); //OC29032017 (changed again to checkBenefit=false to resolve problem of resizing of wiggler radiation at strong under-sampling, the ELETTRA SCW case) - void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1); + //void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1); + void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1, void* pvGPU=0); //HG01122023 +#ifdef _OFFLOAD_GPU //HG01122023 + void TreatStronglyOscillatingTerm_GPU(srTSRWRadStructAccessData& RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart, int ieBefEnd, TGPUUsageArg* pGPU); +#endif //void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, float*, float, float, float, float, char, char =0, int =-1); void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, double*, double, double, double, double, char, char =0, int =-1); //OC260114 //void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, double*, double, double, double, double, char, char =0, int =-1, double =1, double =1); //OC220214 void TreatStronglyOscillatingTermIrregMeshTrf(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, double CrdTrf[2][3], char PolComp =0, int ieOnly =-1); //OC27122020 +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE inline static void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); //OC02022020 + GPU_PORTABLE inline static void SetupInterpolAux02_LowOrder(srTInterpolAuxF*, srTInterpolAux01*, 
srTInterpolAux02*); //OC02022020 + GPU_PORTABLE inline static void InterpolF(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolFI(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolF_LowOrder(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolFI_LowOrder(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline double InterpLin(double r, double f1, double f2) { return f1 + r*(f2 - f1);} + GPU_PORTABLE inline static void ImproveReAndIm(float*, float*); //OC02022020 + GPU_PORTABLE inline static int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); //OC02022020 +#else //inline void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); inline static void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); //OC02022020 //inline void SetupInterpolAux02_LowOrder(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); @@ -310,6 +358,7 @@ class srTGenOptElem : public CGenObject { inline static void ImproveReAndIm(float*, float*); //OC02022020 //inline int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); inline static int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); //OC02022020 +#endif int RadResizeGen1D(srTRadSect1D&, srTRadResize1D&); int RadResizeCore1D(srTRadSect1D&, srTRadSect1D&, srTRadResize1D&); @@ -346,6 +395,9 @@ class srTGenOptElem : public CGenObject { //inline void MultSquareMatrByVect(float**, float*, int, float*); inline void MultSquareMatrByVect(double**, double*, int, double*); //OC130311 +#ifdef _OFFLOAD_GPU //HG04122023 + GPU_PORTABLE +#endif inline void CosAndSin(double, float&, float&); 
inline void FindLowestAndUppestPoints(TVector3d&, TVector3d*, int, int&, int&); inline void ReflectVect(TVector3d& N, TVector3d& V); diff --git a/cpp/src/core/sroptelm_gpu.cu b/cpp/src/core/sroptelm_gpu.cu new file mode 100644 index 00000000..f9a65861 --- /dev/null +++ b/cpp/src/core/sroptelm_gpu.cu @@ -0,0 +1,587 @@ +/************************************************************************//** + * File: sroptelm_gpu.cu + * Description: Optical element (general CUDA functions) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" +#include +#include +#include +#include "sroptelm.h" +#include "sroptelm_gpu.h" + + +__global__ void TreatStronglyOscillatingTerm_Kernel(srTSRWRadStructAccessData RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + int ie = (blockIdx.z * blockDim.z + threadIdx.z) + ieStart; //ne range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz && ie < RadAccessData.ne + ieStart) + { + double ePh = RadAccessData.eStart + RadAccessData.eStep * (ie - ieStart); + if (RadAccessData.PresT == 1) + { + ePh = RadAccessData.avgPhotEn; //?? 
OC041108 + } + + double ConstRxE = ConstRx * ePh; + double ConstRzE = ConstRz * ePh; + if (RadAccessData.Pres == 1) + { + //double Lambda_m = 1.239854e-06/ePh; + double Lambda_m = 1.239842e-06 / ePh; + if (RadAccessData.PhotEnergyUnit == 1) Lambda_m *= 0.001; // if keV + + double Lambda_me2 = Lambda_m * Lambda_m; + ConstRxE *= Lambda_me2; + ConstRzE *= Lambda_me2; + } + + double z = (RadAccessData.zStart - RadAccessData.zc) + (iz * RadAccessData.zStep); + double PhaseAddZ = 0; + if (RadAccessData.WfrQuadTermCanBeTreatedAtResizeZ) PhaseAddZ = ConstRzE * z * z; + + double x = (RadAccessData.xStart - RadAccessData.xc) + (ix * RadAccessData.xStep); + double Phase = PhaseAddZ; + if (RadAccessData.WfrQuadTermCanBeTreatedAtResizeX) Phase += ConstRxE * x * x; + + float SinPh, CosPh; + sincosf(Phase, &SinPh, &CosPh); + + long long PerX = RadAccessData.ne << 1; + long long PerZ = PerX * RadAccessData.nx; + long long offset = ie * 2 + iz * PerZ + ix * PerX; + + if (TreatPolCompX) + { + float* pExRe = RadAccessData.pBaseRadX + offset; + float* pExIm = pExRe + 1; + double ExReNew = (*pExRe) * CosPh - (*pExIm) * SinPh; + double ExImNew = (*pExRe) * SinPh + (*pExIm) * CosPh; + *pExRe = (float)ExReNew; *pExIm = (float)ExImNew; + } + if (TreatPolCompZ) + { + float* pEzRe = RadAccessData.pBaseRadZ + offset; + float* pEzIm = pEzRe + 1; + double EzReNew = (*pEzRe) * CosPh - (*pEzIm) * SinPh; + double EzImNew = (*pEzRe) * SinPh + (*pEzIm) * CosPh; + *pEzRe = (float)EzReNew; *pEzIm = (float)EzImNew; + } + } +} + +void srTGenOptElem::TreatStronglyOscillatingTerm_GPU(srTSRWRadStructAccessData& RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart, int ieBefEnd, TGPUUsageArg* pGPU) +{ + if (RadAccessData.pBaseRadX != NULL) + { + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadX); + 
} + if (RadAccessData.pBaseRadZ != NULL) + { + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadZ); + } + + const int bs = 256; + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz, ieBefEnd - ieStart); + dim3 threads(bs, 1); + TreatStronglyOscillatingTerm_Kernel<< > > (RadAccessData, TreatPolCompX, TreatPolCompZ, ConstRx, ConstRz, ieStart); + + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); + +#ifndef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +__global__ void MakeWfrEdgeCorrection_Kernel(srTSRWRadStructAccessData RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr DataPtrs, float dxSt, float dxFi, float dzSt, float dzFi) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz) + { + //float dxSt = (float)DataPtrs.dxSt; + //float dxFi = (float)DataPtrs.dxFi; + //float dzSt = 
(float)DataPtrs.dzSt; + //float dzFi = (float)DataPtrs.dzFi; + float dxSt_dzSt = dxSt * dzSt; + float dxSt_dzFi = dxSt * dzFi; + float dxFi_dzSt = dxFi * dzSt; + float dxFi_dzFi = dxFi * dzFi; + + long TwoNz = RadAccessData.nz << 1; + long PerX = 2; + long PerZ = PerX * RadAccessData.nx; + + float fSSExRe = DataPtrs.fxStzSt[0]; + float fSSExIm = DataPtrs.fxStzSt[1]; + float fSSEzRe = DataPtrs.fxStzSt[2]; + float fSSEzIm = DataPtrs.fxStzSt[3]; + + float fFSExRe = DataPtrs.fxFizSt[0]; + float fFSExIm = DataPtrs.fxFizSt[1]; + float fFSEzRe = DataPtrs.fxFizSt[2]; + float fFSEzIm = DataPtrs.fxFizSt[3]; + + float fSFExRe = DataPtrs.fxStzFi[0]; + float fSFExIm = DataPtrs.fxStzFi[1]; + float fSFEzRe = DataPtrs.fxStzFi[2]; + float fSFEzIm = DataPtrs.fxStzFi[3]; + + float fFFExRe = DataPtrs.fxFizFi[0]; + float fFFExIm = DataPtrs.fxFizFi[1]; + float fFFEzRe = DataPtrs.fxFizFi[2]; + float fFFEzIm = DataPtrs.fxFizFi[3]; + + float bRe, bIm, cRe, cIm; + + long long Two_iz = iz << 1; + long long Two_iz_p_1 = Two_iz + 1; + long long Two_ix = ix << 1; + long long Two_ix_p_1 = Two_ix + 1; + + float* tEx = pDataEx + iz * PerZ + ix * PerX, * tEz = pDataEz + iz * PerZ + ix * PerX; + float ExRe = *tEx, ExIm = *(tEx + 1); + float EzRe = *tEz, EzIm = *(tEz + 1); + + if (dxSt != 0.f) + { + float ExpXStRe = DataPtrs.ExpArrXSt[Two_ix], ExpXStIm = DataPtrs.ExpArrXSt[Two_ix_p_1]; + + bRe = DataPtrs.FFTArrXStEx[Two_iz]; bIm = DataPtrs.FFTArrXStEx[Two_iz_p_1]; + ExRe += (float)(dxSt * (ExpXStRe * bRe - ExpXStIm * bIm)); + ExIm += (float)(dxSt * (ExpXStRe * bIm + ExpXStIm * bRe)); + + bRe = DataPtrs.FFTArrXStEz[Two_iz]; bIm = DataPtrs.FFTArrXStEz[Two_iz_p_1]; + EzRe += (float)(dxSt * (ExpXStRe * bRe - ExpXStIm * bIm)); + EzIm += (float)(dxSt * (ExpXStRe * bIm + ExpXStIm * bRe)); + + if (dzSt != 0.f) + { + bRe = DataPtrs.ExpArrZSt[Two_iz], bIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + cRe = ExpXStRe * bRe - ExpXStIm * bIm; cIm = ExpXStRe * bIm + ExpXStIm * bRe; + + ExRe += (float)(dxSt_dzSt * (fSSExRe * 
cRe - fSSExIm * cIm)); + ExIm += (float)(dxSt_dzSt * (fSSExRe * cIm + fSSExIm * cRe)); + EzRe += (float)(dxSt_dzSt * (fSSEzRe * cRe - fSSEzIm * cIm)); + EzIm += (float)(dxSt_dzSt * (fSSEzRe * cIm + fSSEzIm * cRe)); + } + if (dzFi != 0.f) + { + bRe = DataPtrs.ExpArrZFi[Two_iz], bIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + cRe = ExpXStRe * bRe - ExpXStIm * bIm; cIm = ExpXStRe * bIm + ExpXStIm * bRe; + + ExRe -= (float)(dxSt_dzFi * (fSFExRe * cRe - fSFExIm * cIm)); + ExIm -= (float)(dxSt_dzFi * (fSFExRe * cIm + fSFExIm * cRe)); + EzRe -= (float)(dxSt_dzFi * (fSFEzRe * cRe - fSFEzIm * cIm)); + EzIm -= (float)(dxSt_dzFi * (fSFEzRe * cIm + fSFEzIm * cRe)); + } + } + if (dxFi != 0.f) + { + float ExpXFiRe = DataPtrs.ExpArrXFi[Two_ix], ExpXFiIm = DataPtrs.ExpArrXFi[Two_ix_p_1]; + + bRe = DataPtrs.FFTArrXFiEx[Two_iz]; bIm = DataPtrs.FFTArrXFiEx[Two_iz_p_1]; + ExRe -= (float)(dxFi * (ExpXFiRe * bRe - ExpXFiIm * bIm)); + ExIm -= (float)(dxFi * (ExpXFiRe * bIm + ExpXFiIm * bRe)); + + bRe = DataPtrs.FFTArrXFiEz[Two_iz]; bIm = DataPtrs.FFTArrXFiEz[Two_iz_p_1]; + EzRe -= (float)(dxFi * (ExpXFiRe * bRe - ExpXFiIm * bIm)); + EzIm -= (float)(dxFi * (ExpXFiRe * bIm + ExpXFiIm * bRe)); + + if (dzSt != 0.f) + { + bRe = DataPtrs.ExpArrZSt[Two_iz], bIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + cRe = ExpXFiRe * bRe - ExpXFiIm * bIm; cIm = ExpXFiRe * bIm + ExpXFiIm * bRe; + + ExRe -= (float)(dxFi_dzSt * (fFSExRe * cRe - fFSExIm * cIm)); + ExIm -= (float)(dxFi_dzSt * (fFSExRe * cIm + fFSExIm * cRe)); + EzRe -= (float)(dxFi_dzSt * (fFSEzRe * cRe - fFSEzIm * cIm)); + EzIm -= (float)(dxFi_dzSt * (fFSEzRe * cIm + fFSEzIm * cRe)); + } + if (dzFi != 0.f) + { + bRe = DataPtrs.ExpArrZFi[Two_iz], bIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + cRe = ExpXFiRe * bRe - ExpXFiIm * bIm; cIm = ExpXFiRe * bIm + ExpXFiIm * bRe; + + ExRe += (float)(dxFi_dzFi * (fFFExRe * cRe - fFFExIm * cIm)); + ExIm += (float)(dxFi_dzFi * (fFFExRe * cIm + fFFExIm * cRe)); + EzRe += (float)(dxFi_dzFi * (fFFEzRe * cRe - fFFEzIm * cIm)); + EzIm 
+= (float)(dxFi_dzFi * (fFFEzRe * cIm + fFFEzIm * cRe)); + } + } + if (dzSt != 0.f) + { + float ExpZStRe = DataPtrs.ExpArrZSt[Two_iz], ExpZStIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + + bRe = DataPtrs.FFTArrZStEx[Two_ix]; bIm = DataPtrs.FFTArrZStEx[Two_ix_p_1]; + ExRe += (float)(dzSt * (ExpZStRe * bRe - ExpZStIm * bIm)); + ExIm += (float)(dzSt * (ExpZStRe * bIm + ExpZStIm * bRe)); + + bRe = DataPtrs.FFTArrZStEz[Two_ix]; bIm = DataPtrs.FFTArrZStEz[Two_ix_p_1]; + EzRe += (float)(DataPtrs.dzSt * (ExpZStRe * bRe - ExpZStIm * bIm)); + EzIm += (float)(DataPtrs.dzSt * (ExpZStRe * bIm + ExpZStIm * bRe)); + } + if (dzFi != 0.f) + { + float ExpZFiRe = DataPtrs.ExpArrZFi[Two_iz], ExpZFiIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + + bRe = DataPtrs.FFTArrZFiEx[Two_ix]; bIm = DataPtrs.FFTArrZFiEx[Two_ix_p_1]; + ExRe -= (float)(dzFi * (ExpZFiRe * bRe - ExpZFiIm * bIm)); + ExIm -= (float)(dzFi * (ExpZFiRe * bIm + ExpZFiIm * bRe)); + + bRe = DataPtrs.FFTArrZFiEz[Two_ix]; bIm = DataPtrs.FFTArrZFiEz[Two_ix_p_1]; + EzRe -= (float)(dzFi * (ExpZFiRe * bRe - ExpZFiIm * bIm)); + EzIm -= (float)(dzFi * (ExpZFiRe * bIm + ExpZFiIm * bRe)); + } + + *tEx = ExRe; *(tEx + 1) = ExIm; + *tEz = EzRe; *(tEz + 1) = EzIm; + } +} + +void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU) +{ + pDataEx = (float*)CAuxGPU::ToDevice(pGPU, pDataEx, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + pDataEz = (float*)CAuxGPU::ToDevice(pGPU, pDataEz, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXStEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXStEx, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXStEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXStEz, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXFiEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXFiEx, 2*RadAccessData->nz*sizeof(float)); + 
DataPtrs.FFTArrXFiEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXFiEz, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrZStEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZStEx, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZStEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZStEz, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZFiEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZFiEx, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZFiEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZFiEz, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrXSt = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrXSt, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrXFi = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrXFi, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrZSt = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrZSt, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.ExpArrZFi = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrZFi, 2*RadAccessData->nz*sizeof(float)); + + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pDataEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pDataEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXStEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXStEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXFiEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXFiEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZStEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZStEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZFiEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZFiEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrXSt); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrXFi); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrZSt); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrZFi); + + const int bs = 256; + dim3 blocks(RadAccessData->nx / bs + ((RadAccessData->nx & (bs - 1)) != 0), 
RadAccessData->nz); + dim3 threads(bs, 1); + MakeWfrEdgeCorrection_Kernel << > > (*RadAccessData, pDataEx, pDataEz, DataPtrs, (float)DataPtrs.dxSt, (float)DataPtrs.dxFi, (float)DataPtrs.dzSt, (float)DataPtrs.dzFi); + + DataPtrs.FFTArrXStEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXStEx, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXStEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXStEz, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXFiEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXFiEx, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXFiEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXFiEz, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrZStEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZStEx, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZStEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZStEz, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZFiEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZFiEx, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZFiEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZFiEz, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrXSt = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrXSt, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrXFi = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrXFi, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrZSt = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrZSt, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.ExpArrZFi = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrZFi, 2*RadAccessData->nz*sizeof(float), true); + + CAuxGPU::MarkUpdated(pGPU, pDataEx, true, false); + CAuxGPU::MarkUpdated(pGPU, pDataEz, true, false); + +#ifdef _DEBUG + CAuxGPU::ToHostAndFree(pGPU, pDataEx, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + CAuxGPU::ToHostAndFree(pGPU, pDataEz, 
2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +template __global__ void RadResizeCore_Kernel(srTSRWRadStructAccessData OldRadAccessData, srTSRWRadStructAccessData NewRadAccessData) +{ + int ixStart = int(NewRadAccessData.AuxLong1); + int ixEnd = int(NewRadAccessData.AuxLong2); + int izStart = int(NewRadAccessData.AuxLong3); + int izEnd = int(NewRadAccessData.AuxLong4); + + int ix = (blockIdx.x * blockDim.x + threadIdx.x) + ixStart; //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y) + izStart; //nz range + int ie = (blockIdx.z * blockDim.z + threadIdx.z); //ne range + + if (ix > ixEnd) return; + if (iz > izEnd) return; + + const double DistAbsTol = 1.E-10; + double xStepInvOld = 1./OldRadAccessData.xStep; + double zStepInvOld = 1./OldRadAccessData.zStep; + int nx_mi_1Old = OldRadAccessData.nx - 1; + int nz_mi_1Old = OldRadAccessData.nz - 1; + int nx_mi_2Old = nx_mi_1Old - 1; + int nz_mi_2Old = nz_mi_1Old - 1; + + //OC31102018: moved by SY at parallelizing SRW via OpenMP + //srTInterpolAux01 InterpolAux01; + //srTInterpolAux02 InterpolAux02[4], InterpolAux02I[2]; + //srTInterpolAuxF AuxF[4], AuxFI[2]; + //int ixStOld, izStOld, ixStOldPrev = -1000, izStOldPrev = -1000; + + //long PerX_New = NewRadAccessData.ne << 1; + //long PerZ_New = PerX_New*NewRadAccessData.nx; + long long PerX_New = NewRadAccessData.ne << 1; + long long PerZ_New = PerX_New*NewRadAccessData.nx; + + //long PerX_Old = PerX_New; + //long PerZ_Old = PerX_Old*OldRadAccessData.nx; + long long PerX_Old = PerX_New; + long long PerZ_Old = PerX_Old*OldRadAccessData.nx; + + float *pEX0_New = 0, *pEZ0_New = 0; + pEX0_New = NewRadAccessData.pBaseRadX; + pEZ0_New = NewRadAccessData.pBaseRadZ; + + float* pEX0_Old = 0, * pEZ0_Old = 0; + pEX0_Old = OldRadAccessData.pBaseRadX; + pEZ0_Old = OldRadAccessData.pBaseRadZ; + + + int ixStOld, izStOld, ixStOldPrev = 
-1000, izStOldPrev = -1000; + //SY: do we need this (always returns 0, updates some clock) + //if(result = srYield.Check()) return result; + + double zAbs = NewRadAccessData.zStart + iz * NewRadAccessData.zStep; + + char FieldShouldBeZeroedDueToZ = 0; + if (NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if ((zAbs < NewRadAccessData.zWfrMin - DistAbsTol) || (zAbs > NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; + } + + int izcOld = int((zAbs - OldRadAccessData.zStart) * zStepInvOld + 1.E-06); + + double zRel = zAbs - (OldRadAccessData.zStart + izcOld * OldRadAccessData.zStep); + + if (izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2. * OldRadAccessData.zStep; } + else if (izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep; } + else if (izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep; } + else izStOld = izcOld - 1; + + zRel *= zStepInvOld; + + int izcOld_mi_izStOld = izcOld - izStOld; + //long izPerZ_New = iz*PerZ_New; + long long izPerZ_New = iz * PerZ_New; + + double xAbs = NewRadAccessData.xStart + ix * NewRadAccessData.xStep; + + char FieldShouldBeZeroedDueToX = 0; + if (NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if ((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; + } + char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || FieldShouldBeZeroedDueToZ); + + int ixcOld = int((xAbs - OldRadAccessData.xStart) * xStepInvOld + 1.E-06); + double xRel = xAbs - (OldRadAccessData.xStart + ixcOld * OldRadAccessData.xStep); + + if (ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2. 
* OldRadAccessData.xStep; } + else if (ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep; } + else if (ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep; } + else ixStOld = ixcOld - 1; + + xRel *= xStepInvOld; + + int ixcOld_mi_ixStOld = ixcOld - ixStOld; + + //or (int ie = 0; ie < NewRadAccessData.ne; ie++) + { + //OC31102018: modified by SY at OpenMP parallelization + //ixStOldPrev = -1000; izStOldPrev = -1000; + + //OC31102018: moved by SY at OpenMP parallelization + srTInterpolAux01 InterpolAux01; + srTInterpolAux02 InterpolAux02[4], InterpolAux02I[2]; + srTInterpolAuxF AuxF[4], AuxFI[2]; + ixStOldPrev = -1000; izStOldPrev = -1000; + float BufF[4], BufFI[2]; + char UseLowOrderInterp_PolCompX = 0, UseLowOrderInterp_PolCompZ = 0; + + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; + + float* pEX_StartForX_New = 0, * pEZ_StartForX_New = 0; + pEX_StartForX_New = pEX0_New + izPerZ_New; + pEZ_StartForX_New = pEZ0_New + izPerZ_New; + + //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + long long ixPerX_New_p_Two_ie = ix * PerX_New + Two_ie; + float* pEX_New = 0, * pEZ_New = 0; + pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; + pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; + + //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + long long TotOffsetOld = izStOld * PerZ_Old + ixStOld * PerX_Old + Two_ie; + + if (TreatPolCompX) + { + float* pExSt_Old = pEX0_Old + TotOffsetOld; + srTGenOptElem::GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + + srTGenOptElem::SetupCellDataI(AuxF, AuxFI); + UseLowOrderInterp_PolCompX = srTGenOptElem::CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + + if (!UseLowOrderInterp_PolCompX) + { + for (int i = 0; i < 2; i++) + { + srTGenOptElem::SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + } + srTGenOptElem::SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); 
+ } + + if (UseLowOrderInterp_PolCompX) + { + srTGenOptElem::InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); + srTGenOptElem::InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + } + else + { + srTGenOptElem::InterpolF(InterpolAux02, xRel, zRel, BufF, 0); + srTGenOptElem::InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); + } + + (*BufFI) *= AuxFI->fNorm; + srTGenOptElem::ImproveReAndIm(BufF, BufFI); + + if (FieldShouldBeZeroed) + { + *BufF = 0.; *(BufF + 1) = 0.; + } + + *pEX_New = *BufF; + *(pEX_New + 1) = *(BufF + 1); + } + if (TreatPolCompZ) + { + float* pEzSt_Old = pEZ0_Old + TotOffsetOld; + srTGenOptElem::GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF + 2); + + srTGenOptElem::SetupCellDataI(AuxF + 2, AuxFI + 1); + UseLowOrderInterp_PolCompZ = srTGenOptElem::CheckForLowOrderInterp(AuxF + 2, AuxFI + 1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02 + 2, InterpolAux02I + 1); + + if (!UseLowOrderInterp_PolCompZ) + { + for (int i = 0; i < 2; i++) + { + srTGenOptElem::SetupInterpolAux02(AuxF + 2 + i, &InterpolAux01, InterpolAux02 + 2 + i); + } + srTGenOptElem::SetupInterpolAux02(AuxFI + 1, &InterpolAux01, InterpolAux02I + 1); + } + + if (UseLowOrderInterp_PolCompZ) + { + srTGenOptElem::InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); + srTGenOptElem::InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + } + else + { + srTGenOptElem::InterpolF(InterpolAux02, xRel, zRel, BufF, 2); + srTGenOptElem::InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); + } + + (*(BufFI + 1)) *= (AuxFI + 1)->fNorm; + srTGenOptElem::ImproveReAndIm(BufF + 2, BufFI + 1); + + if (FieldShouldBeZeroed) + { + *(BufF + 2) = 0.; *(BufF + 3) = 0.; + } + + *pEZ_New = *(BufF + 2); + *(pEZ_New + 1) = *(BufF + 3); + } + } +} + +int srTGenOptElem::RadResizeCore_GPU(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, char PolComp, TGPUUsageArg* pGPU) +{ + char TreatPolCompX = ((PolComp == 0) || (PolComp == 
'x')); + char TreatPolCompZ = ((PolComp == 0) || (PolComp == 'z')); + + int nx = NewRadAccessData.AuxLong2 - NewRadAccessData.AuxLong1 + 1; + int nz = NewRadAccessData.AuxLong4 - NewRadAccessData.AuxLong3 + 1; + int ne = NewRadAccessData.ne; + OldRadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, OldRadAccessData.pBaseRadX, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float)); + OldRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, OldRadAccessData.pBaseRadZ, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float)); + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), true); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), true); + + CAuxGPU::EnsureDeviceMemoryReady(pGPU, OldRadAccessData.pBaseRadX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, OldRadAccessData.pBaseRadZ); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, NewRadAccessData.pBaseRadX); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, NewRadAccessData.pBaseRadZ); + + const int bs = 32; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz, ne); + dim3 threads(bs, 1); + + if (TreatPolCompX && TreatPolCompZ) RadResizeCore_Kernel << > > (OldRadAccessData, NewRadAccessData); + else if (TreatPolCompX) RadResizeCore_Kernel << > > (OldRadAccessData, NewRadAccessData); + else if (TreatPolCompZ) RadResizeCore_Kernel << > > (OldRadAccessData, NewRadAccessData); + + OldRadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, OldRadAccessData.pBaseRadX, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float), true); + OldRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, OldRadAccessData.pBaseRadZ, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float), true); + 
//NewRadAccessData.pBaseRadX = CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float)); + //NewRadAccessData.pBaseRadZ = CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float)); + CAuxGPU::MarkUpdated(pGPU, NewRadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, NewRadAccessData.pBaseRadZ, true, false); +#ifndef _DEBUG + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, NewRadAccessData.pBaseRadX); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, NewRadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), false); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), false); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); + +#endif + + return 0; +} + +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptelm_gpu.h b/cpp/src/core/sroptelm_gpu.h new file mode 100644 index 00000000..629e0c42 --- /dev/null +++ b/cpp/src/core/sroptelm_gpu.h @@ -0,0 +1,123 @@ +/************************************************************************//** + * File: sroptelm_gpu.h + * Description: Optical element (general CUDA header) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#ifndef __SROPTELMGPU_H +#define __SROPTELMGPU_H + +#include "cuda_runtime.h" +#include +#include +#include + +#ifdef __CUDACC__ 
+template <typename T> __global__ void RadPointModifierParallel_Kernel(srTSRWRadStructAccessData RadAccessData, void* pBufVars, T* tgt_obj) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz) + { + srTEFieldPtrs EPtrs; + srTEXZ EXZ; + EXZ.z = RadAccessData.zStart + iz * RadAccessData.zStep; + EXZ.x = RadAccessData.xStart + ix * RadAccessData.xStep; + + for (int ie = 0; ie < RadAccessData.ne; ie++) { + EXZ.e = RadAccessData.eStart + ie * RadAccessData.eStep; + EXZ.aux_offset = RadAccessData.ne * RadAccessData.nx * 2 * iz + RadAccessData.ne * 2 * ix + ie * 2; + if (RadAccessData.pBaseRadX != 0) + { + EPtrs.pExRe = RadAccessData.pBaseRadX + EXZ.aux_offset; + EPtrs.pExIm = EPtrs.pExRe + 1; + } + else + { + EPtrs.pExRe = 0; + EPtrs.pExIm = 0; + } + if (RadAccessData.pBaseRadZ != 0) + { + EPtrs.pEzRe = RadAccessData.pBaseRadZ + EXZ.aux_offset; + EPtrs.pEzIm = EPtrs.pEzRe + 1; + } + else + { + EPtrs.pEzRe = 0; + EPtrs.pEzIm = 0; + } + + tgt_obj->RadPointModifierPortable(EXZ, EPtrs, pBufVars); + } + } +} + +template <typename T> int RadPointModifierParallelImpl(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, T* tgt_obj, TGPUUsageArg* pGPU) +{ + const int bs = 256; + dim3 blocks(pRadAccessData->nx / bs + ((pRadAccessData->nx & (bs - 1)) != 0), pRadAccessData->nz); + dim3 threads(bs, 1); + + if (pRadAccessData->pBaseRadX != NULL) + { + pRadAccessData->pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, pRadAccessData->pBaseRadX, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pRadAccessData->pBaseRadX); + } + if (pRadAccessData->pBaseRadZ != NULL) + { + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, pRadAccessData->pBaseRadZ, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pRadAccessData->pBaseRadZ); + } 
+ + T* local_copy = (T*)CAuxGPU::ToDevice(pGPU, tgt_obj, sizeof(T)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, local_copy); + //cudaMalloc(&local_copy, sizeof(T)); + //cudaMemcpy(local_copy, tgt_obj, sizeof(T), cudaMemcpyHostToDevice); + + void* pBufVars_dev = NULL; + if (pBufVarsSz > 0){ + pBufVars_dev = CAuxGPU::ToDevice(pGPU, pBufVars, pBufVarsSz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBufVars_dev); + } + RadPointModifierParallel_Kernel<T> <<<blocks, threads>>> (*pRadAccessData, pBufVars_dev, local_copy); + //cudaDeviceSynchronize(); + //cudaFreeAsync(local_copy, 0); + if (pBufVarsSz > 0) CAuxGPU::ToHostAndFree(pGPU, pBufVars_dev, pBufVarsSz, true); + CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(T), true); + + CAuxGPU::MarkUpdated(pGPU, pRadAccessData->pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, pRadAccessData->pBaseRadZ, true, false); + +#ifndef _DEBUG + if (pRadAccessData->pBaseRadX != NULL) + pRadAccessData->pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, pRadAccessData->pBaseRadX); + if (pRadAccessData->pBaseRadZ != NULL) + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, pRadAccessData->pBaseRadZ); +#endif + +#ifdef _DEBUG + if (pRadAccessData->pBaseRadX != NULL) + pRadAccessData->pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, pRadAccessData->pBaseRadX, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + if (pRadAccessData->pBaseRadZ != NULL) + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, pRadAccessData->pBaseRadZ, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif + + return 0; +} +#endif + +#endif //__SROPTELMGPU_H +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptgtr.cpp b/cpp/src/core/sroptgtr.cpp index 7f348172..96681032 100644 --- a/cpp/src/core/sroptgtr.cpp +++ b/cpp/src/core/sroptgtr.cpp @@ -1172,7 +1172,7 @@ int 
srTGenTransmission::DetermineFocalDistByPropag1D(srTRadSect1D& Sect1D, doubl } //************************************************************************* - +/* HG01122023 Moved to header file to reduce code duplication for GPU support void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBufVars) //OC29082019 //void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {// e in eV; Length in m !!! @@ -1338,7 +1338,7 @@ void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, voi float NewEzIm = (float)(T*((*(EPtrs.pEzRe))*SinPh + (*(EPtrs.pEzIm))*CosPh)); *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } -} +} */ //************************************************************************* diff --git a/cpp/src/core/sroptgtr.h b/cpp/src/core/sroptgtr.h index 0cde61e5..202190c3 100644 --- a/cpp/src/core/sroptgtr.h +++ b/cpp/src/core/sroptgtr.h @@ -50,6 +50,8 @@ class srTGenTransmission : public srTFocusingElem { } } + int SupportedFeatures() override { return 1; } //HG01122023 =1 means that it supports GPU propagation + void EnsureTransmissionForField(); double DetermineAppropriatePhotEnergyForFocDistTest(double Rx, double Rz); int EstimateFocalDistancesAndCheckSampling(); @@ -79,7 +81,8 @@ class srTGenTransmission : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterArr) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr, void* pvGPU) //HG01122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms 
analytically @@ -90,7 +93,8 @@ class srTGenTransmission : public srTFocusingElem { int result = 0; - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG01122023 else result = PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterArr); //if(ParPrecWfrPropag.AnalTreatment == 1) @@ -104,25 +108,30 @@ class srTGenTransmission : public srTFocusingElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE, void* pvGPU) //HG01122023 { int result; if(result = PropagateRadMoments(pRadAccessData, 0)) return result; if(result = PropagateWaveFrontRadius(pRadAccessData)) return result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 //OC01102019 (restored) - if(result = PropagateRadiationSimple(pRadAccessData)) return result; + //if(result = PropagateRadiationSimple(pRadAccessData)) return result; + if(result = PropagateRadiationSimple(pRadAccessData, pvGPU)) return result; //HG01122023 if(result = Propagate4x4PropMatr(pRadAccessData)) return result; return 0; } //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int 
PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - return TraverseRadZXE(pRadAccessData); + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG01122023 + //return TraverseRadZXE(pRadAccessData); + return TraverseRadZXE(pRadAccessData, 0, 0, pvGPU); //HG01122023 } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) { @@ -131,8 +140,186 @@ class srTGenTransmission : public srTFocusingElem { return TraverseRad1D(pSect1D); } - void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC29082019 + void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //OC29082019 //HG01122023 + { + RadPointModifierPortable(EXZ, EPtrs, pBuf); + } + //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC29082019 //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs); + + +#ifdef _OFFLOAD_GPU //HG01122023 Brought from sroptgtr.cpp, to reduce code duplication for GPU port + int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pGPU=0) override; + + GPU_PORTABLE +#endif + void RadPointModifierPortable(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBufVars) //OC29082019 + //void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) + {// e in eV; Length in m !!! + // Operates on Coord. side !!! 
+ //double xRel = EXZ.x - TransvCenPoint.x, zRel = EXZ.z - TransvCenPoint.y; + double xRel = EXZ.x, zRel = EXZ.z; //OC080311 + + long Ne = 1, Nemi2 = -1; + long iDimX = 0, iDimZ = 1; + if(GenTransNumData.AmOfDims == 3) + { + //Ne = (GenTransNumData.DimSizes)[0]; + Ne = (long)((GenTransNumData.DimSizes)[0]); //OC28042019 + Nemi2 = Ne - 2; + iDimX = 1; iDimZ = 2; + } + + //long Nx = (GenTransNumData.DimSizes)[0], Nz = (GenTransNumData.DimSizes)[1]; + //long Nx = (GenTransNumData.DimSizes)[iDimX], Nz = (GenTransNumData.DimSizes)[iDimZ]; //OC241112 + long Nx = (long)((GenTransNumData.DimSizes)[iDimX]), Nz = (long)((GenTransNumData.DimSizes)[iDimZ]); //OC28042019 + long Nxmi2 = Nx - 2, Nzmi2 = Nz - 2; + + //double xStart = (GenTransNumData.DimStartValues)[0], zStart = (GenTransNumData.DimStartValues)[1]; + //double xStep = (GenTransNumData.DimSteps)[0], zStep = (GenTransNumData.DimSteps)[1]; + double xStart = (GenTransNumData.DimStartValues)[iDimX], zStart = (GenTransNumData.DimStartValues)[iDimZ]; + double xStep = (GenTransNumData.DimSteps)[iDimX], zStep = (GenTransNumData.DimSteps)[iDimZ]; + + double xEnd = xStart + (Nx - 1)*xStep, zEnd = zStart + (Nz - 1)*zStep; + + double AbsTolX = xStep*0.001, AbsTolZ = zStep*0.001; // To steer + if(OuterTransmIs == 1) + { + if((xRel < xStart - AbsTolX) || (xRel > xEnd + AbsTolX) || (zRel < zStart - AbsTolZ) || (zRel > zEnd + AbsTolZ)) + { + if(EPtrs.pExRe != 0) { *(EPtrs.pExRe) = 0.; *(EPtrs.pExIm) = 0.;} + if(EPtrs.pEzRe != 0) { *(EPtrs.pEzRe) = 0.; *(EPtrs.pEzIm) = 0.;} + return; + } + } + + double xr = 0., zr = 0.; + double T = 1., Ph = 0.; + //char NotExactRightEdgeX = 1, NotExactRightEdgeZ = 1; + + long ix = long((xRel - xStart)/xStep); + if(::fabs(xRel - ((ix + 1)*xStep + xStart)) < 1.E-05*xStep) ix++; + + //if(ix < 0) { ix = 0; xr = 0.;} + //else if(ix > Nxmi2) { ix = Nx - 1; xr = 0.; NotExactRightEdgeX = 0;} + //else xr = (xRel - (ix*xStep + xStart))/xStep; + + if(ix < 0) ix = 0; //OC241112 + //else if(ix > Nxmi2) ix = 
Nxmi2; + //xr = (xRel - (ix*xStep + xStart))/xStep; + else if(ix > Nxmi2) { ix = Nxmi2; xr = 1.;} + else xr = (xRel - (ix*xStep + xStart))/xStep; + + long iz = long((zRel - zStart)/zStep); + if(::fabs(zRel - ((iz + 1)*zStep + zStart)) < 1.E-05*zStep) iz++; + + //if(iz < 0) { iz = 0; zr = 0.;} + //else if(iz > Nzmi2) { iz = Nz - 1; zr = 0.; NotExactRightEdgeZ = 0;} + //else zr = (zRel - (iz*zStep + zStart))/zStep; + + if(iz < 0) iz = 0; + //else if(iz > Nzmi2) iz = Nzmi2; + //zr = (zRel - (iz*zStep + zStart))/zStep; + else if(iz > Nzmi2) { iz = Nzmi2; zr = 1.;} + else zr = (zRel - (iz*zStep + zStart))/zStep; + + double xrzr = xr*zr; + if((GenTransNumData.AmOfDims == 2) || ((GenTransNumData.AmOfDims == 3) && (Ne == 1))) + { + //long zPer = Nx << 1; + long long zPer = Nx << 1; + + //DOUBLE *p00 = (DOUBLE*)(GenTransNumData.pData) + (iz*zPer + (ix << 1)); + //DOUBLE *p10 = p00 + 2, *p01 = p00 + zPer; + //DOUBLE *p11 = p01 + 2; + //DOUBLE *p00p1 = p00+1, *p10p1 = p10+1, *p01p1 = p01+1, *p11p1 = p11+1; + double *p00 = (double*)(GenTransNumData.pData) + (iz*zPer + (ix << 1)); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + double *p10 = p00 + 2, *p01 = p00 + zPer; + double *p11 = p01 + 2; + double *p00p1 = p00+1, *p10p1 = p10+1, *p01p1 = p01+1, *p11p1 = p11+1; + + //double Axz = 0., Ax = 0., Az = 0., Bxz = 0., Bx = 0., Bz = 0.; + //if(NotExactRightEdgeX && NotExactRightEdgeZ) { Axz = *p00 - *p01 - *p10 + *p11; Bxz = *p00p1 - *p01p1 - *p10p1 + *p11p1;} + //if(NotExactRightEdgeX) { Ax = (*p10 - *p00); Bx = (*p10p1 - *p00p1);} + //if(NotExactRightEdgeZ) { Az = (*p01 - *p00); Bz = (*p01p1 - *p00p1);} + + double Axz = *p00 - *p01 - *p10 + *p11, Bxz = *p00p1 - *p01p1 - *p10p1 + *p11p1; + double Ax = (*p10 - *p00), Bx = (*p10p1 - *p00p1); + double Az = (*p01 - *p00), Bz = (*p01p1 - *p00p1); + + T = Axz*xrzr + Ax*xr + Az*zr + *p00; + Ph = Bxz*xrzr + Bx*xr + Bz*zr + *p00p1; + + //OCTEST 04032019 + //T = *p00 + Ax*xr + Az*zr; + //Ph = *p00p1 + Bx*xr + Bz*zr; + + //OCTEST 
05032019 + //T = CGenMathInterp::InterpOnRegMesh2d(EXZ.x, EXZ.z, xStart, xStep, Nx, zStart, zStep, Nz, (double*)(GenTransNumData.pData), 3, 2); + //Ph = CGenMathInterp::InterpOnRegMesh2d(EXZ.x, EXZ.z, xStart, xStep, Nx, zStart, zStep, Nz, (double*)(GenTransNumData.pData) + 1, 3, 2); + //END OCTEST + } + else if(GenTransNumData.AmOfDims == 3) + {//bi-linear 3D interpolation + double eStart = (GenTransNumData.DimStartValues)[0]; + double eStep = (GenTransNumData.DimSteps)[0]; + + long ie = long((EXZ.e - eStart)/eStep + 1.e-10); + if(ie < 0) ie = 0; + else if(ie > Nemi2) ie = Nemi2; + + double er = (EXZ.e - (ie*eStep + eStart))/eStep; + //double erxr = er*xr, erzr = er*zr; + //double erxrzr = erxr*zr; + + //long xPer = Ne << 1; + //long zPer = Nx*xPer; + long long xPer = Ne << 1; + long long zPer = Nx*xPer; + //DOUBLE *p000 = (DOUBLE*)(GenTransNumData.pData) + (iz*zPer + ix*xPer + (ie << 1)); + //DOUBLE *p100 = p000 + 2, *p010 = p000 + xPer, *p001 = p000 + zPer; + //DOUBLE *p110 = p100 + xPer, *p101 = p100 + zPer, *p011 = p010 + zPer; + //DOUBLE *p111 = p110 + zPer; + double *p000 = (double*)(GenTransNumData.pData) + (iz*zPer + ix*xPer + (ie << 1)); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + double *p100 = p000 + 2, *p010 = p000 + xPer, *p001 = p000 + zPer; + double *p110 = p100 + xPer, *p101 = p100 + zPer, *p011 = p010 + zPer; + double *p111 = p110 + zPer; + + double one_mi_er = 1.- er, one_mi_xr = 1.- xr, one_mi_zr = 1.- zr; + double one_mi_er_one_mi_xr = one_mi_er*one_mi_xr, er_one_mi_xr = er*one_mi_xr; + double one_mi_er_xr = one_mi_er*xr, er_xr = er*xr; + T = ((*p000)*one_mi_er_one_mi_xr + (*p100)*er_one_mi_xr + (*p010)*one_mi_er_xr + (*p110)*er_xr)*one_mi_zr + + ((*p001)*one_mi_er_one_mi_xr + (*p101)*er_one_mi_xr + (*p011)*one_mi_er_xr + (*p111)*er_xr)*zr; + Ph = ((*(p000+1))*one_mi_er_one_mi_xr + (*(p100+1))*er_one_mi_xr + (*(p010+1))*one_mi_er_xr + (*(p110+1))*er_xr)*one_mi_zr + + ((*(p001+1))*one_mi_er_one_mi_xr + (*(p101+1))*er_one_mi_xr + 
(*(p011+1))*one_mi_er_xr + (*(p111+1))*er_xr)*zr; + + // inArFunc[] = {f(x0,y0,z0),f(x1,y0,z0),f(x0,y1,z0),f(x0,y0,z1),f(x1,y1,z0),f(x1,y0,z1),f(x0,y1,z1),f(x1,y1,z1)} //function values at the corners of the cube + //return inArFunc[0]*one_mi_xt*one_mi_yt*one_mi_zt + // + inArFunc[1]*xt*one_mi_yt*one_mi_zt + // + inArFunc[2]*one_mi_xt*yt*one_mi_zt + // + inArFunc[3]*one_mi_xt*one_mi_yt*zt + // + inArFunc[4]*xt*yt*one_mi_zt + // + inArFunc[5]*xt*one_mi_yt*zt + // + inArFunc[6]*one_mi_xt*yt*zt + // + inArFunc[7]*xt*yt*zt; + } + + if(OptPathOrPhase == 1) Ph *= EXZ.e*5.0676816042E+06; // TwoPi_d_Lambda_m + float CosPh, SinPh; CosAndSin(Ph, CosPh, SinPh); + if(EPtrs.pExRe != 0) + { + float NewExRe = (float)(T*((*(EPtrs.pExRe))*CosPh - (*(EPtrs.pExIm))*SinPh)); + float NewExIm = (float)(T*((*(EPtrs.pExRe))*SinPh + (*(EPtrs.pExIm))*CosPh)); + *(EPtrs.pExRe) = NewExRe; *(EPtrs.pExIm) = NewExIm; + } + if(EPtrs.pEzRe != 0) + { + float NewEzRe = (float)(T*((*(EPtrs.pEzRe))*CosPh - (*(EPtrs.pEzIm))*SinPh)); + float NewEzIm = (float)(T*((*(EPtrs.pEzRe))*SinPh + (*(EPtrs.pEzIm))*CosPh)); + *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; + } + } + void RadPointModifier1D(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC06092019 //void RadPointModifier1D(srTEXZ& EXZ, srTEFieldPtrs& EPtrs); diff --git a/cpp/src/core/sroptgtr_gpu.cu b/cpp/src/core/sroptgtr_gpu.cu new file mode 100644 index 00000000..9250740e --- /dev/null +++ b/cpp/src/core/sroptgtr_gpu.cu @@ -0,0 +1,32 @@ +/************************************************************************//** + * File: sroptgtr_gpu.cu + * Description: Optical element: Transmission (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "sroptgtr.h" +#include "cuda_runtime.h" 
+#include "device_launch_parameters.h" +#include "math_constants.h" + +#include +#include +#include + +int srTGenTransmission::RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, TGPUUsageArg* pGPU) +{ + GenTransNumData.pData = (char*)CAuxGPU::ToDevice(pGPU, GenTransNumData.pData, GenTransNumData.DimSizes[0] * (int)GenTransNumData.DimSizes[1] * (int)GenTransNumData.DimSizes[2] * sizeof(double) * 2); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, GenTransNumData.pData); + int retCode = RadPointModifierParallelImpl(pRadAccessData, pBufVars, pBufVarsSz, this, pGPU); + GenTransNumData.pData = (char*)CAuxGPU::ToHostAndFree(pGPU, GenTransNumData.pData, GenTransNumData.DimSizes[0] * (int)GenTransNumData.DimSizes[1] * (int)GenTransNumData.DimSizes[2] * sizeof(double) * 2, true); + return retCode; +} //HG03092022 +#endif \ No newline at end of file diff --git a/cpp/src/core/srradmnp.cpp b/cpp/src/core/srradmnp.cpp index 7522f78b..ff3597b8 100644 --- a/cpp/src/core/srradmnp.cpp +++ b/cpp/src/core/srradmnp.cpp @@ -676,8 +676,9 @@ int srTRadGenManip::ExtractSingleElecIntensity1DvsZ(srTRadExtract& RadExtract) //************************************************************************* -int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) -//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? 
+//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //HG30112023 +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, void* pvGPU) //HG02122023 { int PolCom = RadExtract.PolarizCompon; int Int_or_ReE = RadExtract.Int_or_Phase; @@ -691,7 +692,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) float *pI = 0, *pI1 = 0, *pI2 = 0, *pI3 = 0; //OC17042020 double *pId = 0, *pI1d = 0, *pI2d = 0, *pI3d = 0; long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz; - //long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; //Himanshu? + //long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; //HG30112023 //float *pI = 0; //DOUBLE *pId = 0; //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) @@ -759,185 +760,180 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long izPerZ = 0; long ix, ie; - //Himanshu? 
- //GPU_COND(pGpuUsage, - //{ - // ExtractSingleElecIntensity2DvsXZ_GPU(RadExtract, arAuxInt, ie0, ie1, InvStepRelArg, pGpuUsage); - //}) - //else - //{ - //long long iwfrPerWfr = 0; - //for(long long iwfr=0; iwfr 0) //OC08052021 + } + + if(iter == 0) //OC08052021 + { + //OC140813 + if(pI != 0) *(pI++) = (float)resInt; + if(pId != 0) *(pId++) = resInt; //OC18042020 + //if(pId != 0) *(pId++) = (double)resInt; + if(allStokesReq) //OC18042020 { - if(pI != 0) + if(RadExtract.pExtractedData != 0) { - float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); - *(pI++) = newI; + *(pI1++) = (float)resInt1; *(pI2++) = (float)resInt2; *(pI3++) = (float)resInt3; } - if(pId != 0) + else + { + *(pI1d++) = resInt1; *(pI2d++) = resInt2; *(pI3d++) = resInt3; + } + } + } + else if(iter > 0) //OC08052021 + { + if(pI != 0) + { + float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); + *(pI++) = newI; + } + if(pId != 0) + { + double newI = ((*pId)*iter + resInt)*inv_iter_p_1; + *(pId++) = newI; + } + if(allStokesReq) + { + if(RadExtract.pExtractedData != 0) { - double newI = ((*pId)*iter + resInt)*inv_iter_p_1; - *(pId++) = newI; + float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); + float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); + float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); + *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; } - if(allStokesReq) + else { - if(RadExtract.pExtractedData != 0) - { - float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); - float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); - float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); - *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; - } - else - { - double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; - double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; - double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; - *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; - } + double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; + 
double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; + double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; + *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; } } - else //OC08052021 + } + else //OC08052021 + { + if(pI != 0) *(pI++) += (float)resInt; + if(pId != 0) *(pId++) += resInt; + if(allStokesReq) { - if(pI != 0) *(pI++) += (float)resInt; - if(pId != 0) *(pId++) += resInt; - if(allStokesReq) + if(RadExtract.pExtractedData != 0) { - if(RadExtract.pExtractedData != 0) - { - *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; - } - else - { - *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; - } + *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; + } + else + { + *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; } } - - //ixPerX += PerX; - pEx_St += PerX; - pEz_St += PerX; - pEx_Fi += PerX; - pEz_Fi += PerX; } - izPerZ += PerZ; + + pEx_St += PerX; + pEz_St += PerX; + pEx_Fi += PerX; + pEz_Fi += PerX; } - //iwfrPerWfr += PerWfr; - //} - //} + izPerZ += PerZ; + } + } if(arAuxInt != 0) delete[] arAuxInt; //OC150813 return 0; } @@ -1586,8 +1582,8 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsZ(srTRadExtract& RadExtrac //************************************************************************* -int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) -//int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? +//int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract, void* pvGPU) //HG30112023 {//OC13122019 //This assumes "normal" data alignment in the complex "matrix" E(x,y)*E*(x',y') int res = 0; @@ -2124,13 +2120,14 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtra if(DontNeedInterp) { - //Himanshu? 
- //GPU_COND(pGpuUsage, - // { - // ExtractSingleElecMutualIntensityVsXZ_GPU(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, PolCom, EhOK, EvOK, pGpuUsage); - // }) - //else - //{ +#ifdef _OFFLOAD_GPU //HG30112023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + ExtractSingleElecMutualIntensityVsXZ_GPU(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, PolCom, EhOK, EvOK, (TGPUUsageArg*)pvGPU); + } + else +#endif + { for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) //for(long long it=0; it<=(itEnd-itStart); it++) //OC03032021 (to enable partial update of MI/CSD) //for(long long it=0; it 1); //OC18042020 double resInt, resInt1, resInt2, resInt3; @@ -74,7 +72,7 @@ __global__ void ExtractSingleElecIntensity2DvsXZ_Kernel(srTRadExtract RadExtract long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 long ie; - long offset = iwfr * PerWfr + iz * PerZ + ix * PerX; + long offset = iz * PerZ + ix * PerX; long offsetDiv2 = offset >> 1; float* pEx_StartForX = pEx0 + offset; @@ -211,30 +209,30 @@ static inline void ExtractSingleElecIntensity2DvsXZ_GPUSub(dim3 &blocks, dim3 &t } } -int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtract, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, gpuUsageArg *pGpuUsage) +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtract, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, TGPUUsageArg* pGPU) { srTSRWRadStructAccessData& RadAccessData = *((srTSRWRadStructAccessData*)(hRadAccessData.ptr())); const int bs = 256; - dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz, RadAccessData.nwfr); + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz); dim3 threads(bs, 1); if (RadAccessData.pBaseRadX != NULL) { - RadAccessData.pBaseRadX = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadX, 
2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadX); + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadX); } if (RadAccessData.pBaseRadZ != NULL) { - RadAccessData.pBaseRadZ = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadZ); + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadZ); } - srTRadGenManip *local_copy = (srTRadGenManip*)AuxGpu::ToDevice(pGpuUsage, this, sizeof(srTRadGenManip)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, local_copy); + srTRadGenManip *local_copy = (srTRadGenManip*)CAuxGPU::ToDevice(pGPU, this, sizeof(srTRadGenManip)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, local_copy); - arAuxInt = (double*)AuxGpu::ToDevice(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, arAuxInt); + arAuxInt = (double*)CAuxGPU::ToDevice(pGPU, arAuxInt, RadAccessData.ne*sizeof(double)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, arAuxInt); bool allStokesReq = (RadExtract.PolarizCompon == -5); bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (RadAccessData.ne > 1); @@ -253,23 +251,23 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtra else ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); - AuxGpu::ToHostAndFree(pGpuUsage, local_copy, sizeof(srTRadGenManip), true); - 
AuxGpu::ToHostAndFree(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double), true); - AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadX, true, false); - AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadZ, true, false); + CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(srTRadGenManip), true); + CAuxGPU::ToHostAndFree(pGPU, arAuxInt, RadAccessData.ne*sizeof(double), true); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); #ifndef _DEBUG if (RadAccessData.pBaseRadX != NULL) - RadAccessData.pBaseRadX = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadX); + RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX); if (RadAccessData.pBaseRadZ != NULL) - RadAccessData.pBaseRadZ = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadZ); + RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ); #endif #ifdef _DEBUG if (RadAccessData.pBaseRadX != NULL) - RadAccessData.pBaseRadX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadX, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); if (RadAccessData.pBaseRadZ != NULL) - RadAccessData.pBaseRadZ = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadZ, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); cudaStreamSynchronize(0); auto err = cudaGetLastError(); printf("%s\r\n", cudaGetErrorString(err)); @@ -437,19 +435,21 @@ __global__ void ExtractSingleElecMutualIntensityVsXZ_Kernel(const float* __restr } template -int 
ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* pMI0, long nx, long nz, long ne, long itStart, long itEnd, long PerX, long iter, bool EhOK, bool EvOK, TGPUUsageArg* pGPU) { + long long nxnz = ((long long)nx) * ((long long)nz); + const int itPerBlk = 1; dim3 threads = dim3(48, 16, 1); dim3 grid = dim3((nxnz + 1) / threads.x + (threads.x > 1), (nxnz / 2) / (threads.y * itPerBlk) + (threads.y > 1), 1); - pEx = (float*)AuxGpu::ToDevice(pGpuUsage, pEx, nxnz * 2 * sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEx); + pEx = (float*)CAuxGPU::ToDevice(pGPU, pEx, nxnz*2*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEx); - pEz = (float*)AuxGpu::ToDevice(pGpuUsage, pEz, nxnz * 2 * sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEz); + pEz = (float*)CAuxGPU::ToDevice(pGPU, pEz, nxnz*2*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEz); - pMI0 = (float*)AuxGpu::ToDevice(pGpuUsage, pMI0, (itEnd - itStart) * nxnz * 2 * sizeof(float)); + pMI0 = (float*)CAuxGPU::ToDevice(pGPU, pMI0, (itEnd - itStart)*nxnz*2*sizeof(float)); if (EhOK) { @@ -462,14 +462,14 @@ int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* p else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); } - pEx = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEx, nxnz * 2 * sizeof(float), true); - pEz = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEz, nxnz * 2 * sizeof(float), true); + pEx = (float*)CAuxGPU::ToHostAndFree(pGPU, pEx, nxnz * 2 * sizeof(float), true); + pEz = (float*)CAuxGPU::ToHostAndFree(pGPU, pEz, nxnz * 2 * sizeof(float), true); - AuxGpu::MarkUpdated(pGpuUsage, pMI0, true, false); + CAuxGPU::MarkUpdated(pGPU, pMI0, true, false); #ifdef _DEBUG if (pMI0 != NULL) - pMI0 = 
(float*)AuxGpu::ToHostAndFree(pGpuUsage, pMI0, (itEnd - itStart) * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * 2 * sizeof(float)); + pMI0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pMI0, (itEnd - itStart)*ne*nx*nz*2*sizeof(float)); cudaStreamSynchronize(0); auto err = cudaGetLastError(); @@ -478,40 +478,40 @@ int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* p return 0; } -int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ_GPU(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, int PolCom, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ_GPU(float* pEx, float* pEz, float* pMI0, long nx, long nz, long ne, long itStart, long itEnd, long PerX, long iter, int PolCom, bool EhOK, bool EvOK, TGPUUsageArg* pGPU) { if (iter > 0) { switch (PolCom) { - case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, 
PerX, iter, EhOK, EvOK, pGpuUsage); - case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 1>(pEx, pEz, pMI0, nx, nz, 
ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); } } else if (iter == 0) { switch (PolCom) { - case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 2: return 
ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); } } } diff --git a/cpp/src/core/srradstr.cpp b/cpp/src/core/srradstr.cpp index d2571a41..f7c7a87d 100644 --- a/cpp/src/core/srradstr.cpp +++ b/cpp/src/core/srradstr.cpp @@ -2700,7 +2700,8 @@ void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin() //************************************************************************* -void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) +//void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) +void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz, void* pvGPU) //HG02122023 {// sx < 0 means mirroring should be done vs x // sz < 0 means mirroring should be done vs z //long PerX = ne << 1; @@ -2711,6 +2712,14 @@ void 
srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) float *pEX0 = pBaseRadX; float *pEZ0 = pBaseRadZ; +#ifdef _OFFLOAD_GPU //HG02122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + MirrorFieldData_GPU(sx, sz, (TGPUUsageArg*)pvGPU); + return; + } +#endif + if((sx > 0) && (sz > 0)) return; //no mirroring is necessary else if((sx < 0) && (sz > 0)) //mirroring with respect to x { diff --git a/cpp/src/core/srradstr.h b/cpp/src/core/srradstr.h index def0ffb0..dee50e5f 100644 --- a/cpp/src/core/srradstr.h +++ b/cpp/src/core/srradstr.h @@ -33,7 +33,7 @@ #endif #ifdef _OFFLOAD_GPU //OC28072023 -#include "auxgpu.h" //HG +#include "auxgpu.h" //HG04122023 #endif #include "srobject.h" @@ -520,14 +520,15 @@ class srTSRWRadStructAccessData : public CGenObject { // return; //} - if(pvGPU != 0) - { + //if(pvGPU != 0) //HG02122023 Null check is already done by CAuxGPU::GPUEnabled + //{ TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; if(CAuxGPU::GPUEnabled(pGPU)) { MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + return; //HG02122023 } - } + //} #endif float *tEx = pBaseRadX; diff --git a/cpp/src/core/srradstr_gpu.cu b/cpp/src/core/srradstr_gpu.cu index 5890cbf8..658d39e0 100644 --- a/cpp/src/core/srradstr_gpu.cu +++ b/cpp/src/core/srradstr_gpu.cu @@ -21,7 +21,7 @@ #include #include "srradstr.h" -__global__ void MultiplyElFieldByPhaseLin_Kernel(double xMult, double zMult, float* pBaseRadX, float* pBaseRadZ, int nz, int nx, int ne, float zStart, float zStep, float xStart, float xStep) { +__global__ void MultiplyElFieldByPhaseLin_Kernel(double xMult, double zMult, float* pBaseRadX, float* pBaseRadZ, int nx, int nz, int ne, float xStart, float zStart, float xStep, float zStep) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range @@ -82,7 +82,7 @@ void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, doub const int bs = 256; dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); dim3 
threads(bs, 1); - MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, (float)zStart, (float)zStep, (float)xStart, (float)xStep); + MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nx, nz, ne, (float)xStart, (float)zStart, (float)xStep, (float)zStep); //MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, zStart, zStep, xStart, xStep); if (pBaseRadX != NULL) @@ -105,7 +105,7 @@ void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, doub #endif } -template __global__ void MirrorFieldData_Kernel(long ne, long nx, long nz, float* pEX0, float* pEZ0) { +template __global__ void MirrorFieldData_Kernel(long nx, long nz, long ne, float* pEX0, float* pEZ0) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range @@ -301,11 +301,11 @@ void srTSRWRadStructAccessData::MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg if ((sx > 0) && (sz > 0)) return; else if ((sx < 0) && (sz > 0)) - MirrorFieldData_Kernel<0> <<>>(ne, nx, nz, pEX0, pEZ0); + MirrorFieldData_Kernel<0> <<>>(nx, nz, ne, pEX0, pEZ0); else if ((sx > 0) && (sz < 0)) - MirrorFieldData_Kernel<1> <<>> (ne, nx, nz, pEX0, pEZ0); + MirrorFieldData_Kernel<1> <<>> (nx, nz, ne, pEX0, pEZ0); else - MirrorFieldData_Kernel<2> <<>> (ne, nx, nz, pEX0, pEZ0); + MirrorFieldData_Kernel<2> <<>> (nx, nz, ne, pEX0, pEZ0); if (pEX0 != NULL) CAuxGPU::MarkUpdated(pGPU, pEX0, true, false); //OC03082023 diff --git a/cpp/src/core/srstraux.h b/cpp/src/core/srstraux.h index ad7dec02..49aff638 100644 --- a/cpp/src/core/srstraux.h +++ b/cpp/src/core/srstraux.h @@ -203,6 +203,9 @@ struct srTStokesC { struct srTEFieldPtrs { float *pExRe, *pExIm, *pEzRe, *pEzIm; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTEFieldPtrs(float* In_pExRe =0, float* In_pExIm =0, float* In_pEzRe =0, float* In_pEzIm =0) { pExRe = In_pExRe; pExIm = In_pExIm; pEzRe = In_pEzRe; pEzIm = 
In_pEzIm; @@ -1588,6 +1591,9 @@ struct srTInterpolAux01 { double cAx2z0, cAx2z1, cAx2z2, cAx2z3, cAx3z0, cAx3z1, cAx3z2, cAx3z3; double cLAx1z0, cLAx0z1, cLAx1z1; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTInterpolAux01() { cAx0z1 = 0.1666666667; @@ -1654,10 +1660,18 @@ struct srTInterpolAuxF { float f03, f13, f23, f33; float fAvg, fNorm; + +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void SetUpAvg() { fAvg = (float)(0.0625*(f00 + f10 + f20 + f30 + f01 + f11 + f21 + f31 + f02 + f12 + f22 + f32 + f03 + f13 + f23 + f33)); } + +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void NormalizeByAvg() { const float CritNorm = 1.; @@ -1724,11 +1738,17 @@ struct srTDataPtrsForWfrEdgeCorr { double dxSt, dxFi, dzSt, dzFi, dx, dz; char WasSetup; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTDataPtrsForWfrEdgeCorr() { InitializeAll(); } +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void InitializeAll() { ExpArrXSt = ExpArrXFi = 0; @@ -1747,6 +1767,9 @@ struct srTDataPtrsForWfrEdgeCorr { } WasSetup = 0; } +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void DisposeData() { if(ExpArrXSt != 0) delete[] ExpArrXSt; diff --git a/cpp/src/ext/genmath/gmfft.cpp b/cpp/src/ext/genmath/gmfft.cpp index dbea0340..3c242adc 100644 --- a/cpp/src/ext/genmath/gmfft.cpp +++ b/cpp/src/ext/genmath/gmfft.cpp @@ -132,6 +132,23 @@ long CGenMathFFT::LenGoodNum1000s = 11; long CGenMathFFT::GoodNum10000s[] = { 0,479,636,743,830,900,960,1017,1064,1109,1150 }; long CGenMathFFT::LenGoodNum10000s = 11; +#ifdef _OFFLOAD_GPU +long CGenMathFFT1D::PlanLen; +long CGenMathFFT1D::dPlanLen; +long CGenMathFFT1D::HowMany; +long CGenMathFFT1D::dHowMany; +cufftHandle CGenMathFFT1D::Plan1DFFT_cu; +cufftHandle CGenMathFFT1D::dPlan1DFFT_cu; +#endif + +#ifdef _OFFLOAD_GPU +long CGenMathFFT2D::PlanNx; +long CGenMathFFT2D::PlanNy; +long CGenMathFFT2D::dPlanNx; +long CGenMathFFT2D::dPlanNy; +cufftHandle CGenMathFFT2D::Plan2DFFT_cu; +cufftHandle 
CGenMathFFT2D::dPlan2DFFT_cu; +#endif //************************************************************************* void CGenMathFFT::NextCorrectNumberForFFT(long& n) @@ -206,22 +223,38 @@ void CGenMathFFT::NextCorrectNumberForFFT(long& n) } //************************************************************************* - -int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG18072022 +int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 { - //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; - long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); - float* AuxDataCont = new float[TotAmOfPo]; - if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; - FFT1DInfo.pOutData = AuxDataCont; +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, + { + //HG03082022 GPU can do an inplace fft without being given a temporary buffer + FFT1DInfo.pOutData = FFT1DInfo.pInData; + int result; + if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 + //if (result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; + }//) + else +#endif + { + //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; + long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); + float* AuxDataCont = new float[TotAmOfPo]; + if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; + FFT1DInfo.pOutData = AuxDataCont; - int result; - if(result = Make1DFFT(FFT1DInfo)) return result; + int result; + if(result = Make1DFFT(FFT1DInfo)) return result; - float *tOut = FFT1DInfo.pInData, *t = AuxDataCont; - for(int ix=0; ix 0)? 
-1 : 1; @@ -345,164 +408,471 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //if(NeedsShiftBeforeY) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep); if(NeedsShiftBeforeX) {//OC02022019 - if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); + if(m_ArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); } if(NeedsShiftBeforeY) {//OC02022019 - if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); + if(m_ArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); } - if(NeedsShiftBeforeX || NeedsShiftBeforeY) + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 { - if(DataToFFT != 0) TreatShifts(DataToFFT); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + { + //GPU_COND(pvGPU, { //OC06092023 + //GPU_COND(pGpuUsage, { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = 
(float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, NeedsShiftBeforeX, NeedsShiftBeforeY, m_ArrayShiftX, m_ArrayShiftY); + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, 
NeedsShiftBeforeX, NeedsShiftBeforeY, m_dArrayShiftX, m_dArrayShiftY); + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }//) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif + } } - if(FFT2DInfo.Dir > 0) + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; + double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 + if (FFT2DInfo.Dir > 0) { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 { - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT, DataToFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) + { + if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL) + { + 
cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; - fftwf_execute(Plan2DFFT); - } - else if(dDataToFFT != 0) + auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); +// if (res != CUFFT_SUCCESS) +// printf("CUFFT Error: %d\r\n", res); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) + { + if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL) + { + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); + } + }//) + else +#endif { - if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT, dDataToFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - 
if(dPlan2DFFT == 0) return ERROR_IN_FFT; + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#if _FFTW3 //OC28012019 - fftw_execute(dPlan2DFFT); - } + for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + + fftw_execute(dPlan2DFFT); + } + } #else - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif + } - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 { - RepairSignAfter2DFFT(DataToFFT); - RotateDataAfter2DFFT(DataToFFT); - } + if (DataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + 
//RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, (float)Mult); //OC06092023 + RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, (float)Mult); //OC06092023 //HG04122023 + } + else if (dDataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); //HG04122023 + } + alreadyNormalized = true; + }//) + else +#endif + { + if (DataToFFT != 0) + { + RepairSignAfter2DFFT(DataToFFT); + RotateDataAfter2DFFT(DataToFFT); + } #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) - { - RepairSignAfter2DFFT(dDataToFFT); - RotateDataAfter2DFFT(dDataToFFT); - } + else if (dDataToFFT != 0) + { + RepairSignAfter2DFFT(dDataToFFT); + RotateDataAfter2DFFT(dDataToFFT); + } #endif + } } else { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) { + if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL){ + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + //HowMany = FFT2DInfo.howMany; //HG04122023 (Commented out) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; 
+ cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); //HG04122023 + RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); + cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) { + if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL){ + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + //dHowMany = FFT2DInfo.howMany; //HG04122023 (Commented out) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny); + RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny); + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, 
(cufftDoubleComplex*)dDataToFFT, CUFFT_INVERSE); + } + }//) + else +#endif { - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT, DataToFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#ifdef _FFTW3 //OC28012019 + for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(DataToFFT); + RepairSignAfter2DFFT(DataToFFT); + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(dDataToFFT); + RepairSignAfter2DFFT(dDataToFFT); + fftw_execute(dPlan2DFFT); + } + } +#else + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(DataToFFT); RepairSignAfter2DFFT(DataToFFT); - fftwf_execute(Plan2DFFT); - } - else if(dDataToFFT != 0) - { - if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT, dDataToFFT, FFTW_BACKWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if(dPlan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(dDataToFFT); - RepairSignAfter2DFFT(dDataToFFT); - fftw_execute(dPlan2DFFT); - } -#else - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - else Plan2DFFT = 
*pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(DataToFFT); - RepairSignAfter2DFFT(DataToFFT); - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif + } } - //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; - double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep*FFT2DInfo.ExtraMult; //OC20112017 - - if(DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); + if (!alreadyNormalized){ +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + //if (DataToFFT != 0) + // NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + //else if (dDataToFFT != 0) + // NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + if (DataToFFT != 0) //HG04122023 + NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, Mult); + else if (dDataToFFT != 0) + NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); + }//) + else +#endif + { + if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); + else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); #endif + } + } //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); - if(NeedsShiftAfterX) + + if (NeedsShiftAfterX) {//OC02022019 - if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); + if (m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult 
* x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); } - if(NeedsShiftAfterY) + if (NeedsShiftAfterY) {//OC02022019 - if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); + if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); + else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); } - if(NeedsShiftAfterX || NeedsShiftAfterY) + if (NeedsShiftAfterX || NeedsShiftAfterY) { - if(DataToFFT != 0) TreatShifts(DataToFFT); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + //TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); //HG04122023 + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + 
m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + //TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); //HG04122023 + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }//) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT); 
//OC02022019 #endif + } } //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") //fftwnd_destroy_plan(Plan2DFFT); //OC27102018 //SY: adopted for OpenMP - -#ifdef _FFTW3 //OC28012019 - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 { - if(pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); - } - else if(dDataToFFT != 0) //OC03022019 + if (FFT2DInfo.pData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 + //CAuxGPU::MarkUpdated(pGpuUsage, DataToFFT, true, false); + } + else if (FFT2DInfo.pdData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 + //CAuxGPU::MarkUpdated(pGpuUsage, dDataToFFT, true, false); + } + }//) + else +#endif { - if(pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); - } +#if _FFTW3 //OC28012019 + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); + } + else if (dDataToFFT != 0) //OC03022019 + { + if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); + } #else - if(pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); + if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); #endif + } //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} - if(m_ArrayShiftX != 0) { delete[] m_ArrayShiftX; m_ArrayShiftX = 0;} - if(m_ArrayShiftY != 0) { delete[] m_ArrayShiftY; m_ArrayShiftY = 0;} - if(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX; m_dArrayShiftX = 0;} //OC02022019 - if(m_dArrayShiftY != 0) { delete[] m_dArrayShiftY; m_dArrayShiftY = 0;} - + if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} + if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} + if (m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} 
//OC02022019 + if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} + return 0; } //************************************************************************* //Forward FFT: Int f(x)*exp(-i*2*Pi*qx*x)dx //Backward FFT: Int f(qx)*exp(i*2*Pi*qx*x)dqx -int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG20012022 +int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC05092023 {// Assumes Nx, Ny even ! //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //double start; @@ -529,260 +899,460 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) m_ArrayShiftX = 0; m_dArrayShiftX = 0; - if(NeedsShiftBeforeX || NeedsShiftAfterX) + if (NeedsShiftBeforeX || NeedsShiftAfterX) { - if(FFT1DInfo.pInData != 0) + if (FFT1DInfo.pInData != 0) { m_ArrayShiftX = new float[Nx << 1]; - if(m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) 
+ m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 +#endif } - else if(FFT1DInfo.pdInData != 0) + else if (FFT1DInfo.pdInData != 0) { m_dArrayShiftX = new double[Nx << 1]; - if(m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 + m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //HG20012022 +#endif } } #ifdef _FFTW3 //OC28012019 fftwf_plan Plan1DFFT; - fftwf_complex *DataToFFT=0, *OutDataFFT=0; //, *pOutDataFFT=0; + fftwf_complex* DataToFFT = 0, * OutDataFFT = 0; //, *pOutDataFFT=0; fftw_plan dPlan1DFFT; - fftw_complex *dDataToFFT=0, *dOutDataFFT=0; //, *pdOutDataFFT=0; + fftw_complex* dDataToFFT = 0, * dOutDataFFT = 0; //, *pdOutDataFFT=0; +#endif - if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) +//HG20012022 +//#ifdef _DEBUG +// if (pGpuUsage != NULL) +// printf ("GPU: Make1DFFT\n"); +//#endif +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG20012022 { - DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); - OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); - //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call - } - else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 + OutDataFFT = 
(fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + //DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); + //OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); //OC06092023 + dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + //dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); + //dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + } + }//) + else +#endif { - dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); - dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); - //pdOutDataFFT = dOutDataFFT; - } +#ifdef _FFTW3 //OC28012019 + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); + OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); + //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); + dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); + //pdOutDataFFT = dOutDataFFT; + } #else - fftw_plan Plan1DFFT; - FFTW_COMPLEX *DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); - FFTW_COMPLEX *OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); - FFTW_COMPLEX *pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call -/** - 
Pointed-out by Sergey Yakubov (E-XFEL). - From FFTW 2.1.5 docs: - void fftw(fftw_plan plan, int howmany, - fftw_complex *in, int istride, int idist, - fftw_complex *out, int ostride, int odist); - ... - out, ostride and odist describe the output array(s). The format is the same as for the input array. - In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. - If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, - that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. - In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). -**/ -#endif - - char t0SignMult = (FFT1DInfo.Dir > 0)? -1 : 1; - if(NeedsShiftBeforeX) + fftw_plan Plan1DFFT; + FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); + FFTW_COMPLEX* OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); + FFTW_COMPLEX* pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + /** + Pointed-out by Sergey Yakubov (E-XFEL). + From FFTW 2.1.5 docs: + void fftw(fftw_plan plan, int howmany, + fftw_complex *in, int istride, int idist, + fftw_complex *out, int ostride, int odist); + ... + out, ostride and odist describe the output array(s). The format is the same as for the input array. + In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. + If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, + that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. + In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). 
+ **/ +#endif + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + char t0SignMult = (FFT1DInfo.Dir > 0) ? -1 : 1; + if (NeedsShiftBeforeX) { - //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); - if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); - if(DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); + if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + }//) + else +#endif + { + //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); + if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); + + if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); + else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); #endif + } } 
//Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : before fft",&start); int flags = FFTW_ESTIMATE; //OC30012019 + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT1DInfo.xStep; + double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; - if(FFT1DInfo.Dir > 0) + if (FFT1DInfo.Dir > 0) //HG17112021 { - //int flags = FFTW_ESTIMATE; +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + int arN[] = { (int)Nx }; //OC14052020 + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) { + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + dPlanLen = Nx; + cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); + } + }//) + else +#endif + { + //int flags = FFTW_ESTIMATE; #ifdef _FFTW3 //OC28012019 #ifdef _WITH_OMP //Still needs to be tested! 
- if(DataToFFT != 0) - { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if(dDataToFFT != 0) //OC02022019 - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } + if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) //OC02022019 + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } #endif //ifndef _WITH_OMP - - int arN[] = {(int)Nx}; //OC14052020 - //int arN[] = {Nx}; - if(DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 - if(Plan1DFFT == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if(dDataToFFT != 0) //OC02022019 - { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - if(dPlan1DFFT == 0) return ERROR_IN_FFT; - fftw_execute(dPlan1DFFT); - } + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 + if (Plan1DFFT 
== 0) return ERROR_IN_FFT; + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + fftw_execute(dPlan1DFFT); + } #else //ifndef _FFTW3 - if(DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); - if(Plan1DFFT == 0) return ERROR_IN_FFT; + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); #ifndef _WITH_OMP //OC27102018 //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) - #pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for(int i=0; i0",&start); - if(OutDataFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 { - RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - } - -#ifdef _FFTW3 //OC27022019 - else if(dOutDataFFT != 0) + if (OutDataFFT != 0) + { + 
RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 + //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + } + else if (dOutDataFFT != 0) + { + RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + } + alreadyNormalized = true; + }//) + else +#endif { - RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - } + if (OutDataFFT != 0) + { + RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + } +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) + { + RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + } #endif + } } else { //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) -#ifdef _FFTW3 //OC28012019 -#ifdef _WITH_OMP - - //Still needs to be tested! 
- if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if(dDataToFFT != 0) - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } - -#endif - - int arN[] = {(int)Nx}; //OC14052020 - //int arN[] = {Nx}; - if(DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 - if(Plan1DFFT == 0) return ERROR_IN_FFT; + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + HowMany = FFT1DInfo.HowMany; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) + { + dPlanLen = Nx; + dHowMany = FFT1DInfo.HowMany; + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } 
+ cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if(dDataToFFT != 0) //OC02022019 + RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_INVERSE); + } + }//) + else +#endif { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - if(dPlan1DFFT == 0) return ERROR_IN_FFT; +#ifdef _FFTW3 //OC28012019 +#ifdef _WITH_OMP - RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + //Still needs to be tested! + if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } - fftw_execute(dPlan1DFFT); - } +#endif + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 + if (Plan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, 
FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + fftw_execute(dPlan1DFFT); + } #else //ifndef _FFTW3 - if(DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); - if(Plan1DFFT == 0) return ERROR_IN_FFT; + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : repair dir<0",&start); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : repair dir<0",&start); #ifndef _WITH_OMP //OC27102018 //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) - #pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for(int i=0; i #include #include +#include "gmfft.h" #define 
GMFFT_BLOCK_SIZE 256 @@ -148,7 +149,7 @@ template __global__ void TreatShift_Kernel(T* pData, long HowMany, } } -void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -162,7 +163,7 @@ void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) //#endif } -void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -176,7 +177,7 @@ void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) //#endif } -void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult) +void CGenMathFFT1D::RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult) { //#ifdef _DEBUG @@ -197,7 +198,7 @@ void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, //#endif } -void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult) +void CGenMathFFT1D::NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -212,7 +213,7 @@ void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double //#endif } -void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) +void CGenMathFFT1D::FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -226,7 +227,7 @@ void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) //#endif } -void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX) +void CGenMathFFT1D::TreatShift_GPU(float* pData, long 
HowMany, long Nx, float* tShiftX) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -240,7 +241,7 @@ void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX) //#endif } -void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -254,7 +255,7 @@ void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) //#endif } -void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -268,7 +269,7 @@ void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) //#endif } -void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +void CGenMathFFT1D::RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -283,7 +284,7 @@ void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, //#endif } -void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +void CGenMathFFT1D::NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -297,7 +298,7 @@ void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, doubl //#endif } -void FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) +void CGenMathFFT1D::FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -311,7 +312,7 @@ void 
FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) //#endif } -void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) +void CGenMathFFT1D::TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -326,7 +327,7 @@ void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) } -template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, long Nx, long Ny, long Nx2Ny2, long howMany) +template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, long Nx, long Ny) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //Nx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range @@ -337,15 +338,12 @@ template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, if (ix < Nx && iy < Ny) { - for (long i=0; i __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany) +template __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range @@ -353,32 +351,29 @@ template __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, if (ix < HalfNx && iy < HalfNy) { int idx = (ix + iy * Nx) * 2; - for (long i=0; i __global__ void RepairSignAndRotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany, T2 Mult) +template __global__ void RepairSignAndRotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, T2 Mult) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range @@ -396,52 +391,47 @@ template __global__ void RepairSignAndRotateDataAfter2 float s4 = sx0 * sy1 * Mult; int idx = (ix + iy * Nx); - for (long i=0; i 
__global__ void NormalizeDataAfter2DFFT_Kernel(T* pAfterFFT, long Nx2Ny2, long howMany, long n, T Mult) +template __global__ void NormalizeDataAfter2DFFT_Kernel(T* pAfterFFT, long Nx2Ny2, long n, T Mult) { int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range if (ix < Nx2Ny2) { - for (long i=0; i __global__ void TreatShift2D_Kernel(T* pData, long HowMany, long Nx2, long Ny, T* tShiftX, T* tShiftY) +template __global__ void TreatShift2D_Kernel(T* pData, long Nx2, long Ny, T* tShiftX, T* tShiftY) { int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range @@ -484,221 +474,100 @@ template __global__ void TreatS MultIm = MultY_Im; } - for (long k=0; k << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny); } -void RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany) +void CGenMathFFT2D::RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny); } -void RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, float Mult) +void CGenMathFFT2D::RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, float Mult) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((float2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((float2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Mult); } -void NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +void 
CGenMathFFT2D::NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, double Mult) { dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); dim3 threads(GMFFT_BLOCK_SIZE, 1); - NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, (float)Mult); //OC06092023 + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, 1, (float)Mult); //OC06092023 //NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); } -void TreatShifts2D_GPU(float* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY) +void CGenMathFFT2D::TreatShifts2D_GPU(float* pData, long Nx, long Ny, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY) { dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); } -void RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +void CGenMathFFT2D::RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, 
Ny); } -void RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +void CGenMathFFT2D::RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny); } -void RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +void CGenMathFFT2D::RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, double Mult) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((double2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((double2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Mult); } -void NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +void CGenMathFFT2D::NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, double Mult) { dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); dim3 threads(GMFFT_BLOCK_SIZE, 1); - NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2,1, Mult); } -void TreatShifts2D_GPU(double* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY) +void CGenMathFFT2D::TreatShifts2D_GPU(double* pData, long Nx, long Ny, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY) { dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - if (NeedsShiftX && NeedsShiftY) 
TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); -} - -//OC06092023: looks like place if wrong here for this function, why all these functions are programmed without classes? -template __global__ void StokesAvgUpdateInterp_Kernel(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, T mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, T yStartMeshRes, T yStepMeshRes, T yStartWfr, T yStepWfr, T xStartMeshRes, T xStepMeshRes, T xStartWfr, T xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) -{ - int ix = (blockIdx.x * blockDim.x + threadIdx.x); //xNpMeshRes range - int iy = (blockIdx.y * blockDim.y + threadIdx.y); //yNpMeshRes range - int ie = (blockIdx.z * blockDim.z + threadIdx.z); //eNpMeshRes range - - if (ix >= xNpMeshRes) - return; - if (iy >= yNpMeshRes) - return; - if (ie >= eNpMeshRes) - return; - - long ir = iSt * yNpMeshRes * xNpMeshRes * eNpMeshRes + iy * xNpMeshRes * eNpMeshRes + ix * eNpMeshRes + ie; - - auto yMeshRes = yStartMeshRes + iy * yStepMeshRes; - auto xMeshRes = xStartMeshRes + ix * xStepMeshRes; - T fInterp = 0; - int loc_ix_ofst = iOfstSt + ie; - auto nx_ix_per = xNpWfr * eNpWfr; - - switch (nOrder) - { - case 1: - { - int ix0 = (int)trunc((xMeshRes - xStartWfr) / xStepWfr + 1e-09); - if ((ix0 < 0) | (ix0 >= xNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); - return; - } - int ix1 = ix0 + 1; - auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; - int iy0 = (int)trunc((yMeshRes - yStartWfr) / yStepWfr + 1e-09); - if ((iy0 < 0) | (iy0 >= yNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); - return; - } - - - int iy1 = iy0 + 1; - auto ty = (yMeshRes - (yStartWfr + 
yStepWfr * iy0)) / yStepWfr; - auto iy0_nx_ix_per = iy0 * nx_ix_per; - auto iy1_nx_ix_per = iy1 * nx_ix_per; - auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; - auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; - auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto a10 = f10 - a00; - auto a01 = f01 - a00; - auto a11 = a00 - f01 - f10 + f11; - fInterp = a00 + tx * (a10 + ty * a11) + ty * a01; - } - break; - case 2: - { - int ix0 = int(round((xMeshRes - xStartWfr) / xStepWfr)); - if ((ix0 < 0) || (ix0 >= xNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (float)(nIters + 1); - ir += 1; - return; - } - int ixm1 = ix0 - 1; - int ix1 = ix0 + 1; - auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; - int iy0 = int(round((yMeshRes - yStartWfr) / yStepWfr)); - if ((iy0 < 0) || (iy0 >= yNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); - ir += 1; - return; - } - int iym1 = iy0 - 1; - int iy1 = iy0 + 1; - auto ty = (yMeshRes - (yStartWfr + yStepWfr * iy0)) / yStepWfr; - auto iym1_nx_ix_per = iym1 * nx_ix_per; - auto iy0_nx_ix_per = iy0 * nx_ix_per; - auto iy1_nx_ix_per = iy1 * nx_ix_per; - auto ixm1_ix_per_p_ix_ofst = ixm1 * eNpWfr + loc_ix_ofst; - auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; - auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; - auto fm10 = pMoreStokesArS[iy0_nx_ix_per + ixm1_ix_per_p_ix_ofst]; - auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto f0m1 = pMoreStokesArS[iym1_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto a10 = 0.5 * (f10 - fm10); - auto 
a01 = 0.5 * (f01 - f0m1); - auto a11 = a00 - f01 - f10 + f11; - auto a20 = 0.5 * (f10 + fm10) - a00; - auto a02 = 0.5 * (f01 + f0m1) - a00; - fInterp = a00 + tx * (a10 + tx * a20 + ty * a11) + ty * (a01 + ty * a02); - } - break; - } - - if (sum) pStokesArS[ir] += mult * fInterp; - else pStokesArS[ir] = (pStokesArS[ir] * nIters + mult * fInterp) / (nIters + 1); - return; -} - -//OC06092023: looks like place if wrong here for this function, why all these functions are programmed without classes? -void StokesAvgUpdateInterp(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, double mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, double yStartMeshRes, double yStepMeshRes, double yStartWfr, double yStepWfr, double xStartMeshRes, double xStepMeshRes, double xStartWfr, double xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) -{ - const int bs = 8; - dim3 threads(xNpMeshRes / bs + ((xNpMeshRes & (bs - 1)) != 0), yNpMeshRes / bs + ((yNpMeshRes & (bs - 1)) != 0), eNpMeshRes); - dim3 blocks(bs, bs, 1); - //OC06092023 (check order of variables, loop over e) - StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, (float)mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, (float)yStartMeshRes, (float)yStepMeshRes, (float)yStartWfr, (float)yStepWfr, (float)xStartMeshRes, (float)xStepMeshRes, (float)xStartWfr, (float)xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); - //StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, yStartMeshRes, yStepMeshRes, yStartWfr, yStepWfr, xStartMeshRes, xStepMeshRes, xStartWfr, xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) 
TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); } #endif \ No newline at end of file diff --git a/cpp/src/ext/genmath/gmmeth.h b/cpp/src/ext/genmath/gmmeth.h index 6388ae26..619a7d01 100644 --- a/cpp/src/ext/genmath/gmmeth.h +++ b/cpp/src/ext/genmath/gmmeth.h @@ -18,6 +18,10 @@ #include "gmobj.h" #endif +#ifdef _OFFLOAD_GPU //HG04122023 +#include "auxgpu.h" +#endif + #include "gmvect.h" #include #include @@ -163,7 +167,11 @@ class CGenMathMeth //static double Integ1D_FuncDefByArray(double* FuncArr, long Np, double Step); //static double Integ1D_FuncDefByArray(float* FuncArr, long Np, double Step); //template static double Integ1D_FuncDefByArray(T* FuncArr, long Np, double Step) +#ifdef _OFFLOAD_GPU //HG04122023 + template GPU_PORTABLE static double Integ1D_FuncDefByArray(T* FuncArr, long long Np, double Step) +#else template static double Integ1D_FuncDefByArray(T* FuncArr, long long Np, double Step) +#endif { if((FuncArr == 0) || (Np < 2) || (Step == 0)) return 0; //if(Np == 2) return (double)(0.5*(FuncArr[0] + FuncArr[1])); diff --git a/cpp/src/ext/utils/utidev.cpp b/cpp/src/ext/utils/utidev.cpp deleted file mode 100644 index 3a2057f1..00000000 --- a/cpp/src/ext/utils/utidev.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/************************************************************************//** - * File: utidev.cpp - * Description: Auxiliary utilities to support GPU management - * - * @author H.Goel - * @version 0.1 - ***************************************************************************/ - -#include -#include -#include - -#ifdef _OFFLOAD_GPU -#include -#endif - -#include "utidev.h" - -static bool isGPUAvailable = false; -static bool isGPUEnabled = false; -static bool GPUAvailabilityTested = false; -static bool deviceOffloadInitialized = false; - -static void CheckGPUAvailability() -{ -#ifdef _OFFLOAD_GPU - if (!GPUAvailabilityTested) - { - isGPUAvailable = false; - GPUAvailabilityTested = true; - int deviceCount = 0; - if 
(cudaGetDeviceCount(&deviceCount) != cudaSuccess) - return; - - if (deviceCount < 1) - return; - - isGPUAvailable = true; - } -#else - isGPUAvailable = false; - isGPUEnabled = false; - GPUAvailabilityTested = true; -#endif -} - -bool UtiDev::GPUAvailable() -{ - CheckGPUAvailability(); - return isGPUAvailable; -} - -bool UtiDev::GPUEnabled(gpuUsageArg_t *arg) -{ -#ifdef _OFFLOAD_GPU - if (arg == NULL) - return false; - if (*arg > 0) { - //if (cudaSetDevice(*arg - 1) != cudaSuccess) return false; - return GPUAvailable(); - } -#endif - return false; -} - -void UtiDev::SetGPUStatus(bool enabled) -{ - isGPUEnabled = enabled && GPUAvailable(); -} - -int UtiDev::GetDevice(gpuUsageArg_t* arg) -{ -#ifdef _OFFLOAD_GPU - if (arg == NULL) - return cudaCpuDeviceId; - - int curDevice = 0; - cudaGetDevice(&curDevice); - return curDevice; -#else - return 0; -#endif -} - -void UtiDev::Init() { - deviceOffloadInitialized = true; -#ifdef _OFFLOAD_GPU - cudaDeviceSynchronize(); -#endif -} - -void UtiDev::Fini() { -#ifdef _OFFLOAD_GPU - cudaDeviceSynchronize(); -#endif - //deviceOffloadInitialized = false; -} \ No newline at end of file diff --git a/cpp/src/ext/utils/utidev.h b/cpp/src/ext/utils/utidev.h deleted file mode 100644 index 2df059fd..00000000 --- a/cpp/src/ext/utils/utidev.h +++ /dev/null @@ -1,71 +0,0 @@ -/************************************************************************//** - * File: utidev.h - * Description: GPU offloading detection and control - * Project: Synchrotron Radiation Workshop (and possibly others) - * First release: 2022 - * - * @author H. 
Goel - * @version 0.1 - ***************************************************************************/ - -#ifndef __UTIGPU_H -#define __UTIGPU_H - -#include -#include - -#ifdef _OFFLOAD_GPU -#include -#endif - -typedef int gpuUsageArg_t; - -#define ALLOC_ARRAY(type, size) (type *)UtiDev::malloc(sizeof(type)*(size)) -#define FREE_ARRAY(x) UtiDev::free(x); x=NULL -#define ALLOC_STRUCT(type) (type *)UtiDev::malloc(sizeof(type)) -#define FREE_STRUCT(x) UtiDev::free(x); x=NULL - -#ifdef _OFFLOAD_GPU -#define GPU_ENABLED(arg) UtiDev::GPUEnabled(arg) -#define GPU_COND(arg, code) if (GPU_ENABLED(arg)) { code } -#define GPU_PORTABLE __device__ __host__ -#else -#define GPU_COND(arg, code) if(0) { } -#define GPU_ENABLED(arg) 0 -#define GPU_PORTABLE -#endif - - //************************************************************************* -class UtiDev -{ -public: - static void Init(); - static void Fini(); - static bool GPUAvailable(); //CheckGPUAvailable etc - static bool GPUEnabled(gpuUsageArg_t *arg); - static void SetGPUStatus(bool enabled); - static int GetDevice(gpuUsageArg_t* arg); - - static inline void* malloc(size_t sz) { -#ifdef _OFFLOAD_GPU - void *ptr; - auto err = cudaMallocManaged(&ptr, sz); - if (err != cudaSuccess) - printf("Allocation Failure\r\n"); - return ptr; -#else - return std::malloc(sz); -#endif - } - - static inline void free(void* ptr) { -#ifdef _OFFLOAD_GPU - cudaFree(ptr); -#else - std::free(ptr); -#endif - } -}; - -//************************************************************************* -#endif \ No newline at end of file diff --git a/cpp/src/lib/auxgpu.cpp b/cpp/src/lib/auxgpu.cpp index 02972cd3..d65db5e0 100644 --- a/cpp/src/lib/auxgpu.cpp +++ b/cpp/src/lib/auxgpu.cpp @@ -330,6 +330,8 @@ void CAuxGPU::Init() { void CAuxGPU::Fini() { #ifdef _OFFLOAD_GPU + SetGPUStatus(false); //HG30112023 Disable GPU + // Copy back all updated data bool updated = false; bool freed = false; diff --git a/cpp/src/lib/srwlib.cpp b/cpp/src/lib/srwlib.cpp index 
5bc9a324..fac92539 100644 --- a/cpp/src/lib/srwlib.cpp +++ b/cpp/src/lib/srwlib.cpp @@ -754,7 +754,7 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr //------------------------------------------------------------------------- -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj, void* pGPU) //OC26072023 +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj, void* pvGPU) //OC26072023 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double *pMeth) //OC16122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, int *pMeth) //OC13122019 @@ -800,7 +800,8 @@ EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, cha //pFldTrj = pTrjData; } - radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat); //OC23022020 + radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat, pvGPU); //HG03122023 + //radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat); //OC23022020 //radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth); //OC13122019 //radGenManip.ExtractRadiation((int)polar, (int)intType, (int)depType, wfr.Pres, e, x, y, pInt); @@ -998,7 +999,7 @@ EXP int CALL 
srwlSetRepresElecField(SRWLWfr* pWfr, char repr) //------------------------------------------------------------------------- -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pGPU) //OC26072023 (from HG) +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pvGPU) //OC26072023 (from HG) //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt) { @@ -1019,7 +1020,8 @@ EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** //srwlPrintTime("srwlPropagElecField: CheckRadStructForPropagation",&start); //if(locErNo = optCont.PropagateRadiationGuided(wfr)) return locErNo; - if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI)) return locErNo; //OC15082018 + //if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI)) return locErNo; //OC15082018 + if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI, pvGPU)) return locErNo; //OC15082018 //HG03122023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("srwlPropagElecField: PropagateRadiationGuided",&start); @@ -1052,7 +1054,7 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* //------------------------------------------------------------------------- -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU) //OC26072023 +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pvGPU) //OC26072023 //EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) { if((pcData == 0) || (arMesh == 0) || ((typeData != 'f') && (typeData != 'd')) || (nMesh < 3) || (dir == 0)) return SRWL_INCORRECT_PARAM_FOR_FFT; //OC31012019 @@ -1098,7 +1100,8 @@ EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, FFT1DInfo.UseGivenStartTrValue = 0; CGenMathFFT1D FFT1D; - if(locErNo = FFT1D.Make1DFFT(FFT1DInfo)) return locErNo; + //if(locErNo = FFT1D.Make1DFFT(FFT1DInfo)) return locErNo; + if(locErNo = FFT1D.Make1DFFT(FFT1DInfo, pvGPU)) return locErNo; //HG03122023 arMesh[0] = FFT1DInfo.xStartTr; arMesh[1] = FFT1DInfo.xStepTr; @@ -1128,7 +1131,8 @@ EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, FFT2DInfo.UseGivenStartTrValues = 0; CGenMathFFT2D FFT2D; - if(locErNo = FFT2D.Make2DFFT(FFT2DInfo)) return locErNo; + //if(locErNo = FFT2D.Make2DFFT(FFT2DInfo)) return locErNo; + if(locErNo = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return locErNo; //HG03122023 arMesh[0] = FFT2DInfo.xStartTr; arMesh[1] = FFT2DInfo.xStepTr; @@ -1546,49 +1550,57 @@ EXP int CALL srwlPropagRadMultiE(SRWLStokes* pStokes, SRWLWfr* pWfr0, SRWLOptC* //------------------------------------------------------------------------- #ifdef _OFFLOAD_GPU //OC30102023 +EXP int CALL srwlUtiGPUProc(int op, void* pvGPU) //HG04122023 +{ + if(op == 0) CAuxGPU::Fini(); + if(op == 1) CAuxGPU::Init(); + return 0; +} +/* HG30112023 EXP bool 
CALL srwlUtiGPUAvailable() //OC27072023 -//EXP bool CALL srwlAuxGpuAvailable() //HG +//EXP bool CALL srwlCAuxGPUAvailable() //HG { return CAuxGPU::GPUAvailable(); //OC05092023 - //return AuxGpu::GPUAvailable(); + //return CAuxGPU::GPUAvailable(); } //------------------------------------------------------------------------- EXP bool CALL srwlUtiGPUEnabled() //OC27072023 -//EXP bool CALL srwlAuxGpuEnabled() //HG +//EXP bool CALL srwlCAuxGPUEnabled() //HG { return CAuxGPU::GPUEnabled(nullptr); //OC05092023 - //return AuxGpu::GPUEnabled(nullptr); + //return CAuxGPU::GPUEnabled(nullptr); } //------------------------------------------------------------------------- EXP void CALL srwlUtiGPUSetStatus(bool enable) //OC27072023 -//EXP void CALL srwlAuxGpuSetStatus(bool enable) //HG +//EXP void CALL srwlCAuxGPUSetStatus(bool enable) //HG { CAuxGPU::SetGPUStatus(enable); //OC05092023 - //AuxGpu::SetGPUStatus(enable); + //CAuxGPU::SetGPUStatus(enable); } //------------------------------------------------------------------------- EXP void CALL srwlUtiGPUInit() //OC27072023 -//EXP void CALL srwlAuxGpuInit() //HG +//EXP void CALL srwlCAuxGPUInit() //HG { CAuxGPU::Init(); //OC05092023 (why void?) - //AuxGpu::Init(); + //CAuxGPU::Init(); } //------------------------------------------------------------------------- EXP void CALL srwlUtiGPUFini() //OC27072023 -//EXP void CALL srwlAuxGpuFini() //HG +//EXP void CALL srwlCAuxGPUFini() //HG { CAuxGPU::Fini(); //OC05092023 (why void?) 
- //AuxGpu::Fini(); + //CAuxGPU::Fini(); } +*/ #endif //------------------------------------------------------------------------- diff --git a/cpp/src/lib/srwlib.h b/cpp/src/lib/srwlib.h index ff81e0ff..9b73c400 100644 --- a/cpp/src/lib/srwlib.h +++ b/cpp/src/lib/srwlib.h @@ -729,10 +729,11 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr * arMeth[18]: used for mutual intensity calculaiton / update: index of first general conjugated position to start updating the mutual intensity * arMeth[19]: used for mutual intensity calculaiton / update: index of last general conjugated position to finish updating the mutual intensity * @param [in] pFldTrj auxiliary pointer to magnetic field or trajectory of central electron + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0, void* pGPU=0); //OC26072023 (from HG) +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0, void* pvGPU=0); //OC26072023 (from HG) //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y); @@ -800,10 +801,11 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr); * "Propagates" Electric Field Wavefront through Optical Elements and free spaces * @param [in, out] pWfr pointer to pre-calculated 
Wavefront structure * @param [in] pOpt pointer to container of optical elements the propagation should be done through + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pGPU=0); //OC26072023 (from HG) +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pvGPU=0); //OC26072023 (from HG) //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt); @@ -848,10 +850,11 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* * arMesh[5]: (optional) number of points of the second argument * @param [in] nMesh length of arMesh array (3 or 6 elements) * @param [in] dir direction for the FFT (>0 means forward, <0 means backward) + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU=0); //OC26072023 (from HG) +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pvGPU=0); //OC26072023 (from HG) //EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); /** @@ -967,42 +970,55 @@ EXP int CALL srwlUtiUndFromMagFldTab(SRWLMagFldC* pUndCnt, SRWLMagFldC* pMagCnt, */ EXP int CALL srwlUtiUndFindMagFldInterpInds(int* arResInds, int* pnResInds, double* arGaps, double* arPhases, int nVals, double arPrecPar[5]); +#ifdef _OFFLOAD_GPU //HG30112023 +/** + * Implements GPU related operations. 
+ * @param [in] op operation to be performed: + * 0= Deinitialize GPU + * 1= Initialize GPU + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) + * @return integer error (>0) or warning (<0) code + * @see ... + */ +EXP int CALL srwlUtiGPUProc(int op, void* pvGPU=0); + /** * Checks if GPU offloading is available * @return true if available * @see ... */ -EXP bool CALL srwlUtiGPUAvailable(); //OC26072023 -//EXP bool CALL srwlCAuxGPUAvailable(); //HG +//EXP bool CALL srwlUtiGPUAvailable(); //OC26072023 +//EXP bool CALL srwlCAuxGPUAvailable(); //HG /** * Checks if GPU offloading is enabled * @return true if enabled * @see ... */ -EXP bool CALL srwlUtiGPUEnabled(); //OC26072023 -//EXP bool CALL srwlCAuxGPUEnabled(); //HG +//EXP bool CALL srwlUtiGPUEnabled(); //OC26072023 +//EXP bool CALL srwlCAuxGPUEnabled(); //HG /** * Enable/Disable GPU offloading * @see ... */ -EXP void CALL srwlUtiGPUSetStatus(bool enable); -//EXP void CALL srwlCAuxGPUSetStatus(bool enable); //HG +//EXP void CALL srwlUtiGPUSetStatus(bool enable); +//EXP void CALL srwlCAuxGPUSetStatus(bool enable); //HG /** * Initialize device offloading * @see ... */ -EXP void CALL srwlUtiGPUInit(); //OC26072023 -//EXP void CALL srwlCAuxGPUInit(); //HG +//EXP void CALL srwlUtiGPUInit(); //OC26072023 +//EXP void CALL srwlCAuxGPUInit(); //HG /** * Finalize device offloading * @see ... */ -EXP void CALL srwlUtiGPUFini(); //OC26072023 -//EXP void CALL srwlCAuxGPUFini(); //HG +//EXP void CALL srwlUtiGPUFini(); //OC26072023 +//EXP void CALL srwlCAuxGPUFini(); //HG +#endif /** * These functions were added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP diff --git a/cpp/vc/SRW.sln b/cpp/vc/SRW.sln index d62533af..57eb7848 100644 --- a/cpp/vc/SRW.sln +++ b/cpp/vc/SRW.sln @@ -1,14 +1,14 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 -VisualStudioVersion = 17.0.31912.275 +VisualStudioVersion = 17.4.33110.190 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLIB", "SRWLIB.vcxproj", "{A7E707A6-D325-42AE-A0D0-3C97C38D36A6}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientPython", "SRWLClientPython.vcxproj", "{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}" ProjectSection(ProjectDependencies) = postProject {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} = {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} EndProjectSection EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLIB", "SRWLIB.vcxproj", "{A7E707A6-D325-42AE-A0D0-3C97C38D36A6}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientIgor", "SRWLClientIgor.vcxproj", "{0D473386-2B3E-4586-8516-DD4DCF6D4E1E}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientC", "SRWLClientC.vcxproj", "{AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}" @@ -23,32 +23,32 @@ Global Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.ActiveCfg = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.Build.0 = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.ActiveCfg = Debug_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.Build.0 = Debug_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.ActiveCfg = Debug_Py3_9|x64 + 
{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.Build.0 = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.ActiveCfg = Release_Py3_9|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.Build.0 = Release_Py3_9|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.ActiveCfg = Release_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.Build.0 = Release_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.ActiveCfg = Release_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.Build.0 = Release_Py3_9|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.ActiveCfg = Debug_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.Build.0 = Debug_cuda|x64 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Win32.ActiveCfg = Debug|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Win32.Build.0 = Debug|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.ActiveCfg = Debug|x64 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.Build.0 = Debug|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.ActiveCfg = Debug_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.Build.0 = Debug_cuda|x64 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Mixed Platforms.ActiveCfg = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Mixed Platforms.Build.0 = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Win32.ActiveCfg = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Win32.Build.0 = Release|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.ActiveCfg = Release|x64 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.Build.0 = Release|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.ActiveCfg = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.Build.0 = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.ActiveCfg = Debug_Py3_3|Win32 - 
{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.Build.0 = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.ActiveCfg = Debug_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.Build.0 = Debug_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.ActiveCfg = Release_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.Build.0 = Release_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.ActiveCfg = Release_Py2x|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.Build.0 = Release_Py2x|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.ActiveCfg = Release_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.Build.0 = Release_Py3_11|x64 - {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.ActiveCfg = Release_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.Build.0 = Release_cuda|x64 + {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 + {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.Build.0 = Debug|x64 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Win32.ActiveCfg = Debug|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Win32.Build.0 = Debug|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|x64.ActiveCfg = Debug|x64 @@ -59,8 +59,8 @@ Global {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|Win32.Build.0 = Release|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|x64.ActiveCfg = Release|x64 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|x64.Build.0 = Release|x64 - {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 + 
{AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.Build.0 = Debug|x64 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Win32.ActiveCfg = Debug|Win32 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Win32.Build.0 = Debug|Win32 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|x64.ActiveCfg = Debug|x64 diff --git a/cpp/vc/SRWLClientPython.vcxproj b/cpp/vc/SRWLClientPython.vcxproj index 88a2cc49..882e3575 100644 --- a/cpp/vc/SRWLClientPython.vcxproj +++ b/cpp/vc/SRWLClientPython.vcxproj @@ -1427,7 +1427,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\clients\python\srwpy\" diff --git a/cpp/vc/SRWLIB.vcxproj b/cpp/vc/SRWLIB.vcxproj index d0a4e611..e2215a8e 100644 --- a/cpp/vc/SRWLIB.vcxproj +++ b/cpp/vc/SRWLIB.vcxproj @@ -33,6 +33,14 @@ Debug_fftw2 x64 + + Release_cuda + Win32 + + + Release_cuda + x64 + Release_omph Win32 @@ -70,7 +78,6 @@ {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} SRWLIB 10.0 - $(CUDA_PATH) @@ -85,6 +92,12 @@ false MultiByte + + StaticLibrary + v143 + false + MultiByte + StaticLibrary v143 @@ -134,6 +147,13 @@ MultiByte false + + StaticLibrary + v143 + false + MultiByte + false + StaticLibrary v143 @@ -175,7 +195,7 @@ - + @@ -185,6 +205,10 @@ + + + + @@ -217,6 +241,10 @@ + + + + @@ -295,6 +323,11 @@ $(Platform)\$(Configuration)\ srw_win32 + + $(SolutionDir) + $(Platform)\$(Configuration)\ + srw_win32 + $(SolutionDir) $(Platform)\$(Configuration)\ @@ -317,6 +350,12 @@ true srw_x64 + + $(SolutionDir) + $(Platform)\$(Configuration)\ + true + srw_x64 + $(SolutionDir) $(Platform)\$(Configuration)\ @@ -579,7 +618,7 @@ Disabled - ..\src\core;..\src\lib;..\src\ext\genmath;..\src\ext\auxparse;%(AdditionalIncludeDirectories) + ..\src\core;..\src\lib;..\src\ext\genmath;..\src\ext\auxparse;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) 
_DEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;_FFTW3;_OFFLOAD_GPU;%(PreprocessorDefinitions) EnableFastChecks MultiThreadedDebug @@ -602,13 +641,19 @@ 0x0809 - ..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;%(AdditionalDependencies) + ..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;cudart_static.lib;cufft.lib;cudadevrt.lib;%(AdditionalDependencies) srw_x64.lib + $(CUDA_PATH)\lib\x64 + + 64 + compute_60,sm_60 + _OFFLOAD_GPU;_USE_CUDA; + @@ -729,6 +774,46 @@ copy $(TargetPath) "$(SolutionDir)..\..\env\work\srw_python\lib\" + + + NDEBUG;%(PreprocessorDefinitions) + true + true + Win32 + .\Release/SRWLIB.tlb + + + OnlyExplicitInline + Neither + ..\src\lib;..\src\core;..\src\ext\genmath;..\src\ext\auxparse;%(AdditionalIncludeDirectories) + NDEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + MultiThreaded + true + true + + + + + $(IntDir) + $(IntDir)vc90.pdb + Level2 + true + Default + Default + + + NDEBUG;%(PreprocessorDefinitions) + 0x0809 + + + ..\..\ext_lib\fftw_f.lib;%(AdditionalDependencies) + srw_win32.lib + + + copy $(TargetPath) "$(SolutionDir)..\..\env\work\srw_python\lib\" + + NDEBUG;%(PreprocessorDefinitions) @@ -901,6 +986,60 @@ + + + NDEBUG;%(PreprocessorDefinitions) + true + true + X64 + .\Release/SRWLIB.tlb + + + OnlyExplicitInline + Speed + ..\src\lib;..\src\core;..\src\ext\genmath;..\src\ext\auxparse;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + NDEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;_FFTW3;_OFFLOAD_GPU;%(PreprocessorDefinitions) + true + MultiThreaded + false + true + + + + + $(IntDir) + $(IntDir)vc90.pdb + Level3 + true + + + Default + true + MaxSpeed + true + Precise + + + NDEBUG;%(PreprocessorDefinitions) + 0x0809 + + + 
..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;cudart_static.lib;cufft.lib;cudadevrt.lib;%(AdditionalDependencies) + srw_x64.lib + + + $(CUDA_PATH)\lib\x64 + + + + + + + 64 + compute_60,sm_60 + _OFFLOAD_GPU;_USE_CUDA + + NDEBUG;%(PreprocessorDefinitions) @@ -1045,6 +1184,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1097,36 +1261,13 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + + + @@ -1196,13 +1337,21 @@ - + + + + + + + + + - + \ No newline at end of file diff --git a/cpp/vc/SRWLIB.vcxproj.filters b/cpp/vc/SRWLIB.vcxproj.filters index c77ca1a2..6c69e3cc 100644 --- a/cpp/vc/SRWLIB.vcxproj.filters +++ b/cpp/vc/SRWLIB.vcxproj.filters @@ -393,8 +393,8 @@ f2c - - core + + lib @@ -614,8 +614,34 @@ lib - + + lib + + core + + core + + + + + core + + + core + + + core + + + core + + + core + + + core + \ No newline at end of file From 06c2cea71d7a7a41b7cf240c33d5e1cf7362b544 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Mon, 4 Dec 2023 04:42:16 -0500 Subject: [PATCH 3/9] Fix virtual function parameters. 
--- cpp/src/core/sroptang.h | 5 +++-- cpp/src/core/sroptapt.h | 4 ++-- cpp/src/core/sroptcryst.h | 2 +- cpp/src/core/sroptdrf.h | 4 ++-- cpp/src/core/sroptfoc.h | 12 ++++++++---- cpp/src/core/sroptgrat.h | 3 ++- cpp/src/core/sroptgtr.h | 4 ++-- cpp/src/core/sropthck.h | 6 ++++-- cpp/src/core/sroptpsh.h | 12 ++++++++---- cpp/src/core/sroptsmr.h | 6 ++++-- cpp/src/core/sroptwgr.h | 6 ++++-- cpp/src/core/sroptzp.h | 15 ++++++++++----- cpp/src/core/sroptzps.h | 12 ++++++++---- 13 files changed, 58 insertions(+), 33 deletions(-) diff --git a/cpp/src/core/sroptang.h b/cpp/src/core/sroptang.h index d0370a26..2294731c 100644 --- a/cpp/src/core/sroptang.h +++ b/cpp/src/core/sroptang.h @@ -31,7 +31,7 @@ class srTOptAngle : public srTGenOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU) //virtual //HG30112023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual //HG30112023 { //return PropagateRadiationMeth_0(pRadAccessData); int res = 0; @@ -135,7 +135,8 @@ class srTOptShift : public srTGenOptElem { ShiftY = InShiftY; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual //HG04122023 { //return PropagateRadiationMeth_0(pRadAccessData); int res = 0; 
diff --git a/cpp/src/core/sroptapt.h b/cpp/src/core/sroptapt.h index e5f22dac..98f6598c 100644 --- a/cpp/src/core/sroptapt.h +++ b/cpp/src/core/sroptapt.h @@ -34,7 +34,7 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU) //HG30112023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG30112023 { char &MethNo = ParPrecWfrPropag.MethNo; @@ -82,7 +82,7 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG30112023 { int result; //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; diff --git a/cpp/src/core/sroptcryst.h b/cpp/src/core/sroptcryst.h index af6f535f..ee0ef80b 100644 --- a/cpp/src/core/sroptcryst.h +++ b/cpp/src/core/sroptcryst.h @@ -944,7 +944,7 @@ class srTOptCryst : public srTGenOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& 
ResBeforeAndAfterVect, void* pvGPU) //virtual in srTGenOptElem //HG01122023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual in srTGenOptElem //HG01122023 { m_eStartAux = pRadAccessData->eStart; m_eStepAux = pRadAccessData->eStep; m_ne = pRadAccessData->ne; //required for RadPointModifier diff --git a/cpp/src/core/sroptdrf.h b/cpp/src/core/sroptdrf.h index a6a16d20..c3a7509a 100644 --- a/cpp/src/core/sroptdrf.h +++ b/cpp/src/core/sroptdrf.h @@ -179,7 +179,7 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG01122023 + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU=0) //HG01122023 {//it works for many photon energies too! 
int result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 @@ -312,7 +312,7 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG01122023 { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //char LocalPropMode = pBufVars->LocalPropMode; //OC06092019 diff --git a/cpp/src/core/sroptfoc.h b/cpp/src/core/sroptfoc.h index d2a05579..f950a775 100644 --- a/cpp/src/core/sroptfoc.h +++ b/cpp/src/core/sroptfoc.h @@ -153,7 +153,8 @@ class srTThinLens : public srTFocusingElem { srTThinLens() {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -197,11 +198,14 @@ class srTThinLens : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int 
PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 return 0; } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) diff --git a/cpp/src/core/sroptgrat.h b/cpp/src/core/sroptgrat.h index 487f805f..2c761663 100644 --- a/cpp/src/core/sroptgrat.h +++ b/cpp/src/core/sroptgrat.h @@ -96,7 +96,8 @@ class srTGrating : public srTShapedOptElem { m_PropWfrInPlace = true; //OC151008 //previous electric field is NOT necessary for the propagation } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //char &MethNo = ParPrecWfrPropag.MethNo; SetupPropBufVars_Gen(pRadAccessData); diff --git a/cpp/src/core/sroptgtr.h b/cpp/src/core/sroptgtr.h index 202190c3..34052e34 100644 --- a/cpp/src/core/sroptgtr.h +++ b/cpp/src/core/sroptgtr.h @@ -82,7 +82,7 @@ class srTGenTransmission : public srTFocusingElem { //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterArr) //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, 
srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr, void* pvGPU) //HG01122023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr, void* pvGPU=0) //HG01122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -125,7 +125,7 @@ class srTGenTransmission : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG01122023 { int result; //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; diff --git a/cpp/src/core/sropthck.h b/cpp/src/core/sropthck.h index 45f09323..cc7fe350 100644 --- a/cpp/src/core/sropthck.h +++ b/cpp/src/core/sropthck.h @@ -167,7 +167,8 @@ class srTMirror : public srTFocusingElem { //return true; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual in srTGenOptElem //HG04122023 { m_ParPrecWfrPropag = ParPrecWfrPropag; //store for use in a composite prapagator (through drif space, etc.) 
@@ -206,7 +207,8 @@ class srTMirror : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { if(m_propMeth == 1) return PropagateRadiationSimple_ThinElem(pRadAccessData); else if(m_propMeth == 2) return PropagateRadiationSimple_LocRayTracing(pRadAccessData); diff --git a/cpp/src/core/sroptpsh.h b/cpp/src/core/sroptpsh.h index ab0ac787..181df8df 100644 --- a/cpp/src/core/sroptpsh.h +++ b/cpp/src/core/sroptpsh.h @@ -75,7 +75,8 @@ class srTPhaseShift : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { char &MethNo = ParPrecWfrPropag.MethNo; @@ -86,7 +87,8 @@ class srTPhaseShift : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; srTWaveAccessData PhShWaveAccessData; @@ -94,8 +96,10 @@ class srTPhaseShift 
: public srTFocusingElem { //tPhaseShiftData = (DOUBLE*)(PhShWaveAccessData.pWaveData); tPhaseShiftData = (double*)(PhShWaveAccessData.pWaveData); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 //srTSend Send; //if(result = Send.FinishWorkingWithWave(&PhShWaveAccessData)) return result; diff --git a/cpp/src/core/sroptsmr.h b/cpp/src/core/sroptsmr.h index 9d36eb82..f0d1e6b1 100644 --- a/cpp/src/core/sroptsmr.h +++ b/cpp/src/core/sroptsmr.h @@ -67,10 +67,12 @@ class srTSpherMirror : public srTFocusingElem { void SetupSpherMirrorApprox(); //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, MethNo, ResBeforeAndAfterVect); - if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); + //if(UseSpherMirrorApprox) 
return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); + if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect, pvGPU); //HG04122023 else { char &MethNo = ParPrecWfrPropag.MethNo; diff --git a/cpp/src/core/sroptwgr.h b/cpp/src/core/sroptwgr.h index c9be6164..580b97d3 100644 --- a/cpp/src/core/sroptwgr.h +++ b/cpp/src/core/sroptwgr.h @@ -134,7 +134,8 @@ class srTWaveguideRect : public srTShapedOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResizeBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //Checks current sampling "resolution" in hor. and vert. directions //Makes necessary sampling for propag. through the waveguide (fit the waveguide with approx. 
the same resolution, include all harmonics until the cut-off) @@ -151,7 +152,8 @@ class srTWaveguideRect : public srTShapedOptElem { if(result = PropagateRadiationSimple_AngRepres(&AuxWfrData)) return result; srTRectAperture RectAp(Dx, Dz, TransvCenPoint.x, TransvCenPoint.y); - if(result = RectAp.TraverseRadZXE(&AuxWfrData)) return result; + //if(result = RectAp.TraverseRadZXE(&AuxWfrData)) return result; + if(result = RectAp.TraverseRadZXE(&AuxWfrData, 0, 0, pvGPU)) return result; //HG04122023 if(result = CopyElecFieldDataForOut(AuxWfrData, *pRadAccessData)) return result; AuxWfrData.DeleteElecFieldArrays(); //deletes Ex, Ez only diff --git a/cpp/src/core/sroptzp.h b/cpp/src/core/sroptzp.h index 813974de..68ae1ee7 100644 --- a/cpp/src/core/sroptzp.h +++ b/cpp/src/core/sroptzp.h @@ -100,7 +100,8 @@ class srTZonePlate : public srTFocusingElem { srTZonePlate() {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -111,7 +112,8 @@ class srTZonePlate : public srTFocusingElem { int result = 0; - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG04122023 //else return PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); else result = PropagateRadiationMeth_2(pRadAccessData, 
ParPrecWfrPropag, ResBeforeAndAfterVect); @@ -125,11 +127,14 @@ class srTZonePlate : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 return 0; } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) diff --git a/cpp/src/core/sroptzps.h b/cpp/src/core/sroptzps.h index 792d02c6..e5409814 100644 --- a/cpp/src/core/sroptzps.h +++ b/cpp/src/core/sroptzps.h @@ -80,7 +80,8 @@ class srTZonePlateSpec : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { char &MethNo = ParPrecWfrPropag.MethNo; //if(MethNo == 2) return 
PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); @@ -90,11 +91,14 @@ class srTZonePlateSpec : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - return TraverseRadZXE(pRadAccessData); + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //return TraverseRadZXE(pRadAccessData); + return TraverseRadZXE(pRadAccessData, 0, 0, pvGPU); //HG04122023 } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) { From 85b7c5cc6c4564dcec99509ef8d246f5a5c9723a Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 12:07:04 -0500 Subject: [PATCH 4/9] Add GPU related code for srwlpy.cpp --- cpp/src/clients/python/srwlpy.cpp | 75 ++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/cpp/src/clients/python/srwlpy.cpp b/cpp/src/clients/python/srwlpy.cpp index f320cfc5..217fbe95 100644 --- a/cpp/src/clients/python/srwlpy.cpp +++ b/cpp/src/clients/python/srwlpy.cpp @@ -26,6 +26,10 @@ #include #include //OCTEST_161214 +#ifdef _OFFLOAD_GPU //HG30112023 +#include "auxgpu.h" +#endif + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //#include @@ -3319,6 +3323,22 @@ void ParseSructSmpObj3D(double**& arObjShapeDefs, int& nObj3D, PyObject* oListSh } } +#ifdef _OFFLOAD_GPU //HG30112023 +/************************************************************************//** + * Convert Python device specification to C++ structure. + ***************************************************************************/ +void ParseDeviceParam(PyObject* oDev, gpuUsageArg *pGpuUsage) //HG10202021 Convert Python device specification to C++ structure +{ + if (oDev != 0) { + if (PyLong_Check(oDev)) { + pGpuUsage->deviceIndex = _PyLong_AsInt(oDev); + return; + } + } + pGpuUsage->deviceIndex = 0; +} +#endif + /************************************************************************//** * Updates Py List by numbers ***************************************************************************/ @@ -4617,18 +4637,24 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) { //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0; //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0; - PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0; //OC23022020 + //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0; //OC23022020 + PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0, *oDev=0; //HG03012024 vector vBuf; SRWLWfr wfr; SRWLMagFldC *pMagCnt=0; //OC23022020 SRWLPrtTrj *pPrtTrj=0; +#ifdef _OFFLOAD_GPU //HG30112023 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { //if(!PyArg_ParseTuple(args, "OOOOOOOO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY)) throw strEr_BadArg_CalcIntFromElecField; //if(!PyArg_ParseTuple(args, "OOOOOOOO|O:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, 
&oDepType, &oE, &oX, &oY, &oMeth)) throw strEr_BadArg_CalcIntFromElecField; //OC13122019 //if(!PyArg_ParseTuple(args, "OOOOOOOO|O:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC23022020 - if(!PyArg_ParseTuple(args, "OOOOOOOO|OO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC03032021 (just formally corrected, according to number of arguments) + //if(!PyArg_ParseTuple(args, "OOOOOOOO|OO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC03032021 (just formally corrected, according to number of arguments) + if(!PyArg_ParseTuple(args, "OOOOOOOO|OOO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj, &oDev)) throw strEr_BadArg_CalcIntFromElecField; //HG03012024 if((oInt == 0) || (oWfr == 0) || (oPol == 0) || (oIntType == 0) || (oDepType == 0) || (oE == 0) || (oX == 0) || (oY == 0)) throw strEr_BadArg_CalcIntFromElecField; //char *arInt = (char*)GetPyArrayBuf(oInt, vBuf, PyBUF_WRITABLE, 0); @@ -4691,7 +4717,13 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) //ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y)); //ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth)); //OC13122019 + +#ifdef _OFFLOAD_GPU //HG30112023 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth, pFldTrj, (void*)&gpu)); +#else ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth, pFldTrj)); //OC23022020 +#endif } catch(const char* erText) { @@ -4700,6 +4732,9 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) oInt = 0; } +#ifdef _OFFLOAD_GPU //HG30112023 + 
srwlUtiGPUProc(0); //to free GPU +#endif if(pMagCnt != 0) DeallocMagCntArrays(pMagCnt); ReleasePyBuffers(vBuf); EraseElementFromMap(&wfr, gmWfrPyPtr); @@ -4932,7 +4967,8 @@ static PyObject* srwlpy_SetRepresElecField(PyObject *self, PyObject *args) static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) { //PyObject *oWfr=0, *oOptCnt=0; - PyObject *oWfr=0, *oOptCnt=0, *oInt=0; //OC14082018 + //PyObject *oWfr=0, *oOptCnt=0, *oInt=0; //OC14082018 + PyObject *oWfr=0, *oOptCnt=0, *oInt=0, *oDev=0; //Hg03012024 vector vBuf; SRWLWfr wfr; @@ -4945,10 +4981,15 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) //float **arInts=0; char **arInts=0; +#ifdef _OFFLOAD_GPU //HG03012024 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { //if(!PyArg_ParseTuple(args, "OO:PropagElecField", &oWfr, &oOptCnt)) throw strEr_BadArg_PropagElecField; - if(!PyArg_ParseTuple(args, "OO|O:PropagElecField", &oWfr, &oOptCnt, &oInt)) throw strEr_BadArg_PropagElecField; //OC14082018 + //if(!PyArg_ParseTuple(args, "OO|O:PropagElecField", &oWfr, &oOptCnt, &oInt)) throw strEr_BadArg_PropagElecField; //OC14082018 + if(!PyArg_ParseTuple(args, "OO|OO:PropagElecField", &oWfr, &oOptCnt, &oInt, &oDev)) throw strEr_BadArg_PropagElecField; //HG03012024 if((oWfr == 0) || (oOptCnt == 0)) throw strEr_BadArg_PropagElecField; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -4981,7 +5022,12 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) } //ProcRes(srwlPropagElecField(&wfr, &optCnt)); +#ifdef _OFFLOAD_GPU //HG03012024 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlPropagElecField(&wfr, &optCnt, nInt, arIntDescr, arIntMesh, arInts, (void*)&gpu)); +#else ProcRes(srwlPropagElecField(&wfr, &optCnt, nInt, arIntDescr, arIntMesh, arInts)); //OC15082018 +#endif //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":srwlpy_PropagElecField :srwlPropagElecField", &start); @@ -5002,6 +5048,9 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) //PyErr_PrintEx(1); oWfr = 0; } +#ifdef _OFFLOAD_GPU //HG03012024 + srwlUtiGPUProc(0); //to free GPU +#endif DeallocOptCntArrays(&optCnt); ReleasePyBuffers(vBuf); @@ -5102,12 +5151,18 @@ static PyObject* srwlpy_CalcTransm(PyObject* self, PyObject* args) //HG27012021 ***************************************************************************/ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) { - PyObject *oData=0, *oMesh=0, *oDir=0; + //PyObject *oData=0, *oMesh=0, *oDir=0; + PyObject *oData=0, *oMesh=0, *oDir=0, *oDev=0; //HG03012024 vector vBuf; +#ifdef _OFFLOAD_GPU //HG03012024 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { - if(!PyArg_ParseTuple(args, "OOO:UtiFFT", &oData, &oMesh, &oDir)) throw strEr_BadArg_UtiFFT; + //if(!PyArg_ParseTuple(args, "OOO:UtiFFT", &oData, &oMesh, &oDir)) throw strEr_BadArg_UtiFFT; + if(!PyArg_ParseTuple(args, "OOO|O:UtiFFT", &oData, &oMesh, &oDir, &oDev)) throw strEr_BadArg_UtiFFT; //HG03012024 if((oData == 0) || (oMesh == 0) || (oDir == 0)) throw strEr_BadArg_UtiFFT; //int sizeVectBuf = (int)vBuf.size(); @@ -5143,7 +5198,12 @@ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) if(!PyNumber_Check(oDir)) throw strEr_BadArg_UtiFFT; int dir = (int)PyLong_AsLong(oDir); +#ifdef _OFFLOAD_GPU //HG03012024 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlUtiFFT(pcData, typeData, arMesh, nMesh, dir, (void*)&gpu)); +#else ProcRes(srwlUtiFFT(pcData, typeData, arMesh, nMesh, dir)); +#endif if(meshArType == 'l') UpdatePyListNum(oMesh, arMesh, nMesh); //04092016 } @@ -5153,6 +5213,9 @@ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) //if(vBuf.size() > 0) ReleasePyBuffers(vBuf); oData = 0; oMesh = 0; oDir = 0; } +#ifdef _OFFLOAD_GPU //HG03012024 + srwlUtiGPUProc(0); //to 
free GPU +#endif ReleasePyBuffers(vBuf); From 4a3a0be9e0cd93a80e5389c85d7e961c26db6a82 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 12:17:05 -0500 Subject: [PATCH 5/9] Fix gmfft formatting --- cpp/src/ext/genmath/gmfft.cpp | 368 +++++++++++++++++----------------- 1 file changed, 184 insertions(+), 184 deletions(-) diff --git a/cpp/src/ext/genmath/gmfft.cpp b/cpp/src/ext/genmath/gmfft.cpp index 3c242adc..43639845 100644 --- a/cpp/src/ext/genmath/gmfft.cpp +++ b/cpp/src/ext/genmath/gmfft.cpp @@ -228,15 +228,15 @@ void CGenMathFFT::NextCorrectNumberForFFT(long& n) int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, { //HG03082022 GPU can do an inplace fft without being given a temporary buffer FFT1DInfo.pOutData = FFT1DInfo.pInData; int result; - if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 - //if (result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; + if(result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 + //if(result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; }//) else #endif @@ -329,34 +329,34 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //ArrayShiftX = 0; ArrayShiftY = 0; m_ArrayShiftX = 0; m_ArrayShiftY = 0; //OC02022019 m_dArrayShiftX = 0; m_dArrayShiftY = 0; - if (FFT2DInfo.pData != 0) + if(FFT2DInfo.pData != 0) { - if (NeedsShiftBeforeX || NeedsShiftAfterX) + if(NeedsShiftBeforeX || NeedsShiftAfterX) { //ArrayShiftX = new float[Nx << 1]; //if(ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; 
} - if (NeedsShiftBeforeY || NeedsShiftAfterY) + if(NeedsShiftBeforeY || NeedsShiftAfterY) { //ArrayShiftY = new float[Ny << 1]; //if(ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; m_ArrayShiftY = new float[Ny << 1]; - if (m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; } } - else if (FFT2DInfo.pdData != 0) + else if(FFT2DInfo.pdData != 0) { - if (NeedsShiftBeforeX || NeedsShiftAfterX) + if(NeedsShiftBeforeX || NeedsShiftAfterX) { m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; } - if (NeedsShiftBeforeY || NeedsShiftAfterY) + if(NeedsShiftBeforeY || NeedsShiftAfterY) { m_dArrayShiftY = new double[Ny << 1]; - if (m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; } } @@ -374,7 +374,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //#endif #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG02112021 { @@ -393,8 +393,8 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea #endif { #if _FFTW3 //OC28012019 - if (FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); - else if (FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 + if(FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); + else if(FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 #else fftwnd_plan Plan2DFFT; @@ -422,21 +422,21 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) 
CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + if(DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 + else if(dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); #endif - if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 + if(NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 { //GPU_COND(pvGPU, { //OC06092023 //GPU_COND(pGpuUsage, { TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { + if(DataToFFT != 0) { m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); @@ -451,7 +451,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); } - else if (dDataToFFT != 0) { + else if(dDataToFFT != 0) { m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); @@ -470,10 +470,10 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& 
FFT2DInfo, fftwnd_plan* pPrecrea else #endif { - if (DataToFFT != 0) TreatShifts(DataToFFT); + if(DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif } } @@ -481,22 +481,22 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea bool alreadyNormalized = false; //HG17032022 //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 - if (FFT2DInfo.Dir > 0) + if(FFT2DInfo.Dir > 0) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG02112021 { - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) + if(pPrecreatedPlan2DFFT == 0) { - if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) { - if (Plan2DFFT_cu != NULL) + if(Plan2DFFT_cu != NULL) { cufftDestroy(Plan2DFFT_cu); Plan2DFFT_cu = NULL; @@ -511,21 +511,21 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan2DFFT_cu == 0) return ERROR_IN_FFT; auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); // if (res != CUFFT_SUCCESS) // printf("CUFFT Error: %d\r\n", res); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { 
- if (pdPrecreatedPlan2DFFT == 0) + if(pdPrecreatedPlan2DFFT == 0) { - if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) { - if (dPlan2DFFT_cu != NULL) + if(dPlan2DFFT_cu != NULL) { cufftDestroy(dPlan2DFFT_cu); dPlan2DFFT_cu = NULL; @@ -540,7 +540,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan2DFFT_cu == 0) return ERROR_IN_FFT; cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); } @@ -556,38 +556,38 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) { long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; fftwf_execute(Plan2DFFT); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); else dPlan2DFFT = 
*pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; + if(dPlan2DFFT == 0) return ERROR_IN_FFT; fftw_execute(dPlan2DFFT); } } #else - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { - if (DataToFFT != 0) + if(DataToFFT != 0) { //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -595,7 +595,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, (float)Mult); //OC06092023 RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, (float)Mult); //OC06092023 //HG04122023 } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -607,14 +607,14 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea else #endif { - if (DataToFFT != 0) + if(DataToFFT != 0) { RepairSignAfter2DFFT(DataToFFT); RotateDataAfter2DFFT(DataToFFT); } #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { RepairSignAfter2DFFT(dDataToFFT); RotateDataAfter2DFFT(dDataToFFT); @@ -625,18 +625,18 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, 
fftwnd_plan* pPrecrea else { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) { - if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + if(pPrecreatedPlan2DFFT == 0) { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) { - if (Plan2DFFT_cu != NULL){ + if(Plan2DFFT_cu != NULL){ cufftDestroy(Plan2DFFT_cu); Plan2DFFT_cu = NULL; } @@ -651,7 +651,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan2DFFT_cu == 0) return ERROR_IN_FFT; //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -659,14 +659,14 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { - if (pdPrecreatedPlan2DFFT == 0) { - if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + if(pdPrecreatedPlan2DFFT == 0) { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 //if 
(dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) { - if (dPlan2DFFT_cu != NULL){ + if(dPlan2DFFT_cu != NULL){ cufftDestroy(dPlan2DFFT_cu); dPlan2DFFT_cu = NULL; } @@ -680,7 +680,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan2DFFT_cu == 0) return ERROR_IN_FFT; //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -699,29 +699,29 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) { long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(DataToFFT); RepairSignAfter2DFFT(DataToFFT); fftwf_execute(Plan2DFFT); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; + if(dPlan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(dDataToFFT); RepairSignAfter2DFFT(dDataToFFT); fftw_execute(dPlan2DFFT); } } #else - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, 
Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(DataToFFT); RepairSignAfter2DFFT(DataToFFT); fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); @@ -729,9 +729,9 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } - if (!alreadyNormalized){ + if(!alreadyNormalized){ #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { @@ -739,18 +739,18 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea // NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); //else if (dDataToFFT != 0) // NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - if (DataToFFT != 0) //HG04122023 + if(DataToFFT != 0) //HG04122023 NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, Mult); - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); }//) else #endif { - if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); + if(DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); + else if(dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); #endif } } @@ -758,25 +758,25 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); - if (NeedsShiftAfterX) + if(NeedsShiftAfterX) 
{//OC02022019 - if (m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); } - if (NeedsShiftAfterY) + if(NeedsShiftAfterY) {//OC02022019 - if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); - else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); + if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); } - if (NeedsShiftAfterX || NeedsShiftAfterY) + if(NeedsShiftAfterX || NeedsShiftAfterY) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { + if(DataToFFT != 0) { m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); @@ -792,7 +792,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); } - else if (dDataToFFT != 0) { + else 
if(dDataToFFT != 0) { m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); @@ -812,10 +812,10 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea else #endif { - if (DataToFFT != 0) TreatShifts(DataToFFT); + if(DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif } } @@ -825,16 +825,16 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //OC27102018 //SY: adopted for OpenMP #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG02112021 { - if (FFT2DInfo.pData != 0) + if(FFT2DInfo.pData != 0) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, DataToFFT, true, false); } - else if (FFT2DInfo.pdData != 0) + else if(FFT2DInfo.pdData != 0) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, dDataToFFT, true, false); @@ -844,25 +844,25 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea #endif { #if _FFTW3 //OC28012019 - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); + if(pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); } - else if (dDataToFFT != 0) //OC03022019 + else if(dDataToFFT != 0) //OC03022019 { - if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); + if(pdPrecreatedPlan2DFFT == 0) 
fftw_destroy_plan(dPlan2DFFT); } #else - if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); + if(pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); #endif } //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} - if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} - if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} - if (m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 - if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} + if(m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} + if(m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} + if(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 + if(m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} return 0; } @@ -899,22 +899,22 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 m_ArrayShiftX = 0; m_dArrayShiftX = 0; - if (NeedsShiftBeforeX || NeedsShiftAfterX) + if(NeedsShiftBeforeX || NeedsShiftAfterX) { - if (FFT1DInfo.pInData != 0) + if(FFT1DInfo.pInData != 0) { m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; #ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) 
m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 #endif } - else if (FFT1DInfo.pdInData != 0) + else if(FFT1DInfo.pdInData != 0) { m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; #ifdef _OFFLOAD_GPU //OC05092023 m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); @@ -937,18 +937,18 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 // printf ("GPU: Make1DFFT\n"); //#endif #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG20012022 { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) { DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); //DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) { dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * 
sizeof(double)); //OC06092023 dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); @@ -960,13 +960,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif { #ifdef _FFTW3 //OC28012019 - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) { DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) { dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); @@ -994,37 +994,37 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + if(DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); + else if(dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); #endif char t0SignMult = (FFT1DInfo.Dir > 0) ? 
-1 : 1; - if (NeedsShiftBeforeX) + if(NeedsShiftBeforeX) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); - if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + if(DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if(dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); }//) else #endif { //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); - if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); + if(DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); + else if(dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); #endif } } @@ -1037,32 +1037,32 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& 
FFT1DInfo, void* pvGPU) //OC0509 //double Mult = FFT1DInfo.xStep; double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; - if (FFT1DInfo.Dir > 0) //HG17112021 + if(FFT1DInfo.Dir > 0) //HG17112021 { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, { int arN[] = { (int)Nx }; //OC14052020 - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (PlanLen != Nx) { + if(PlanLen != Nx) { PlanLen = Nx; - if (Plan1DFFT_cu != NULL) + if(Plan1DFFT_cu != NULL) { cufftDestroy(Plan1DFFT_cu); Plan1DFFT_cu = NULL; } cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan1DFFT_cu == 0) return ERROR_IN_FFT; cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { - if (dPlanLen != Nx) { - if (dPlan1DFFT_cu != NULL) + if(dPlanLen != Nx) { + if(dPlan1DFFT_cu != NULL) { cufftDestroy(dPlan1DFFT_cu); dPlan1DFFT_cu = NULL; @@ -1070,7 +1070,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 dPlanLen = Nx; cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan1DFFT_cu == 0) return ERROR_IN_FFT; cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); } }//) @@ -1081,13 +1081,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #ifdef _FFTW3 //OC28012019 #ifdef _WITH_OMP //Still needs to be tested! 
- if (DataToFFT != 0) + if(DataToFFT != 0) { fftwf_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available fftwf_plan_with_nthreads(nthreads); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { fftw_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available @@ -1096,28 +1096,28 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif //ifndef _WITH_OMP int arN[] = { (int)Nx }; //OC14052020 //int arN[] = {Nx}; - if (DataToFFT != 0) + if(DataToFFT != 0) { //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; fftwf_execute(Plan1DFFT); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; + if(dPlan1DFFT == 0) return ERROR_IN_FFT; fftw_execute(dPlan1DFFT); } #else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) + if(DataToFFT == OutDataFFT) { flags |= FFTW_IN_PLACE; pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) } Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); @@ -1131,7 +1131,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 for (int i = 0; i < FFT1DInfo.HowMany; i++) { //SY: do not use OutDataFFT as scratch space if in-place - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + if(DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); } #endif @@ -1141,18 +1141,18 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 //srwlPrintTime("::Make1DFFT : fft dir>0",&start); #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { - if (OutDataFFT != 0) + if(OutDataFFT != 0) { RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); } - else if (dOutDataFFT != 0) + else if(dOutDataFFT != 0) { RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); @@ -1163,13 +1163,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 else #endif { - if (OutDataFFT != 0) + if(OutDataFFT != 0) { RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); } #ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) + else if(dOutDataFFT != 0) { RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); 
@@ -1181,44 +1181,44 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 { //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { int arN[] = { (int)Nx }; //OC14052020 //int arN[] = {Nx}; - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (PlanLen != Nx) { + if(PlanLen != Nx) { PlanLen = Nx; HowMany = FFT1DInfo.HowMany; - if (Plan1DFFT_cu != NULL) + if(Plan1DFFT_cu != NULL) { cufftDestroy(Plan1DFFT_cu); Plan1DFFT_cu = NULL; } cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan1DFFT_cu == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { - if (dPlanLen != Nx) + if(dPlanLen != Nx) { dPlanLen = Nx; dHowMany = FFT1DInfo.HowMany; - if (dPlan1DFFT_cu != NULL) + if(dPlan1DFFT_cu != NULL) { cufftDestroy(dPlan1DFFT_cu); dPlan1DFFT_cu = NULL; } cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan1DFFT_cu == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); @@ -1232,13 +1232,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #ifdef _WITH_OMP //Still needs to be tested! 
- if (DataToFFT != 0) + if(DataToFFT != 0) { fftwf_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available fftwf_plan_with_nthreads(nthreads); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { fftw_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available @@ -1248,32 +1248,32 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif int arN[] = { (int)Nx }; //OC14052020 //int arN[] = {Nx}; - if (DataToFFT != 0) + if(DataToFFT != 0) { //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); fftwf_execute(Plan1DFFT); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; + if(dPlan1DFFT == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); fftw_execute(dPlan1DFFT); } #else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) + if(DataToFFT == OutDataFFT) { flags |= FFTW_IN_PLACE; pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) } Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); @@ -1289,10 +1289,10 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) -#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) +#pragma omp parallel for if(omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) for (int i = 0; i < FFT1DInfo.HowMany; i++) { - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + if(DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); } #endif @@ -1302,25 +1302,25 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 //srwlPrintTime("::Make1DFFT : fft dir<0",&start); } - if (!alreadyNormalized) + if(!alreadyNormalized) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, { - if (OutDataFFT != 0) { + if(OutDataFFT != 0) { NormalizeDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); } - else if (dOutDataFFT != 0) + else if(dOutDataFFT != 0) NormalizeDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); }//) else #endif { - if (OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); + if(OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); #ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); + else if(dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); #endif } } @@ -1328,29 +1328,29 @@ int 
CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : NormalizeDataAfter1DFFT",&start); - if (NeedsShiftAfterX) + if(NeedsShiftAfterX) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 + else if(m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); - if (OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + if(OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if(dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); }//) else #endif { //FillArrayShift(t0SignMult*x0_After, FFT1DInfo.xStepTr); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 + else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); - if (OutDataFFT != 0) TreatShift(OutDataFFT, FFT1DInfo.HowMany); + if(OutDataFFT != 0) TreatShift(OutDataFFT, 
FFT1DInfo.HowMany); #ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); + else if(dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); #endif } } @@ -1362,16 +1362,16 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, OutDataFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, OutDataFFT, true, false); } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dOutDataFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, dOutDataFFT, true, false); @@ -1403,7 +1403,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif } - if (m_ArrayShiftX != 0) + if(m_ArrayShiftX != 0) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 @@ -1411,7 +1411,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif delete[] m_ArrayShiftX; } - if (m_dArrayShiftX != 0) + if(m_dArrayShiftX != 0) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 From 5ecedf46cffb099de11917072a325006bd49a63b Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 12:23:54 
-0500 Subject: [PATCH 6/9] Restore deleted change tag. --- cpp/src/ext/genmath/gmfft.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/ext/genmath/gmfft.cpp b/cpp/src/ext/genmath/gmfft.cpp index 43639845..58b766a2 100644 --- a/cpp/src/ext/genmath/gmfft.cpp +++ b/cpp/src/ext/genmath/gmfft.cpp @@ -360,7 +360,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } -#ifdef _FFTW3 +#ifdef _FFTW3 //OC28012019 fftwf_plan Plan2DFFT; fftw_plan dPlan2DFFT; fftwf_complex* DataToFFT = 0; From 44cce0811cf90d7c371c0f9ff19c48467bad09f5 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 13:43:51 -0500 Subject: [PATCH 7/9] Update ParseDeviceParam definition --- cpp/src/clients/python/srwlpy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/clients/python/srwlpy.cpp b/cpp/src/clients/python/srwlpy.cpp index 217fbe95..d568aafc 100644 --- a/cpp/src/clients/python/srwlpy.cpp +++ b/cpp/src/clients/python/srwlpy.cpp @@ -3327,15 +3327,15 @@ void ParseSructSmpObj3D(double**& arObjShapeDefs, int& nObj3D, PyObject* oListSh /************************************************************************//** * Convert Python device specification to C++ structure. ***************************************************************************/ -void ParseDeviceParam(PyObject* oDev, gpuUsageArg *pGpuUsage) //HG10202021 Convert Python device specification to C++ structure +void ParseDeviceParam(PyObject* oDev, TGPUUsageArg* pGpu) //HG10202021 Convert Python device specification to C++ structure { if (oDev != 0) { if (PyLong_Check(oDev)) { - pGpuUsage->deviceIndex = _PyLong_AsInt(oDev); + pGpu->deviceIndex = _PyLong_AsInt(oDev); return; } } - pGpuUsage->deviceIndex = 0; + pGpu->deviceIndex = 0; } #endif From 863ce5718ca988c715bdc9e83045bf67ff8f9ade Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 13:56:04 -0500 Subject: [PATCH 8/9] Update SRWLClientPython copy paths. 
--- cpp/vc/SRWLClientPython.vcxproj | 58 ++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/cpp/vc/SRWLClientPython.vcxproj b/cpp/vc/SRWLClientPython.vcxproj index 882e3575..25826e85 100644 --- a/cpp/vc/SRWLClientPython.vcxproj +++ b/cpp/vc/SRWLClientPython.vcxproj @@ -648,7 +648,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -674,7 +674,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -701,7 +701,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -728,7 +728,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -755,7 +755,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -782,7 +782,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -809,7 +809,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + 
$(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -854,7 +854,7 @@ Default - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -874,7 +874,7 @@ ..\..\..\Python35_x64\libs\python35.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -902,7 +902,7 @@ ..\..\..\Python36_x64\libs\python36.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -930,7 +930,7 @@ ..\..\..\Python38_x64\libs\python38.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -958,7 +958,7 @@ ..\..\..\Python39_x64\libs\python39.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1014,7 +1014,7 @@ ..\..\..\Python37_x64\libs\python37.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1052,7 +1052,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1083,7 +1083,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1114,7 +1114,7 @@ false - copy $(SolutionDir)srwlpy.pyd 
"$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1145,7 +1145,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1176,7 +1176,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1207,7 +1207,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1238,7 +1238,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1275,7 +1275,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1313,7 +1313,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1351,7 +1351,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1389,7 +1389,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1465,7 +1465,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1520,7 +1520,7 @@ ..\..\..\Python32\libs\python32.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1548,7 +1548,7 @@ 
..\..\..\Python27_x64\libs\python27.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1582,7 +1582,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1616,7 +1616,7 @@ MachineX64 - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" From 02637c5e57fd3d14a1b089584cb70baecd3a6877 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 14:04:01 -0500 Subject: [PATCH 9/9] Update SRWLClientPython project. --- cpp/vc/SRWLClientPython.vcxproj | 202 +++++++++++++++++++++++++++ cpp/vc/SRWLClientPython.vcxproj.user | 18 +++ 2 files changed, 220 insertions(+) diff --git a/cpp/vc/SRWLClientPython.vcxproj b/cpp/vc/SRWLClientPython.vcxproj index 25826e85..ed3d9959 100644 --- a/cpp/vc/SRWLClientPython.vcxproj +++ b/cpp/vc/SRWLClientPython.vcxproj @@ -9,6 +9,14 @@ Debug_Py2x x64 + + Debug_Py3_11_cuda + Win32 + + + Debug_Py3_11_cuda + x64 + Debug_Py3_11 Win32 @@ -73,6 +81,14 @@ Release_Py2x x64 + + Release_Py3_11_cuda + Win32 + + + Release_Py3_11_cuda + x64 + Release_Py3_11 Win32 @@ -190,6 +206,12 @@ Unicode true + + DynamicLibrary + v143 + Unicode + true + DynamicLibrary v143 @@ -220,6 +242,11 @@ v143 Unicode + + DynamicLibrary + v143 + Unicode + DynamicLibrary v143 @@ -283,6 +310,13 @@ true false + + DynamicLibrary + v143 + Unicode + true + false + DynamicLibrary v143 @@ -317,6 +351,12 @@ Unicode false + + DynamicLibrary + v143 + Unicode + false + DynamicLibrary v143 @@ -353,6 +393,9 @@ + + + @@ -371,6 +414,9 @@ + + + @@ -401,6 +447,9 @@ + + + @@ -419,6 +468,9 @@ + + + @@ -456,6 +508,11 @@ $(Platform)\$(Configuration)\ true + + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + 
$(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ @@ -503,6 +560,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + true + srwlpy + .pyd + $(ProjectDir) $(Platform)\$(Configuration)\ @@ -559,6 +623,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + false + srwlpy + .pyd + $(ProjectDir) $(Platform)\$(Configuration)\ @@ -608,6 +679,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + false + srwlpy + .pyd + $(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ @@ -794,6 +872,33 @@ + + + Disabled + ..\src\lib;..\..\..\Python33\include;%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_WINDOWS;_USRDLL;SRWLIB_CLIENT;SRWLCLIENTPYTHON_EXPORTS;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + + + ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + MachineX86 + false + + + + + + Disabled @@ -997,6 +1102,34 @@ + + + X64 + + + Disabled + ..\src\lib;..\..\..\Python311_x64\include;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + WIN32;_OFFLOAD_GPU;_DEBUG;_WINDOWS;_USRDLL;SRWLIB_CLIENT;SRWLCLIENTPYTHON_EXPORTS;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + NotUsing + Level3 + ProgramDatabase + + + ..\..\..\Python311_x64\libs\python311.lib;$(CUDA_PATH)\lib\x64\cudart_static.lib;$(CUDA_PATH)\lib\x64\cudadevrt.lib;srw_x64.lib;%(AdditionalDependencies) + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd + LIBCMT;%(IgnoreSpecificDefaultLibraries) + true + Windows + MachineX64 + + + + + + X64 @@ -1241,6 +1374,37 @@ copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + + MaxSpeed + false + ..\src\lib;..\..\..\Python36\include;%(AdditionalIncludeDirectories) + 
WIN32;NDEBUG;_WINDOWS;_USRDLL;SRWLCLIENTPYTHON_EXPORTS;SRWLIB_CLIENT;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + false + Default + true + + + ..\..\..\Python36\libs\python36.lib;srw_win32.lib;%(AdditionalDependencies) + srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + true + true + MachineX86 + false + + + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + X64 @@ -1506,6 +1670,44 @@ copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + + X64 + + + MaxSpeed + false + ..\src\lib;..\..\..\Python311_x64\include;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + WIN32;_OFFLOAD_GPU;NDEBUG;_WINDOWS;_USRDLL;SRWLCLIENTPYTHON_EXPORTS;SRWLIB_CLIENT;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + MultiThreaded + false + + + Level3 + None + Speed + OnlyExplicitInline + true + false + true + Precise + + + ..\..\..\Python311_x64\libs\python311.lib;$(CUDA_PATH)\lib\x64\cudart_static.lib;$(CUDA_PATH)\lib\x64\cudadevrt.lib;srw_x64.lib;%(AdditionalDependencies) + srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + true + true + MachineX64 + Default + srwlpy.pgd + + + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + Disabled diff --git a/cpp/vc/SRWLClientPython.vcxproj.user b/cpp/vc/SRWLClientPython.vcxproj.user index 0e4ca8e2..c029e307 100644 --- a/cpp/vc/SRWLClientPython.vcxproj.user +++ b/cpp/vc/SRWLClientPython.vcxproj.user @@ -36,6 +36,12 @@ ..\..\env\work\srw_python WindowsLocalDebugger + + C:\SoftwareDevelopments\Python39_x64\python.exe + ELETTRA-CDI-Source-Test-Tandem-350-eV.py + ..\..\env\work\srw_python + WindowsLocalDebugger + ..\..\Python37_x64\python.exe SRWLIB_Example04_test_mi4d_resize_mesh.py @@ -84,6 +90,12 @@ split-delay-test-vcc.py ..\..\env\work\srw_python + + C:\SoftwareDevelopments\Python38_x64\python.exe + WindowsLocalDebugger + split-delay-test-vcc.py + 
..\..\env\work\srw_python + C:\SoftwareDevelopments\Python27_x64\python.exe WindowsLocalDebugger @@ -114,6 +126,12 @@ ..\..\env\work\srw_python WindowsLocalDebugger + + python + test_hdf5_convert.py + ..\..\env\work\srw_python + WindowsLocalDebugger + python smf-preliminary-03-an-2d-test-01.py