From 09f6d279f0aeaea9c340f44db16ab83978dcd63f Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Sun, 3 Dec 2023 18:36:29 -0500 Subject: [PATCH 1/9] Import previous changes. --- cpp/src/core/gmfft.cpp | 1555 ++++++++++++++++++++++++++++++++++ cpp/src/core/gmfft.h | 1042 +++++++++++++++++++++++ cpp/src/core/gmfft_gpu.cu | 704 +++++++++++++++ cpp/src/core/gmfft_gpu.h | 43 + cpp/src/core/srradmnp.cpp | 613 +++++++------- cpp/src/core/srradmnp.h | 65 +- cpp/src/core/srradmnp_gpu.cu | 519 ++++++++++++ cpp/src/core/srradstr.h | 41 +- cpp/src/core/srradstr_gpu.cu | 330 ++++++++ cpp/src/lib/auxgpu.cpp | 368 ++++++++ cpp/src/lib/auxgpu.h | 62 ++ cpp/src/lib/srwlib.cpp | 59 +- cpp/src/lib/srwlib.h | 46 +- 13 files changed, 5099 insertions(+), 348 deletions(-) create mode 100644 cpp/src/core/gmfft.cpp create mode 100644 cpp/src/core/gmfft.h create mode 100644 cpp/src/core/gmfft_gpu.cu create mode 100644 cpp/src/core/gmfft_gpu.h create mode 100644 cpp/src/core/srradmnp_gpu.cu create mode 100644 cpp/src/core/srradstr_gpu.cu create mode 100644 cpp/src/lib/auxgpu.cpp create mode 100644 cpp/src/lib/auxgpu.h diff --git a/cpp/src/core/gmfft.cpp b/cpp/src/core/gmfft.cpp new file mode 100644 index 00000000..6e59db8a --- /dev/null +++ b/cpp/src/core/gmfft.cpp @@ -0,0 +1,1555 @@ +/************************************************************************//** + * File: gmfft.cpp + * Description: Auxiliary utilities to work with FFTW library + * Project: + * First release: 2000 + * + * Copyright (C) European Synchrotron Radiation Facility, Grenoble, France + * All Rights Reserved + * + * @author O.Chubar, P.Elleaume + * @author S. Yakubov (E-XFEL) - noticed issue and suggested fix in FFT1D + * @version 1.1 + ***************************************************************************/ + +#include "gmfft.h" + +#ifdef _OFFLOAD_GPU +#include "gmfft_gpu.h" +#endif + +//#include "srwlib.h" //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP + +#ifdef _WITH_OMP //OC27102018 +//SY: adopted for OpenMP +#include "omp.h" +#endif + +//************************************************************************* + +long CGenMathFFT::GoodNumbers[] = { + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 36, 40, 42, 44, + 48, 50, 52, 54, 56, 60, 64, 66, 70, 72, 78, 80, 84, 88, 90, 96, 98, 100, 104, + 108, 110, 112, 120, 126, 128, 130, 132, 140, 144, 150, 154, 156, 160, 162, + 168, 176, 180, 182, 192, 196, 198, 200, 208, 210, 216, 220, 224, 234, 240, + 250, 252, 256, 260, 264, 270, 280, 286, 288, 294, 300, 308, 312, 320, 324, + 330, 336, 350, 352, 360, 364, 378, 384, 390, 392, 396, 400, 416, 420, 432, + 440, 448, 450, 462, 468, 480, 486, 490, 500, 504, 512, 520, 528, 540, 546, + 550, 560, 572, 576, 588, 594, 600, 616, 624, 630, 640, 648, 650, 660, 672, + 686, 700, 702, 704, 720, 728, 750, 756, 768, 770, 780, 784, 792, 800, 810, + 832, 840, 858, 864, 880, 882, 896, 900, 910, 924, 936, 960, 972, 980, 990, + 1000, 1008, 1024, 1040, 1050, 1056, 1078, 1080, 1092, 1100, 1120, 1134, 1144, + 1152, 1170, 1176, 1188, 1200, 1232, 1248, 1250, 1260, 1274, 1280, 1296, 1300, + 1320, 1344, 1350, 1372, 1386, 1400, 1404, 1408, 1430, 1440, 1456, 1458, 1470, + 1500, 1512, 1536, 1540, 1560, 1568, 1584, 1600, 1620, 1638, 1650, 1664, 1680, + 1716, 1728, 1750, 1760, 1764, 1782, 1792, 1800, 1820, 1848, 1872, 1890, 1920, + 1944, 1950, 1960, 1980, 2000, 2002, 2016, 2048, 2058, 2080, 2100, 2106, 2112, + 2156, 2160, 2184, 2200, 2240, 2250, 2268, 2288, 2304, 2310, 2340, 2352, 2376, + 2400, 2430, 2450, 2464, 2496, 2500, 2520, 2548, 2560, 2574, 2592, 2600, 2640, + 2646, 2688, 2700, 2730, 2744, 2750, 2772, 2800, 2808, 2816, 2860, 2880, 2912, + 2916, 2940, 2970, 3000, 3024, 3072, 3080, 3120, 3136, 3150, 3168, 3200, 3234, + 3240, 3250, 3276, 3300, 3328, 3360, 3402, 3430, 3432, 3456, 3500, 3510, 3520, + 3528, 3564, 3584, 3600, 3640, 3696, 3744, 3750, 3780, 3822, 3840, 3850, 3888, + 3900, 3920, 3960, 4000, 
4004, 4032, 4050, 4096, 4116, 4158, 4160, 4200, 4212, + 4224, 4290, 4312, 4320, 4368, 4374, 4400, 4410, 4480, 4500, 4536, 4550, 4576, + 4608, 4620, 4680, 4704, 4752, 4800, 4802, 4860, 4900, 4914, 4928, 4950, 4992, + 5000, 5040, 5096, 5120, 5148, 5184, 5200, 5250, 5280, 5292, 5346, 5376, + 5390, 5400, 5460, 5488, 5500, 5544, 5600, 5616, 5632, 5670, 5720, 5760, 5824, + 5832, 5850, 5880, 5940, 6000, 6006, 6048, 6144, 6160, 6174, 6240, 6250, 6272, + 6300, 6318, 6336, 6370, 6400, 6468, 6480, 6500, 6552, 6600, 6656, 6720, 6750, + 6804, 6860, 6864, 6912, 6930, 7000, 7020, 7040, 7056, 7128, 7150, 7168, 7200, + 7280, 7290, 7350, 7392, 7488, 7500, 7546, 7560, 7644, 7680, 7700, 7722, 7776, + 7800, 7840, 7920, 7938, 8000, 8008, 8064, 8100, 8190, 8192, 8232, 8250, 8316, + 8320, 8400, 8424, 8448, 8580, 8624, 8640, 8736, 8748, 8750, 8800, 8820, 8910, + 8918, 8960, 9000, 9072, 9100, 9152, 9216, 9240, 9360, 9408, 9450, 9504, 9600, + 9604, 9702, 9720, 9750, 9800, 9828, 9856, 9900, 9984, 10000, 10010, 10080, + 10192, 10206, 10240, 10290, 10296, 10368, 10400, 10500, 10530, 10560, 10584, + 10692, 10752, 10780, 10800, 10920, 10976, 11000, 11088, 11200, 11232, 11250, + 11264, 11340, 11440, 11466, 11520, 11550, 11648, 11664, 11700, 11760, 11880, + 12000, 12012, 12096, 12150, 12250, 12288, 12320, 12348, 12474, 12480, 12500, + 12544, 12600, 12636, 12672, 12740, 12800, 12870, 12936, 12960, 13000, 13104, + 13122, 13200, 13230, 13312, 13440, 13500, 13608, 13650, 13720, 13728, 13750, + 13824, 13860, 14000, 14014, 14040, 14080, 14112, 14256, 14300, 14336, 14400, + 14406, 14560, 14580, 14700, 14742, 14784, 14850, 14976, 15000, 15092, 15120, + 15288, 15360, 15400, 15444, 15552, 15600, 15680, 15750, 15840, 15876, 16000, + 16016, 16038, 16128, 16170, 16200, 16250, 16380, 16384, 16464, 16500, 16632, + 16640, 16800, 16848, 16896, 17010, 17150, 17160, 17248, 17280, 17472, 17496, + 17500, 17550, 17600, 17640, 17820, 17836, 17920, 18000, 18018, 18144, 18200, + 18304, 18432, 18480, 18522, 18720, 18750, 
18816, 18900, 18954, 19008, 19110, + 19200, 19208, 19250, 19404, 19440, 19500, 19600, 19656, 19712, 19800, 19968, + 20000, 20020, 20160, 20250, 20384, 20412, 20480, 20580, 20592, 20736, 20790, + 20800, 21000, 21060, 21120, 21168, 21384, 21450, 21504, 21560, 21600, 21840, + 21870, 21952, 22000, 22050, 22176, 22400, 22464, 22500, 22528, 22638, 22680, + 22750, 22880, 22932, 23040, 23100, 23166, 23296, 23328, 23400, 23520, 23760, + 23814, 24000, 24010, 24024, 24192, 24300, 24500, 24570, 24576, 24640, 24696, + 24750, 24948, 24960, 25000, 25088, 25200, 25272, 25344, 25480, 25600, 25740, + 25872, 25920, 26000, 26208, 26244, 26250, 26400, 26460, 26624, 26730, 26754, + 26880, 26950, 27000, 27216, 27300, 27440, 27456, 27500, 27648, 27720, 28000, + 28028, 28080, 28160, 28224, 28350, 28512, 28600, 28672, 28800, 28812, 29106, + 29120, 29160, 29250, 29400, 29484, 29568, 29700, 29952, 30000, 30030, 30184, + 30240, 30576, 30618, 30720, 30800, 30870, 30888, 31104, 31200, 31250, 31360, + 31500, 31590, 31680, 31752, 31850, 32000, 32032, 32076, 32256, 32340, 32400, + 32500, 32760, 32768, 32928, 33000, 33264, 33280, 33600, 33614, 33696, 33750, + 33792, 34020, 34300, 34320, 34398, 34496, 34560, 34650, 34944, 34992, 35000, + 35100, 35200, 35280, 35640, 35672, 35750, 35840, 36000, 36036, 36288, 36400, + 36450, 36608, 36750, 36864, 36960, 37044, 37422, 37440, 37500, 37632, 37730, + 37800, 37908, 38016, 38220, 38400, 38416, 38500, 38610, 38808, 38880, 39000, + 39200, 39312, 39366, 39424, 39600, 39690, 39936, 40000, 40040, 40320, 40500, + 40768, 40824, 40950, 40960, 41160, 41184, 41250, 41472, 41580, 41600, 42000, + 42042, 42120, 42240, 42336, 42768, 42900, 43008, 43120, 43200, 43218, 43680, + 43740, 43750, 43904, 44000, 44100, 44226, 44352, 44550, 44590, 44800, 44928, + 45000, 45056, 45276, 45360, 45500, 45760, 45864, 46080, 46200, 46332, 46592, + 46656, 46800, 47040, 47250, 47520, 47628, 48000, 48020, 48048, 48114, 48384, + 48510, 48600, 48750, 49000, 49140, 49152, 49280, 49392, 49500, 
49896, 49920, + 50000, 50050, 50176, 50400, 50544, 50688, 50960, 51030, 51200, 51450, 51480, + 51744, 51840, 52000, 52416, 52488, 52500, 52650, 52800, 52822, 52920, 53248, + 53460, 53508, 53760, 53900, 54000, 54054, 54432, 54600, 54880, 54912, 55000, + 55296, 55440, 55566, 56000, 56056, 56160, 56250, 56320, 56448, 56700, 56862, + 57024, 57200, 57330, 57344, 57600, 57624, 57750, 58212, 58240, 58320, 58500, + 58800, 58968, 59136, 59400, 59904, 60000, 60060, 60368, 60480, 60750, 61152, + 61236, 61250, 61440, 61600, 61740, 61776, 62208, 62370, 62400, 62426, 62500, + 62720, 63000, 63180, 63360, 63504, 63700, 64000, 64064, 64152, 64350, 64512, + 64680, 64800, 65000, 65520, 65536, 65610, 65856, 66000, 66150, 66528, 66560, + 67200, 67228, 67392, 67500, 67584, 67914, 68040, 68250, 68600, 68640, 68750, + 68796, 68992, 69120, 69300, 69498, 69888, 69984, 70000, 70070, 70200, 70400, + 70560, 71280, 71344, 71442, 71500, 71680, 72000, 72030, 72072, 72576, 72800, + 72900, 73216, 73500, 73710, 73728, 73920, 74088, 74250, 74844, 74880, 75000, + 75264, 75460, 75600, 75816, 76032, 76440, 76800, 76832, 77000, 77220, 77616, + 77760, 78000, 78400, 78624, 78732, 78750, 78848, 79200, 79380, 79872, 80000, + 80080, 80190, 80262, 80640, 80850, 81000, 81250, 81536, 81648, 81900, 81920, + 82320, 82368, 82500, 82944, 83160, 83200, 84000, 84084, 84240, 84480, 84672, + 85050, 85536, 85750, 85800, 86016, 86240, 86400, 86436, 87318, 87360, 87480, + 87500, 87750, 87808, 88000, 88200, 88452, 88704, 89100, 89180, 89600, 89856, + 90000, 90090, 90112, 90552, 90720, 91000, 91520, 91728, 91854, 92160, 92400, + 92610, 92664, 93184, 93312, 93600, 93750, 94080, 94500, 94770, 95040, 95256, + 95550, 96000, 96040, 96096, 96228, 96250, 96768, 97020, 97200, 97500, 98000, + 98098, 98280, 98304, 98560, 98784, 99000, 99792, 99840, 100000 +}; +long CGenMathFFT::LenGoodNumbers = 1151; //637; + +long CGenMathFFT::GoodNum100s[] = { 0,37,61,79,95,107,120,130,142,151,159 }; +long CGenMathFFT::LenGoodNum100s = 11; + +long 
CGenMathFFT::GoodNum1000s[] = { 0,159,228,279,318,354,383,410,435,459,479 }; +long CGenMathFFT::LenGoodNum1000s = 11; + +long CGenMathFFT::GoodNum10000s[] = { 0,479,636,743,830,900,960,1017,1064,1109,1150 }; +long CGenMathFFT::LenGoodNum10000s = 11; + +#ifdef _OFFLOAD_GPU +long CGenMathFFT1D::PlanLen; +long CGenMathFFT1D::dPlanLen; +long CGenMathFFT1D::HowMany; +long CGenMathFFT1D::dHowMany; +cufftHandle CGenMathFFT1D::Plan1DFFT_cu; +cufftHandle CGenMathFFT1D::dPlan1DFFT_cu; +#endif + +#ifdef _OFFLOAD_GPU +long CGenMathFFT2D::PlanNx; +long CGenMathFFT2D::PlanNy; +long CGenMathFFT2D::HowMany; +long CGenMathFFT2D::dPlanNx; +long CGenMathFFT2D::dPlanNy; +long CGenMathFFT2D::dHowMany; +cufftHandle CGenMathFFT2D::Plan2DFFT_cu; +cufftHandle CGenMathFFT2D::dPlan2DFFT_cu; +#endif +//************************************************************************* + +void CGenMathFFT::NextCorrectNumberForFFT(long& n) +//void CGenMathFFT::NextCorrectNumberForFFT(long long& n) //OC26042019 +{ + if(n < 4) + { + n = 4; return; + } + if(n < 100001) + { + long *pGoodPrev, *pGoodNext; + + long n_d_10000 = long(n*0.0001); + if(n_d_10000 > 0) pGoodPrev = GoodNumbers + GoodNum10000s[n_d_10000] - 1; + else + { + long n_d_1000 = long(n*0.001); + if(n_d_1000 > 0) pGoodPrev = GoodNumbers + GoodNum1000s[n_d_1000] - 1; + else + { + long n_d_100 = long(n*0.01); + if(n_d_100 > 0) pGoodPrev = GoodNumbers + GoodNum100s[n_d_100] - 1; + else pGoodPrev = GoodNumbers; + } + } + pGoodNext = pGoodPrev + 1; + for(;;) + { + if((n > *(pGoodPrev++)) && (n <= *pGoodNext)) + { + n = *pGoodNext; return; + } + pGoodNext++; + } + } + else + { + //OC23072020: sorted multiplies by ratios of power of first prime numbers bw 1 and 2 + const double arTestMults[] = {10./9., 9./8., 6./5., 5./4., 4./3., 3./2., 8./5., 5./3., 16./9., 15./8.}; + const int nTestMults = 10; + + //long k = 16384; + //long k = 65536; + long k = 99000; //OC23072020 (make sure this number is < 100001, and divides by 9,8,5) + + for(int j=0; j<100; 
j++) + { + //OC23072020 (added tests of intermed numbers obtained by multiplying k by a factor bw 1 and 2) + bool intermedNumFound = false; + for(int m=0; m= 0.5) kTest++; + if(n <= kTest) + { + n = kTest; + intermedNumFound = true; + break; + } + } + if(intermedNumFound) break; + + k <<= 1; + if(n <= k) + { + n = k; break; + } + } + } +} + +//************************************************************************* +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG18072022 +int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 +{ +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, + { + //HG03082022 GPU can do an inplace fft without being given a temporary buffer + FFT1DInfo.pOutData = FFT1DInfo.pInData; + int result; + if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 + //if (result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; + }) + else +#endif + { + //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; + long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); + float* AuxDataCont = new float[TotAmOfPo]; + if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; + FFT1DInfo.pOutData = AuxDataCont; + + int result; + if(result = Make1DFFT(FFT1DInfo)) return result; + + float *tOut = FFT1DInfo.pInData, *t = AuxDataCont; + for(int ix=0; ix RelShiftTol*xStepNx); + NeedsShiftAfterY = (::fabs(y0_After) > RelShiftTol*yStepNy); + + double xStartTr = -0.5/FFT2DInfo.xStep; + double yStartTr = -0.5/FFT2DInfo.yStep; + + NeedsShiftBeforeX = NeedsShiftBeforeY = 0; + double x0_Before = 0., y0_Before = 0.; + if(FFT2DInfo.UseGivenStartTrValues) + { + x0_Before = (FFT2DInfo.xStartTr - xStartTr); // Sign should be probably reversed here: check!!! 
+ y0_Before = (FFT2DInfo.yStartTr - yStartTr); // Sign should be probably reversed here: check!!! + + NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); + NeedsShiftBeforeY = (::fabs(y0_Before) > RelShiftTol*(::fabs(yStartTr))); + } + + //ArrayShiftX = 0; ArrayShiftY = 0; + m_ArrayShiftX = 0; m_ArrayShiftY = 0; //OC02022019 + m_dArrayShiftX = 0; m_dArrayShiftY = 0; + if (FFT2DInfo.pData != 0) + { + if (NeedsShiftBeforeX || NeedsShiftAfterX) + { + //ArrayShiftX = new float[Nx << 1]; + //if(ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + m_ArrayShiftX = new float[Nx << 1]; + if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + } + if (NeedsShiftBeforeY || NeedsShiftAfterY) + { + //ArrayShiftY = new float[Ny << 1]; + //if(ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + m_ArrayShiftY = new float[Ny << 1]; + if (m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + } + } + else if (FFT2DInfo.pdData != 0) + { + if (NeedsShiftBeforeX || NeedsShiftAfterX) + { + m_dArrayShiftX = new double[Nx << 1]; + if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + } + if (NeedsShiftBeforeY || NeedsShiftAfterY) + { + m_dArrayShiftY = new double[Ny << 1]; + if (m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + } + } + +#ifdef _FFTW3 + fftwf_plan Plan2DFFT; + fftw_plan dPlan2DFFT; + fftwf_complex* DataToFFT = 0; + fftw_complex* dDataToFFT = 0; +#endif + +//HG18072022 +//#ifdef _DEBUG +// if (pGpuUsage != NULL) +// printf ("GPU: Make2DFFT\n"); +//#endif + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 + { + if(FFT2DInfo.pData != 0) + { + DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(float)); //OC06092023 + //DataToFFT = (fftwf_complex*)AuxGPU::ToDevice(pGpuUsage, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * 
sizeof(float)); + } + else if(FFT2DInfo.pdData != 0) + { + dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); //OC06092023 + //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); + } + }) + else +#endif + { +#if _FFTW3 //OC28012019 + if (FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); + else if (FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 + +#else + fftwnd_plan Plan2DFFT; + FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT2DInfo.pData); +#endif + } + + char t0SignMult = (FFT2DInfo.Dir > 0)? -1 : 1; + + //if(NeedsShiftBeforeX) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep); + //if(NeedsShiftBeforeY) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep); + if(NeedsShiftBeforeX) + {//OC02022019 + if(m_ArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); + } + if(NeedsShiftBeforeY) + {//OC02022019 + if(m_ArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid 
#include "auxgpu.h" for CPU) + GPU_COND(pvGPU, { //OC06092023 + //GPU_COND(pGpuUsage, { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_ArrayShiftX, m_ArrayShiftY); + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + 
//AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_dArrayShiftX, m_dArrayShiftY); + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 +#endif + } + } + + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; + double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 + if (FFT2DInfo.Dir > 0) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 + { + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) + { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL) + { + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + HowMany = FFT2DInfo.howMany; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = 
*(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + + auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); +// if (res != CUFFT_SUCCESS) +// printf("CUFFT Error: %d\r\n", res); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) + { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL) + { + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + dHowMany = FFT2DInfo.howMany; //(fix: was "HowMany" - the double-precision plan cache key must be dHowMany, as checked above) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&dPlan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); //(fix: was "&Plan2DFFT_cu" - the Z2Z plan clobbered the single-precision handle and a stale dPlan2DFFT_cu was executed below) + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); + } + }) + else +#endif + { + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#if _FFTW3 //OC28012019 + + for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + + fftw_execute(dPlan2DFFT); + 
} + } + +#else + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); +#endif + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, (float)Mult); //OC06092023 + } + else if (dDataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + } + alreadyNormalized = true; + }) + else +#endif + { + if (DataToFFT != 0) + { + RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); + RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); + } + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) + { + RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); + RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); + } +#endif + } + } + else + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL){ + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + 
PlanNx = Nx; + PlanNy = Ny; + HowMany = FFT2DInfo.howMany; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL){ + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + dHowMany = FFT2DInfo.howMany; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&dPlan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); //(fix: was "&Plan2DFFT_cu" - the Z2Z plan must go into the double-precision handle that is executed below) + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_INVERSE); + } + }) + else +#endif + { + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#ifdef _FFTW3 //OC28012019 + for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if 
(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(iHowMany == 0) { RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); } //(fix: these calls treat the full howMany batch; calling them on every loop pass applied the half-period rotation/sign-repair howMany times - the forward path applies them exactly once, after its loop) + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + if(iHowMany == 0) { RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); } //(fix: same as float branch - whole-batch rotation/sign-repair must run once, not per howMany slice) + fftw_execute(dPlan2DFFT); + } + } +#else + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(DataToFFT); + RepairSignAfter2DFFT(DataToFFT); + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); +#endif + } + } + + if (!alreadyNormalized){ +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + else if (dDataToFFT != 0) + NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + }) + else +#endif + { + if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult, FFT2DInfo.howMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult, FFT2DInfo.howMany); +#endif + } + } + + //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); + //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); + + if (NeedsShiftAfterX) + {//OC02022019 + if (m_ArrayShiftX != 0) 
FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); + } + if (NeedsShiftAfterY) + {//OC02022019 + if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); + else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); + } + if (NeedsShiftAfterX || NeedsShiftAfterY) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + 
m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 +#endif + } + } + + //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") + //fftwnd_destroy_plan(Plan2DFFT); + //OC27102018 + //SY: adopted for OpenMP +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 + { + if (FFT2DInfo.pData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, DataToFFT, true, false); + } + 
else if (FFT2DInfo.pdData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, dDataToFFT, true, false); + } + }) + else +#endif + { +#if _FFTW3 //OC28012019 + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); + } + else if (dDataToFFT != 0) //OC03022019 + { + if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); + } +#else + if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); +#endif + } + + //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} + //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} + if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} + if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} + if (m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 + if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} + + return 0; +} + +//************************************************************************* +//Forward FFT: Int f(x)*exp(-i*2*Pi*qx*x)dx +//Backward FFT: Int f(qx)*exp(i*2*Pi*qx*x)dqx +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG20012022 +int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC05092023 +{// Assumes Nx, Ny even ! + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //double start; + //get_walltime (&start); + + const double RelShiftTol = 1.E-06; + + SetupLimitsTr(FFT1DInfo); + + double xStepNx = FFT1DInfo.Nx*FFT1DInfo.xStep; + double x0_After = FFT1DInfo.xStart + 0.5*xStepNx; + NeedsShiftAfterX = FFT1DInfo.ApplyAutoShiftAfter && (::fabs(x0_After) > RelShiftTol*xStepNx); + + double xStartTr = -0.5/FFT1DInfo.xStep; + + NeedsShiftBeforeX = 0; + double x0_Before = 0.; + + if(FFT1DInfo.UseGivenStartTrValue) + { + x0_Before = (FFT1DInfo.xStartTr - xStartTr); + NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); + } + + m_ArrayShiftX = 0; + m_dArrayShiftX = 0; + if (NeedsShiftBeforeX || NeedsShiftAfterX) + { + if (FFT1DInfo.pInData != 0) + { + m_ArrayShiftX = new float[Nx << 1]; + if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) + m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 +#endif + } + else if (FFT1DInfo.pdInData != 0) + { + m_dArrayShiftX = new double[Nx << 1]; + if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 + m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //HG20012022 +#endif + } + } + +#ifdef _FFTW3 //OC28012019 + fftwf_plan Plan1DFFT; + fftwf_complex* DataToFFT = 0, * OutDataFFT = 0; //, *pOutDataFFT=0; + + fftw_plan dPlan1DFFT; + fftw_complex* dDataToFFT = 0, * dOutDataFFT = 0; //, *pdOutDataFFT=0; +#endif + +//HG20012022 +//#ifdef _DEBUG +// if (pGpuUsage != NULL) +// printf ("GPU: Make1DFFT\n"); +//#endif +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + 
GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG20012022 + { + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 + OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + //DataToFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); + //OutDataFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); //OC06092023 + dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); + //dOutDataFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + } + }) + else +#endif + { +#ifdef _FFTW3 //OC28012019 + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); + OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); + //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); + dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); + //pdOutDataFFT = dOutDataFFT; + } +#else + fftw_plan Plan1DFFT; + FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); 
+ FFTW_COMPLEX* OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); + FFTW_COMPLEX* pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + /** + Pointed-out by Sergey Yakubov (E-XFEL). + From FFTW 2.1.5 docs: + void fftw(fftw_plan plan, int howmany, + fftw_complex *in, int istride, int idist, + fftw_complex *out, int ostride, int odist); + ... + out, ostride and odist describe the output array(s). The format is the same as for the input array. + In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. + If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, + that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. + In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). + **/ +#endif + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + char t0SignMult = (FFT1DInfo.Dir > 0) ? 
-1 : 1; + if (NeedsShiftBeforeX) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); + + if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + }) + else +#endif + { + //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); + if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); + + if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); + +#ifdef _FFTW3 //OC27022019 + else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); +#endif + } + } + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : before fft",&start); + + int flags = FFTW_ESTIMATE; //OC30012019 + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT1DInfo.xStep; + double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; + + if (FFT1DInfo.Dir > 0) //HG17112021 + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + int arN[] = { (int)Nx }; //OC14052020 + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) { + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + dPlanLen = Nx; + cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); + } + }) + else +#endif + { + //int flags = FFTW_ESTIMATE; +#ifdef _FFTW3 //OC28012019 +#ifdef _WITH_OMP + //Still needs to be tested! 
+ if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) //OC02022019 + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } +#endif //ifndef _WITH_OMP + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 + if (Plan1DFFT == 0) return ERROR_IN_FFT; + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + fftw_execute(dPlan1DFFT); + } + +#else //ifndef _FFTW3 + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); + +#ifndef _WITH_OMP //OC27102018 + //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 +#else //OC27102018 + //SY: split one call into many (for OpenMP) +#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) + for (int i = 0; i < FFT1DInfo.HowMany; i++) + { + //SY: do not use OutDataFFT as scratch space if in-place + if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); + } +#endif +#endif + } + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft dir>0",&start); + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if (OutDataFFT != 0) + { + RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 + //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + } + else if (dOutDataFFT != 0) + { + RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + } + alreadyNormalized = true; + }) + else +#endif + { + if (OutDataFFT != 0) + { + RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + } +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) + { + RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(dOutDataFFT, 
FFT1DInfo.HowMany); + } +#endif + } + } + else + { + //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + HowMany = FFT1DInfo.HowMany; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) + { + dPlanLen = Nx; + dHowMany = FFT1DInfo.HowMany; + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + + RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_INVERSE); + } + }) + else +#endif + { +#ifdef _FFTW3 //OC28012019 +#ifdef _WITH_OMP + + //Still needs to be tested! 
+ if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } + +#endif + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 + if (Plan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + fftw_execute(dPlan1DFFT); + } +#else //ifndef _FFTW3 + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); + + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); + + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : repair dir<0",&start); + +#ifndef _WITH_OMP //OC27102018 + //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 +#else //OC27102018 + //SY: split one call into many (for OpenMP) +#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) + for (int i = 0; i < FFT1DInfo.HowMany; i++) + { + if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); + } +#endif +#endif //_FFTW3 + } + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft dir<0",&start); + } + + if (!alreadyNormalized) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + if (OutDataFFT != 0) { + NormalizeDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + } + else if (dOutDataFFT != 0) + NormalizeDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + }) + else +#endif + { + if (OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); +#endif + } + } + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : NormalizeDataAfter1DFFT",&start); + + if (NeedsShiftAfterX) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 + else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); + + if (OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if (dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + }) + else +#endif + { + //FillArrayShift(t0SignMult*x0_After, FFT1DInfo.xStepTr); + if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 + else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); + + if (OutDataFFT != 0) TreatShift(OutDataFFT, FFT1DInfo.HowMany); +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); +#endif + } + } + + if(FFT1DInfo.TreatSharpEdges) + { + int result = ProcessSharpEdges(FFT1DInfo); + if(result) return result; + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, OutDataFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, OutDataFFT, true, false); + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dOutDataFFT, true, false); //OC06092023 + //AuxGpu::MarkUpdated(pGpuUsage, dOutDataFFT, true, false); + } + }) + else +#endif + { + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : ProcessSharpEdges",&start); + + //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") + //OC27102018: thread safety issue? +#ifdef _FFTW3 //OC29012019 + + if(DataToFFT != 0) fftwf_destroy_plan(Plan1DFFT); + else if(dDataToFFT != 0) fftw_destroy_plan(dPlan1DFFT); + +#ifdef _WITH_OMP + + if(DataToFFT != 0) fftwf_cleanup_threads(); //?? + else if(dDataToFFT != 0) fftw_cleanup_threads(); + +#endif +#else //ifndef _FFTW3 + + fftw_destroy_plan(Plan1DFFT); + +#endif + } + + if (m_ArrayShiftX != 0) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); +#endif + delete[] m_ArrayShiftX; + } + if (m_dArrayShiftX != 0) + { +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); +#endif + delete[] m_dArrayShiftX; + } + + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : after fft ",&start); + return 0; +} + +//************************************************************************* + +int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr, char dataType) +//int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr) +{ + double Step = FFT1DInfo.xStep, Start = FFT1DInfo.xStart; + double AbsTol = 0.05*Step; + + double EdgeMinOffsetFromStart = FFT1DInfo.LeftSharpEdge - Start; + long iEdgeMinLower = long(EdgeMinOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! + double EdgeMinLowerMisfit = EdgeMinOffsetFromStart - iEdgeMinLower*Step; + + double EdgeMaxOffsetFromStart = FFT1DInfo.RightSharpEdge - Start; + long iEdgeMaxLower = long(EdgeMaxOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! 
+ double EdgeMaxLowerMisfit = EdgeMaxOffsetFromStart - iEdgeMaxLower*Step; + + char EdgeMinIsBetweenMeshPoints = (EdgeMinLowerMisfit > AbsTol); + char EdgeMaxIsBetweenMeshPoints = (EdgeMaxLowerMisfit > AbsTol); + char EdgeMaxIsSmallerThanDataEnd = (::fabs((Start + FFT1DInfo.Nx*Step) - FFT1DInfo.RightSharpEdge) > AbsTol); + char EdgeCorrNeeded = (EdgeMinIsBetweenMeshPoints || EdgeMaxIsBetweenMeshPoints || EdgeMaxIsSmallerThanDataEnd); + + //float dSt = 0.; + //if(EdgeMinIsBetweenMeshPoints) dSt = (float)(Step - EdgeMinLowerMisfit); + //float dFi = 0.; + //if(EdgeMaxIsBetweenMeshPoints) dFi = (float)(Step - EdgeMaxLowerMisfit); + //else if(EdgeMaxIsSmallerThanDataEnd) dFi = (float)(0.5*Step); + + //OC02022019 + double dSt = 0.; + if(EdgeMinIsBetweenMeshPoints) dSt = Step - EdgeMinLowerMisfit; + double dFi = 0.; + if(EdgeMaxIsBetweenMeshPoints) dFi = Step - EdgeMaxLowerMisfit; + else if(EdgeMaxIsSmallerThanDataEnd) dFi = 0.5*Step; + + CGenMathFFT1DInfo FFT1DInfoLoc = FFT1DInfo; + FFT1DInfoLoc.UseGivenStartTrValue = 0; + CGenMathFFT1D FFT1D; + FFT1D.SetupLimitsTr(FFT1DInfoLoc); + + if(EdgeCorrNeeded) + { + AuxDataForSharpEdgeCorr.d = Step; + long TwoN = FFT1DInfo.Nx << 1; + + if(dSt != 0.) + { + if(dataType == 'f') + { + AuxDataForSharpEdgeCorr.ExpArrSt = new float[TwoN]; + if(AuxDataForSharpEdgeCorr.ExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; + } + else if(dataType == 'd') //OC02022019 + { + AuxDataForSharpEdgeCorr.dExpArrSt = new double[TwoN]; + if(AuxDataForSharpEdgeCorr.dExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; + } + + AuxDataForSharpEdgeCorr.dSt = dSt; + long jSt = iEdgeMinLower + 1; + AuxDataForSharpEdgeCorr.iSt = jSt; + + double ArgjSt = Start + jSt*Step; + SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrSt, FFT1DInfoLoc.Nx, ArgjSt, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); + } + if(dFi != 0.) 
+ { + if(dataType == 'f') + { + AuxDataForSharpEdgeCorr.ExpArrFi = new float[TwoN]; + if(AuxDataForSharpEdgeCorr.ExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; + } + else if(dataType == 'd') + { + AuxDataForSharpEdgeCorr.dExpArrFi = new double[TwoN]; + if(AuxDataForSharpEdgeCorr.dExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; + } + + AuxDataForSharpEdgeCorr.dFi = dFi; + double ArgjFi = Start + iEdgeMaxLower*Step; + AuxDataForSharpEdgeCorr.iFi = iEdgeMaxLower; + + SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrFi, FFT1DInfoLoc.Nx, ArgjFi, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); + } + AuxDataForSharpEdgeCorr.WasSetUp = 1; + } + return 0; +} + +//************************************************************************* + +void CGenMathFFT1D::MakeSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxData) +{ + double fSRe, fSIm, fFRe, fFIm; + double ExpStRe, ExpStIm, ExpFiRe, ExpFiIm, Re, Im; + long Two_i, Two_i_p_1; + + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + float *t = FFT1DInfo.pOutData; + float *tSt = FFT1DInfo.pInData + (AuxData.iSt << 1); + float *tFi = FFT1DInfo.pInData + (AuxData.iFi << 1); + fSRe = *tSt, fSIm = *(tSt + 1); + fFRe = *tFi, fFIm = *(tFi + 1); + + for(long i=0; i +#include + +#ifndef _GM_WITHOUT_BASE +#include "gmobj.h" +#endif + +#ifdef _WITH_OMP //OC31102018: Pre-processor definition for compiling SRW with OpenMP library +#include "omp.h" +#endif + +#ifndef MEMORY_ALLOCATION_FAILURE +#define MEMORY_ALLOCATION_FAILURE 8 + 10000 //in line with SRW +#endif +#ifndef ERROR_IN_FFT +#define ERROR_IN_FFT 40 + 10000 +#endif + +//************************************************************************* + +class CGenMathFFT //{ +#ifndef _GM_WITHOUT_BASE + : public CGenMathObj +#endif +{//OC01052013 + double a2c, a4c, a6c, a8c, a10c, a12c; + double a3s, a5s, a7s, a9s, a11s, a13s; + +protected: + + static long GoodNumbers[]; + static long LenGoodNumbers; + static long GoodNum100s[]; + 
static long LenGoodNum100s; + static long GoodNum1000s[]; + static long LenGoodNum1000s; + static long GoodNum10000s[]; + static long LenGoodNum10000s; + +public: + + double HalfPI, PI, TwoPI, ThreePIdTwo, One_dTwoPI; // Constants + + CGenMathFFT() + { + HalfPI = 1.5707963267949; + PI = 3.141592653590; + TwoPI = 6.2831853071796; + ThreePIdTwo = 4.7123889803847; + One_dTwoPI = 0.1591549430919; + a2c = -0.5; a4c = 0.041666666666667; a6c = -0.0013888888888889; a8c = 0.000024801587301587; a10c = -2.755731922E-07; + a3s = -0.16666666666667; a5s = 0.0083333333333333; a7s = -0.0001984126984127; a9s = 2.755731922E-06; a11s = -2.505210839E-08; + } + + void CosAndSin(double x, float& Cos, float& Sin) + { + x -= TwoPI*int(x*One_dTwoPI); + if(x < 0.) x += TwoPI; + + char ChangeSign=0; + if(x > ThreePIdTwo) x -= TwoPI; + else if(x > HalfPI) { x -= PI; ChangeSign = 1;} + + double xe2 = x*x; + Cos = float(1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c))))); + Sin = float(x*(1. + xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s)))))); + if(ChangeSign) { Cos = -Cos; Sin = -Sin;} + } + void CosAndSin(double x, double& Cos, double& Sin) //OC02022019 + { + //x -= TwoPI*int(x*One_dTwoPI); + x -= TwoPI*((long long)(x*One_dTwoPI)); + + if(x < 0.) x += TwoPI; + + char ChangeSign=0; + if(x > ThreePIdTwo) x -= TwoPI; + else if(x > HalfPI) { x -= PI; ChangeSign = 1;} + + double xe2 = x*x; + Cos = 1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c)))); + Sin = x*(1. 
+ xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s))))); + if(ChangeSign) { Cos = -Cos; Sin = -Sin;} + } + + //void NextCorrectNumberForFFT(long long&); //OC26042019 + void NextCorrectNumberForFFT(long&); +}; + +//************************************************************************* + +struct CGenMathFFT2DInfo { + float* pData; + double* pdData; //OC31012019 + + char Dir; // >0: forward; <0: backward + double xStep, yStep, xStart, yStart; + double xStepTr, yStepTr, xStartTr, yStartTr; + long Nx, Ny; + //long long Nx, Ny; + + long howMany; //OC151014 + long iStride, iDist; //OC151014 + //From FFTW 2.1.5 Tutorial + //iStride and iDist describe the input array(s). + //There are howMany multi-dimensional input arrays; the first one is pointed to by in (= pData), + //the second one is pointed to by in + iDist, and so on, up to in + (howMany - 1) * iDist. + //Each multi-dimensional input array consists of complex numbers (see Section Data Types), + //stored in row-major format (see Section Multi-dimensional Array Format), which are not necessarily contiguous in memory. + //Specifically, in[0] is the first element of the first array, in[istride] is the second element of the first array, and so on. + //In general, the i-th element of the j-th input array will be in position in[i * istride + j * idist]. + //Note that, here, i refers to an index into the row-major format for the multi-dimensional array, rather than an index in any particular dimension. + //In-place transforms: For plans created with the FFTW_IN_PLACE option, the transform is computed in-place--the output is returned in the in array, + //using the same strides, etcetera, as were used in the input. 
+ + char UseGivenStartTrValues; + double ExtraMult; //OC20112017 + + CGenMathFFT2DInfo() + { + howMany = 1; iStride = 1; iDist = 0; //OC151014 + UseGivenStartTrValues = 0; + ExtraMult = 1.; //OC20112017 + + pData = 0; //OC31012019 + pdData = 0; + } +}; + +//************************************************************************* + +class CGenMathFFT2D : public CGenMathFFT { + + long Nx, Ny; + long HalfNx, HalfNy; + //long long Nx, Ny; + //long long HalfNx, HalfNy; + char NeedsShiftBeforeX, NeedsShiftBeforeY, NeedsShiftAfterX, NeedsShiftAfterY; + //float *ArrayShiftX, *ArrayShiftY; + float *m_ArrayShiftX, *m_ArrayShiftY; //OC02022019 + double *m_dArrayShiftX, *m_dArrayShiftY; + +#ifdef _OFFLOAD_GPU + static long PlanNx, PlanNy, HowMany; + static long dPlanNx, dPlanNy, dHowMany; + static cufftHandle Plan2DFFT_cu; + static cufftHandle dPlan2DFFT_cu; +#endif + +public: + CGenMathFFT2D() + { + NeedsShiftBeforeX = NeedsShiftBeforeY = NeedsShiftAfterX = NeedsShiftAfterY = 0; +#ifdef _OFFLOAD_GPU + HowMany = PlanNx = PlanNy = dHowMany = dPlanNx = dPlanNy = 0; + Plan2DFFT_cu = dPlan2DFFT_cu = 0; +#endif + } + + //int Make2DFFT(CGenMathFFT2DInfo&); + //Modification by S.Yakubov for parallelizing SRW via OpenMP: +#ifdef _FFTW3 //28012019 + int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, void* pvGPU = 0); //OC05092023 + //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, gpuUsageArg *pGpuUsage = 0); //OC02022019 + //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0); +#else + int Make2DFFT(CGenMathFFT2DInfo&, fftwnd_plan* pPrecreatedPlan2DFFT=0); //OC27102018 +#endif + + int AuxDebug_TestFFT_Plans(); + + void SetupLimitsTr(CGenMathFFT2DInfo& FFT2DInfo) + {// Modify this if Make2DFFT is modified ! 
+ Nx = FFT2DInfo.Nx; Ny = FFT2DInfo.Ny; + HalfNx = (Nx >> 1); HalfNy = (Ny >> 1); + + double xStartTr = -0.5/FFT2DInfo.xStep; + FFT2DInfo.xStepTr = -xStartTr/HalfNx; + + double yStartTr = -0.5/FFT2DInfo.yStep; + FFT2DInfo.yStepTr = -yStartTr/HalfNy; + + if(!FFT2DInfo.UseGivenStartTrValues) + { + FFT2DInfo.xStartTr = xStartTr; + FFT2DInfo.yStartTr = yStartTr; + } + } + + template void FillArrayShift(char x_or_y, double t0, double tStep, T* arShift) //OC02022019 + //void FillArrayShift(char x_or_y, double t0, double tStep) + { + T* tArrayShift = arShift; + //float* tArrayShift; + //long N; + long N = (x_or_y == 'x')? Nx : Ny; + //if(x_or_y == 'x') { tArrayShift = m_ArrayShiftX; N = Nx;} + //else { tArrayShift = m_ArrayShiftY; N = Ny;} + + T *tp = tArrayShift + N; + //float *tp = tArrayShift + N; + *tp = 1.; *(tp+1) = 0.; tp += 2; + T *tm = tp - 4; + //float *tm = tp - 4; + + double t0TwoPI = t0*TwoPI; + double q = tStep; + long HalfN = N >> 1; + for(int i=0; i void RotateDataAfter2DFFT(T* pAfterFFT, long HowMany) + //void RotateDataAfter2DFFT(fftwf_complex* pAfterFFT) + {// Assumes Nx, Ny even ! + //OC281117: Make it work for odd Nx, Ny as well! + //OC281117: Consider combining RotateDataAfter2DFFT, RepairSignAfter2DFFT, NormalizeDataAfter2DFFT + //long HalfNyNx = HalfNy*Nx; + long long HalfNyNx = ((long long)HalfNy)*((long long)Nx); + + for(long iHowMany=0; iHowManyre *= s; (t++)->im *= s; s = -s; + } + sy0 = -sy0; + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + void NormalizeDataAfter2DFFT(fftwf_complex* pAfterFFT, double Mult, long HowMany) + {// Assumes Nx, Ny even ! + //OC281117: To make it work for odd Nx, Ny as well in the future! 
+ float fMult = (float)Mult; + long long NxNy = ((long long)Nx)*((long long)Ny); + for(long iHowMany=0; iHowManyre *= (FFTW_REAL)Mult; (t++)->im *= (FFTW_REAL)Mult; + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + void TreatShifts(fftwf_complex* pData, long HowMany) + { + fftwf_complex *t = pData; + char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; + char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; + + for(long iHowMany=0; iHowManyre*MultRe - t->im*MultIm; +// float NewIm = t->re*MultIm + t->im*MultRe; +// t->re = NewRe; +// (t++)->im = NewIm; +// #endif + } + } + } + } +#else + void TreatShifts(FFTW_COMPLEX* pData) + { + FFTW_COMPLEX *t = pData; + char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; + char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; + + float *tShiftY = m_ArrayShiftY; + float MultY_Re = 1., MultY_Im = 0., MultX_Re = 1., MultX_Im = 0.; + float MultRe, MultIm; + + for(long iy=0; iyre*MultRe - t->im*MultIm; + float NewIm = t->re*MultIm + t->im*MultRe; + t->re = NewRe; + (t++)->im = NewIm; + } + } + } +#endif +#ifdef _FFTW3 //OC02022019 + void TreatShifts(fftw_complex* pData, long HowMany) + { + fftw_complex *t = pData; + char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; + char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; + + for(long iHowMany=0; iHowMany0: forward; <0: backward + double xStep, xStart; + double xStepTr, xStartTr; + long Nx; + //long long Nx; + long HowMany; + //long long HowMany; + char UseGivenStartTrValue; + double MultExtra; + + char TreatSharpEdges; + double LeftSharpEdge, RightSharpEdge; + char ApplyAutoShiftAfter; + + CGenMathFFT1DInfo() + { + HowMany = 1; UseGivenStartTrValue = 0; + TreatSharpEdges = 0; + MultExtra = 1.; + ApplyAutoShiftAfter = 1; + + pInData = 0; //OC31012019 + pOutData = 0; + pdInData = 0; + pdOutData = 0; + } +}; + +//************************************************************************* + +struct CGenMathAuxDataForSharpEdgeCorr1D { + + float *ExpArrSt, 
*ExpArrFi; + double *dExpArrSt, *dExpArrFi; + + double dSt, dFi, d; + long iSt, iFi; + + char WasSetUp; + + CGenMathAuxDataForSharpEdgeCorr1D() + { + Initialize(); + } + + void Initialize() + { + ExpArrSt = ExpArrFi = 0; + dExpArrSt = dExpArrFi = 0; + + dSt = dFi = d = 0.; + iSt = iFi = 0; + WasSetUp = 0; + } + + void Dispose() + { + if(ExpArrSt != 0) delete[] ExpArrSt; + if(ExpArrFi != 0) delete[] ExpArrFi; + + if(dExpArrSt != 0) delete[] dExpArrSt; + if(dExpArrFi != 0) delete[] dExpArrFi; + + Initialize(); + } +}; + +//************************************************************************* + +class CGenMathFFT1D : public CGenMathFFT { + + long Nx; + long HalfNx; + //long long Nx; + //long long HalfNx; + char NeedsShiftBeforeX, NeedsShiftAfterX; + float *m_ArrayShiftX; + double *m_dArrayShiftX; //OC02022019 +#ifdef _OFFLOAD_GPU + static long PlanLen, HowMany; + static long dPlanLen, dHowMany; + static cufftHandle Plan1DFFT_cu; + static cufftHandle dPlan1DFFT_cu; +#endif + +public: + CGenMathFFT1D() + { + NeedsShiftBeforeX = NeedsShiftAfterX = 0; +#ifdef _OFFLOAD_GPU + PlanLen = dPlanLen = 0; + Plan1DFFT_cu = dPlan1DFFT_cu = 0; + HowMany = dHowMany = 0; +#endif + } + + int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 + int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 + +//#ifndef _OFFLOAD_GPU //OC05092023 +// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo); +// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo); +//#else +// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); +// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); +// //int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); //HG +// //int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); +//#endif + + void SetupLimitsTr(CGenMathFFT1DInfo& FFT1DInfo) + { // Modify this if Make1DFFT is modified ! 
+ Nx = FFT1DInfo.Nx; + HalfNx = (Nx >> 1); + + double xStartTr = -0.5/FFT1DInfo.xStep; + FFT1DInfo.xStepTr = -xStartTr/HalfNx; + + if(!FFT1DInfo.UseGivenStartTrValue) + { + FFT1DInfo.xStartTr = xStartTr; + } + } + + template void FillArrayShift(double t0, double tStep, T* arShiftX) //OC02022019 + //void FillArrayShift(double t0, double tStep) + { + //float *tArrayShift = m_ArrayShiftX; + T *tArrayShift = arShiftX; //OC02022019 + long N = Nx; + + //float *tp = tArrayShift + N; + T *tp = tArrayShift + N; //OC02022019 + *tp = 1.; *(tp+1) = 0.; tp += 2; + //float *tm = tp - 4; + T *tm = tp - 4; + + double t0TwoPI = t0*TwoPI; + double q = tStep; + long HalfN = N >> 1; + + for(int i=0; ire*MultX_Re - tMany->im*MultX_Im; + float NewIm = tMany->re*MultX_Im + tMany->im*MultX_Re; + tMany->re = NewRe; tMany->im = NewIm; + tMany += Nx; + } + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + template void RepairSignAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 + //void RepairSignAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) + {// Assumes Nx even ! - to be improved + //OC27102018 + //SY: optimized, adopt for OpenMP +#ifdef _WITH_OMP + #pragma omp parallel for +#endif + for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; + // tMany += Nx; + // } + // } + // t++; s = -s; + //} + //OC27102018 + //SY: optimized, adopt for OpenMP +#ifdef _WITH_OMP + #pragma omp parallel for +#endif + for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; + tMany += Nx; + } + } + } +#endif + +#ifdef _FFTW3 //OC29012019 + template void RotateDataAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 + //void RotateDataAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) + {// Assumes Nx even ! 
+#ifndef _WITH_OMP //OC27102018 + //fftwf_complex *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx; + //fftwf_complex Buf; + T *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx, Buf; + for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; + tMany += Nx; + } + } +#else //OC27102018 + //SY: adopted for OpenMP + #pragma omp parallel for + for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; + tMany += Nx; + } + } +#endif + } +#endif + + int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&, char dataType='f'); //OC02022019 + //int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); + void MakeSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); + + template void SetupSharpEdgeExpCorrArray(T* pCmpData, long AmOfPt, double x, double qStart, double qStep) //OC02022019 + //void SetupSharpEdgeExpCorrArray(float* pCmpData, long AmOfPt, double x, double qStart, double qStep) + { + const double TwoPi = 6.28318530717959; + double TwoPiX = TwoPi*x; + double q = qStart; + //float *tCmpData = pCmpData; + T *tCmpData = pCmpData; + for(long i=0; i +#include +#include + +#define GMFFT_BLOCK_SIZE 256 + +template __global__ void RepairSignAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 4 + 2; //Nx range + + if (ix < Nx2) + { + for (long k = 0; k < HowMany; k++) + { + pAfterFFT[ix + k * Nx2] = -pAfterFFT[ix + k * Nx2]; + pAfterFFT[ix + k * Nx2 + 1] = -pAfterFFT[ix + k * Nx2 + 1]; + } + } +} + +template __global__ void RotateDataAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2, long Nx) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //HalfNx range + + if (ix < Nx) + { + for (long k = 0; k < HowMany; k++) + { + T t1_0 = pAfterFFT[ix + Nx2 * k]; + T t1_1 = pAfterFFT[ix + Nx2 * k + 1]; + + pAfterFFT[ix + Nx2 * k] = pAfterFFT[ix + Nx + Nx2 * k]; + pAfterFFT[ix + Nx2 * k + 1] = pAfterFFT[ix + Nx + Nx2 * k + 
1]; + pAfterFFT[ix + Nx + Nx2 * k] = t1_0; + pAfterFFT[ix + Nx + Nx2 * k + 1] = t1_1; + } + } +} + +template __global__ void RepairAndRotateAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx, float Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + + long HalfNx = Nx / 2; + long Nx2 = Nx * 2; + if (ix < HalfNx) + { + float sx0 = 1 - 2 * (ix % 2); + float sx1 = 1 - 2 * ((HalfNx + ix) % 2); + + float s1 = sx0 * Mult; + float s2 = sx1 * Mult; + + int idx = ix * 2; + for (long i = 0; i < HowMany; i++){ + T* t1 = pAfterFFT + i * Nx2, *t2 = pAfterFFT + (HalfNx) * 2 + i * Nx2; + + T buf_r = t1[idx] * s1; + T buf_im = t1[idx + 1] * s1; + + t1[idx] = t2[idx] * s2; + t1[idx + 1] = t2[idx + 1] * s2; + + t2[idx] = buf_r; + t2[idx + 1] = buf_im; + } + } +} + +template __global__ void NormalizeDataAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2, T Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2) + { + for (long i = 0; i < HowMany; i++) { + pAfterFFT[ix + i * Nx2] *= Mult; + pAfterFFT[ix + i * Nx2 + 1] *= Mult; + } + } +} + +template __global__ void FillArrayShift_Kernel(double t0, double tStep, long N, T* arShiftX) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + + double t0TwoPi = t0 * 2 * CUDART_PI; + double q = tStep * ix; + + if (ix < N) + { + if (ix == 0) { + arShiftX[N] = 1.0; + arShiftX[N + 1] = 0.0; + } + + ix *= 2; + if (ix < N - 2) + { + sincos(q * t0TwoPi, &arShiftX[N + 2 + 1 + ix], &arShiftX[N + 2 + ix]); + arShiftX[N - 2 - ix] = arShiftX[N + 2 + ix]; + arShiftX[N - 1 - ix] = -arShiftX[N + 2 + 1 + ix]; + } + + if (ix == N - 2) + { + sincos(-q * t0TwoPi, &arShiftX[1], &arShiftX[0]); + } + } +} + +template __global__ void TreatShift_Kernel(T* pData, long HowMany, long Nx2, T* tShiftX) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2) + { + T MultX_Re = tShiftX[ix]; + T MultX_Im = tShiftX[ix + 1]; + + for (long k = 0; k < 
HowMany; k++) + { + T buf_r = pData[ix + k * Nx2]; + T buf_im = pData[ix + k * Nx2 + 1]; + + T NewRe = buf_r * MultX_Re - buf_im * MultX_Im; + T NewIm = buf_r * MultX_Im + buf_im * MultX_Re; + pData[ix + k * Nx2] = NewRe; + pData[ix + k * Nx2 + 1] = NewIm; + } + } +} + +void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Nx); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult) +{ + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif + + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, (float)Mult); //OC06092023 + 
//NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + FillArrayShift_Kernel << > > (t0, tStep, Nx, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + TreatShift_Kernel << > > (pData, HowMany, Nx * 2, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Nx); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) 
+ (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, (float)Mult); //OC06092023 (check why it's not ..T Mult..) + //RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + FillArrayShift_Kernel << > > (t0, tStep, Nx, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + TreatShift_Kernel << > > (pData, HowMany, Nx * 2, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + + +template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, long Nx, long Ny, long Nx2Ny2, long howMany) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //Nx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range + + float sx0 = 1 - 2 * (ix % 2); + float sy0 = 1 - 2 * (iy % 2); + float s 
= sx0 * sy0; + + if (ix < Nx && iy < Ny) + { + for (long i=0; i __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range + + if (ix < HalfNx && iy < HalfNy) + { + int idx = (ix + iy * Nx) * 2; + for (long i=0; i __global__ void RepairSignAndRotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany, T2 Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range + + if (ix < HalfNx) + { + float sx0 = 1.f - 2.f * (ix % 2); + float sy0 = 1.f - 2.f * (iy % 2); + float sx1 = 1.f - 2.f * ((HalfNx + ix) % 2); + float sy1 = 1.f - 2.f * ((HalfNy + iy) % 2); + + float s1 = sx0 * sy0 * Mult; + float s2 = sx1 * sy1 * Mult; + float s3 = sx1 * sy0 * Mult; + float s4 = sx0 * sy1 * Mult; + + int idx = (ix + iy * Nx); + for (long i=0; i __global__ void NormalizeDataAfter2DFFT_Kernel(T* pAfterFFT, long Nx2Ny2, long howMany, long n, T Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2Ny2) + { + for (long i=0; i __global__ void TreatShift2D_Kernel(T* pData, long HowMany, long Nx2, long Ny, T* tShiftX, T* tShiftY) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range + + if (ix < Nx2) + { + T MultRe = 1; + T MultIm = 0; + + T MultX_Re = 1; + T MultX_Im = 0; + + T MultY_Re = 1; + T MultY_Im = 0; + + if (NeedsShiftY) + { + MultY_Re = tShiftY[iy * 2]; + MultY_Im = tShiftY[iy * 2 + 1]; + } + if (NeedsShiftX) + { + MultX_Re = tShiftX[ix]; + MultX_Im = tShiftX[ix + 1]; + + if (NeedsShiftY) + { + MultRe = MultX_Re * MultY_Re - MultX_Im * MultY_Im; + MultIm = MultX_Re * MultY_Im + MultX_Im * MultY_Re; + } + else + { + MultRe = MultX_Re; 
+ MultIm = MultX_Im; + } + } + else + { + MultRe = MultY_Re; + MultIm = MultY_Im; + } + + for (long k=0; k << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); +} + +void RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); +} + +void RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, float Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((float2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); +} + +void NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +{ + + dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, (float)Mult); //OC06092023 + //NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); +} + +void TreatShifts2D_GPU(float* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY) +{ + + dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); +} + +void RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +{ + + dim3 blocks(Nx 
/ GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); +} + +void RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); +} + +void RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((double2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); +} + +void NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +{ + + dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); +} + +void TreatShifts2D_GPU(double* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY) +{ + + dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); +} + +//OC06092023: looks like place if wrong here for this function, why all these functions are programmed without classes? 
+template __global__ void StokesAvgUpdateInterp_Kernel(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, T mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, T yStartMeshRes, T yStepMeshRes, T yStartWfr, T yStepWfr, T xStartMeshRes, T xStepMeshRes, T xStartWfr, T xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //xNpMeshRes range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //yNpMeshRes range + int ie = (blockIdx.z * blockDim.z + threadIdx.z); //eNpMeshRes range + + if (ix >= xNpMeshRes) + return; + if (iy >= yNpMeshRes) + return; + if (ie >= eNpMeshRes) + return; + + long ir = iSt * yNpMeshRes * xNpMeshRes * eNpMeshRes + iy * xNpMeshRes * eNpMeshRes + ix * eNpMeshRes + ie; + + auto yMeshRes = yStartMeshRes + iy * yStepMeshRes; + auto xMeshRes = xStartMeshRes + ix * xStepMeshRes; + T fInterp = 0; + int loc_ix_ofst = iOfstSt + ie; + auto nx_ix_per = xNpWfr * eNpWfr; + + switch (nOrder) + { + case 1: + { + int ix0 = (int)trunc((xMeshRes - xStartWfr) / xStepWfr + 1e-09); + if ((ix0 < 0) | (ix0 >= xNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); + return; + } + int ix1 = ix0 + 1; + auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; + int iy0 = (int)trunc((yMeshRes - yStartWfr) / yStepWfr + 1e-09); + if ((iy0 < 0) | (iy0 >= yNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); + return; + } + + + int iy1 = iy0 + 1; + auto ty = (yMeshRes - (yStartWfr + yStepWfr * iy0)) / yStepWfr; + auto iy0_nx_ix_per = iy0 * nx_ix_per; + auto iy1_nx_ix_per = iy1 * nx_ix_per; + auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; + auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; + auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto 
f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto a10 = f10 - a00; + auto a01 = f01 - a00; + auto a11 = a00 - f01 - f10 + f11; + fInterp = a00 + tx * (a10 + ty * a11) + ty * a01; + } + break; + case 2: + { + int ix0 = int(round((xMeshRes - xStartWfr) / xStepWfr)); + if ((ix0 < 0) || (ix0 >= xNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (float)(nIters + 1); + ir += 1; + return; + } + int ixm1 = ix0 - 1; + int ix1 = ix0 + 1; + auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; + int iy0 = int(round((yMeshRes - yStartWfr) / yStepWfr)); + if ((iy0 < 0) || (iy0 >= yNpWfr - 1)) + { + pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); + ir += 1; + return; + } + int iym1 = iy0 - 1; + int iy1 = iy0 + 1; + auto ty = (yMeshRes - (yStartWfr + yStepWfr * iy0)) / yStepWfr; + auto iym1_nx_ix_per = iym1 * nx_ix_per; + auto iy0_nx_ix_per = iy0 * nx_ix_per; + auto iy1_nx_ix_per = iy1 * nx_ix_per; + auto ixm1_ix_per_p_ix_ofst = ixm1 * eNpWfr + loc_ix_ofst; + auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; + auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; + auto fm10 = pMoreStokesArS[iy0_nx_ix_per + ixm1_ix_per_p_ix_ofst]; + auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto f0m1 = pMoreStokesArS[iym1_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; + auto f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; + auto a10 = 0.5 * (f10 - fm10); + auto a01 = 0.5 * (f01 - f0m1); + auto a11 = a00 - f01 - f10 + f11; + auto a20 = 0.5 * (f10 + fm10) - a00; + auto a02 = 0.5 * (f01 + f0m1) - a00; + fInterp = a00 + tx * (a10 + tx * a20 + ty * a11) + ty * (a01 + ty * a02); + } + break; + } + + if (sum) pStokesArS[ir] += mult * fInterp; + else pStokesArS[ir] = (pStokesArS[ir] * nIters + mult * fInterp) / (nIters + 1); + return; +} + +//OC06092023: looks like place if wrong here for this 
function, why all these functions are programmed without classes? +void StokesAvgUpdateInterp(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, double mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, double yStartMeshRes, double yStepMeshRes, double yStartWfr, double yStepWfr, double xStartMeshRes, double xStepMeshRes, double xStartWfr, double xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) +{ + const int bs = 8; + dim3 threads(xNpMeshRes / bs + ((xNpMeshRes & (bs - 1)) != 0), yNpMeshRes / bs + ((yNpMeshRes & (bs - 1)) != 0), eNpMeshRes); + dim3 blocks(bs, bs, 1); + //OC06092023 (check order of variables, loop over e) + StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, (float)mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, (float)yStartMeshRes, (float)yStepMeshRes, (float)yStartWfr, (float)yStepWfr, (float)xStartMeshRes, (float)xStepMeshRes, (float)xStartWfr, (float)xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); + //StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, yStartMeshRes, yStepMeshRes, yStartWfr, yStepWfr, xStartMeshRes, xStepMeshRes, xStartWfr, xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); +} +#endif \ No newline at end of file diff --git a/cpp/src/core/gmfft_gpu.h b/cpp/src/core/gmfft_gpu.h new file mode 100644 index 00000000..8eef9e2c --- /dev/null +++ b/cpp/src/core/gmfft_gpu.h @@ -0,0 +1,43 @@ +/************************************************************************//** + * File: gmfft_gpu.h + * Description: Auxiliary utilities to work with FFTW library (CUDA header) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifndef 
__GMFFTGPU0_H +#define __GMFFTGPU0_H + +void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx); +void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx); +void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult=1.f); +void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult); +void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX); +void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX); + +void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx); +void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx); +void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult=1.); +void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult); +void FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX); +void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX); + +void RepairSignAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany); +void RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany); +void RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, float Mult=1.f); //to check +void NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, double Mult); +void TreatShifts2D_GPU(float* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY); + +void RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany); +void RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany); +void RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult=1.); +void NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult); +void TreatShifts2D_GPU(double* pData, long Nx, long Ny, long 
howMany, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY); + +#endif // __GMFFTGPU0_H \ No newline at end of file diff --git a/cpp/src/core/srradmnp.cpp b/cpp/src/core/srradmnp.cpp index 711bc2b4..7522f78b 100644 --- a/cpp/src/core/srradmnp.cpp +++ b/cpp/src/core/srradmnp.cpp @@ -677,6 +677,7 @@ int srTRadGenManip::ExtractSingleElecIntensity1DvsZ(srTRadExtract& RadExtract) //************************************************************************* int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? { int PolCom = RadExtract.PolarizCompon; int Int_or_ReE = RadExtract.Int_or_Phase; @@ -690,6 +691,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) float *pI = 0, *pI1 = 0, *pI2 = 0, *pI3 = 0; //OC17042020 double *pId = 0, *pI1d = 0, *pI2d = 0, *pI3d = 0; long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz; + //long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; //Himanshu? 
//float *pI = 0; //DOUBLE *pId = 0; //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) @@ -720,6 +722,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long long PerZ = PerX*RadAccessData.nx; long long PerX = ((long long)ne) << 1; //OC18042020 long long PerZ = PerX*nx; + long long PerWfr = PerZ*nz; //long ie0=0, ie1=0; long long ie0=0, ie1=0; //OC26042019 @@ -754,174 +757,187 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 //long izPerZ = 0; - long long izPerZ = 0; long ix, ie; - for(long long iz=0; iz 0) //OC08052021 { - *(tInt++) = IntensityComponent(pEx_StAux, pEz_StAux, -3, Int_or_ReE); - pEx_StAux += 2; pEz_StAux += 2; + if(pI != 0) + { + float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); + *(pI++) = newI; + } + if(pId != 0) + { + double newI = ((*pId)*iter + resInt)*inv_iter_p_1; + *(pId++) = newI; + } + if(allStokesReq) + { + if(RadExtract.pExtractedData != 0) + { + float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); + float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); + float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); + *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; + } + else + { + double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; + double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; + double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; + *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; + } + } } - resInt2 = ConstPhotEnInteg*CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); - - tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; - for(ie=0; ie 0) //OC08052021 - { - if(pI != 0) - { - float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); - *(pI++) = newI; - } - if(pId != 0) - { - double newI = ((*pId)*iter + resInt)*inv_iter_p_1; - *(pId++) = newI; - } - 
if(allStokesReq) - { - if(RadExtract.pExtractedData != 0) - { - float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); - float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); - float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); - *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; - } - else - { - double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; - double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; - double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; - *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; - } + //ixPerX += PerX; + pEx_St += PerX; + pEz_St += PerX; + pEx_Fi += PerX; + pEz_Fi += PerX; } + izPerZ += PerZ; } - else //OC08052021 - { - if(pI != 0) *(pI++) += (float)resInt; - if(pId != 0) *(pId++) += resInt; - if(allStokesReq) - { - if(RadExtract.pExtractedData != 0) - { - *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; - } - else - { - *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; - } - } - } - - //ixPerX += PerX; - pEx_St += PerX; - pEz_St += PerX; - pEx_Fi += PerX; - pEz_Fi += PerX; - } - izPerZ += PerZ; - } + //iwfrPerWfr += PerWfr; + //} + //} if(arAuxInt != 0) delete[] arAuxInt; //OC150813 return 0; } @@ -1571,6 +1587,7 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsZ(srTRadExtract& RadExtrac //************************************************************************* int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? 
{//OC13122019 //This assumes "normal" data alignment in the complex "matrix" E(x,y)*E*(x',y') int res = 0; @@ -2107,156 +2124,138 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtra if(DontNeedInterp) { - for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) - //for(long long it=0; it<=(itEnd-itStart); it++) //OC03032021 (to enable partial update of MI/CSD) - //for(long long it=0; it 0) - { - //double iter_p_1 = iter + 1; //OC20012020 - //long long iter_p_1 = iter + 1; - pMI[0] = (float)((pMI[0]*iter + ReMI)*inv_iter_p_1); //OC08052021 - pMI[1] = (float)((pMI[1]*iter + ImMI)*inv_iter_p_1); - //pMI[0] = (float)((pMI[0]*iter + ReMI)/iter_p_1); - //pMI[1] = (float)((pMI[1]*iter + ImMI)/iter_p_1); - } - else + float *pMI = pMI0 + (it - itStart)*PerArg; //OC16042021 + //float *pMI = pMI0 + it*PerArg; + for(long long i=0; i<=it; i++) { - pMI[0] += (float)ReMI; - pMI[1] += (float)ImMI; - } + //if(res = MutualIntensityComponent(pEx, pExT, pEz, pEzT, PolCom, iter, pMI)) return res; - pEx += PerX; pEz += PerX; - pMI += 2; - } + double ExRe = 0., ExIm = 0., EzRe = 0., EzIm = 0.; + double ExReT = 0., ExImT = 0., EzReT = 0., EzImT = 0.; + if(EhOK) { ExRe = *pEx; ExIm = *(pEx + 1); ExReT = *pExT; ExImT = *(pExT + 1); } + if(EvOK) { EzRe = *pEz; EzIm = *(pEz + 1); EzReT = *pEzT; EzImT = *(pEzT + 1); } + double ReMI = 0., ImMI = 0.; - pEx = pExInit0; - pEz = pEzInit0; - pExT += PerX; pEzT += PerX; - } - if(iter == 0) //OC16102021 - {//Setting to 0 symmetrical part of MI data (to avoid having garbage there) + switch(PolCom) + { + case 0: // Lin. Hor. + { + ReMI = ExRe*ExReT + ExIm*ExImT; + ImMI = ExIm*ExReT - ExRe*ExImT; + break; + } + case 1: // Lin. Vert. + { + ReMI = EzRe*EzReT + EzIm*EzImT; + ImMI = EzIm*EzReT - EzRe*EzImT; + break; + } + case 2: // Linear 45 deg. 
+ { + double ExRe_p_EzRe = ExRe + EzRe, ExIm_p_EzIm = ExIm + EzIm; + double ExRe_p_EzReT = ExReT + EzReT, ExIm_p_EzImT = ExImT + EzImT; + ReMI = 0.5*(ExRe_p_EzRe*ExRe_p_EzReT + ExIm_p_EzIm*ExIm_p_EzImT); + ImMI = 0.5*(ExIm_p_EzIm*ExRe_p_EzReT - ExRe_p_EzRe*ExIm_p_EzImT); + break; + } + case 3: // Linear 135 deg. + { + double ExRe_mi_EzRe = ExRe - EzRe, ExIm_mi_EzIm = ExIm - EzIm; + double ExRe_mi_EzReT = ExReT - EzReT, ExIm_mi_EzImT = ExImT - EzImT; + ReMI = 0.5*(ExRe_mi_EzRe*ExRe_mi_EzReT + ExIm_mi_EzIm*ExIm_mi_EzImT); + ImMI = 0.5*(ExIm_mi_EzIm*ExRe_mi_EzReT - ExRe_mi_EzRe*ExIm_mi_EzImT); + break; + } + case 5: // Circ. Left //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 4: // Circ. Right + { + double ExRe_mi_EzIm = ExRe - EzIm, ExIm_p_EzRe = ExIm + EzRe; + double ExRe_mi_EzImT = ExReT - EzImT, ExIm_p_EzReT = ExImT + EzReT; + ReMI = 0.5*(ExRe_mi_EzIm*ExRe_mi_EzImT + ExIm_p_EzRe*ExIm_p_EzReT); + ImMI = 0.5*(ExIm_p_EzRe*ExRe_mi_EzImT - ExRe_mi_EzIm*ExIm_p_EzReT); + break; + } + case 4: // Circ. Right //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 5: // Circ. 
Left + { + double ExRe_p_EzIm = ExRe + EzIm, ExIm_mi_EzRe = ExIm - EzRe; + double ExRe_p_EzImT = ExReT + EzImT, ExIm_mi_EzReT = ExImT - EzReT; + ReMI = 0.5*(ExRe_p_EzIm*ExRe_p_EzImT + ExIm_mi_EzRe*ExIm_mi_EzReT); + ImMI = 0.5*(ExIm_mi_EzRe*ExRe_p_EzImT - ExRe_p_EzIm*ExIm_mi_EzReT); + break; + } + case -1: // s0 + { + ReMI = ExRe*ExReT + ExIm*ExImT + EzRe*EzReT + EzIm*EzImT; + ImMI = ExIm*ExReT - ExRe*ExImT + EzIm*EzReT - EzRe*EzImT; + break; + } + case -2: // s1 + { + ReMI = ExRe*ExReT + ExIm*ExImT - (EzRe*EzReT + EzIm*EzImT); + ImMI = ExIm*ExReT - ExRe*ExImT - (EzIm*EzReT - EzRe*EzImT); + break; + } + case -3: // s2 + { + ReMI = ExImT*EzIm + ExIm*EzImT + ExReT*EzRe + ExRe*EzReT; + ImMI = ExReT*EzIm - ExRe*EzImT - ExImT*EzRe + ExIm*EzReT; + break; + } + case -4: // s3 + { + ReMI = ExReT*EzIm + ExRe*EzImT - ExImT*EzRe - ExIm*EzReT; + ImMI = ExIm*EzImT - ExImT*EzIm - ExReT*EzRe + ExRe*EzReT; + break; + } + default: // total mutual intensity, same as s0 + { + ReMI = ExRe*ExReT + ExIm*ExImT + EzRe*EzReT + EzIm*EzImT; + ImMI = ExIm*ExReT - ExRe*ExImT + EzIm*EzReT - EzRe*EzImT; + break; + //return CAN_NOT_EXTRACT_MUT_INT; + } + } + if(iter == 0) + { + pMI[0] = (float)ReMI; + pMI[1] = (float)ImMI; + } + else if(iter > 0) + { + //double iter_p_1 = iter + 1; //OC20012020 + //long long iter_p_1 = iter + 1; + pMI[0] = (float)((pMI[0]*iter + ReMI)*inv_iter_p_1); //OC08052021 + pMI[1] = (float)((pMI[1]*iter + ImMI)*inv_iter_p_1); + //pMI[0] = (float)((pMI[0]*iter + ReMI)/iter_p_1); + //pMI[1] = (float)((pMI[1]*iter + ImMI)/iter_p_1); + } + else + { + pMI[0] += (float)ReMI; + pMI[1] += (float)ImMI; + } - for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) - { - float *pMI = pMI0 + (it - itStart)*(PerArg + 2) + 2; //OC29042022 (?) 
- //float *pMI = pMI0 + (it - itStart)*PerArg; - for(long long i=it+1; i<=itEnd; i++) - //for(long long i=0; i<=it; i++) - { - *(pMI++) = 0.; - *(pMI++) = 0.; + pEx += PerX; pEz += PerX; + pMI += 2; } + + pEx = pExInit0; + pEz = pEzInit0; + pExT += PerX; pEzT += PerX; } - } + //} } else { @@ -3536,8 +3535,7 @@ void srTRadGenManip::MutualIntSumPart(srTWaveAccessData* pwI1, srTWaveAccessData long long itStart = pwI2->itStart; if(itStart < 0) itStart = 0; long long itFin = pwI2->itFin; - if(itFin < 0) itFin = nxnz - 1; //OC04102021 - //if(itFin < 0) itFin = nxnz; + if(itFin < 0) itFin = nxnz; double aux; //OC27042021 @@ -3724,7 +3722,6 @@ void srTRadGenManip::MutualIntFillHalfHermit(srTWaveAccessData* pwI) *pMIt = -imMI; //Hermitian matrix property } } - //int aha = 1; } else if(pDataD != 0) { @@ -3793,22 +3790,18 @@ void srTRadGenManip::MutualIntTreatComQuadPhTerm(srTWaveAccessData* pwI, double* for(long long izt=0; izt +#include +#include +#include +#include +#include "srradmnp.h" +#include "gmmeth.h" + +template +__global__ void ExtractSingleElecIntensity2DvsXZ_Kernel(srTRadExtract RadExtract, srTSRWRadStructAccessData RadAccessData, srTRadGenManip *obj, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, int Int_or_ReE) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + int iwfr = (blockIdx.z * blockDim.z + threadIdx.z); //nwfr range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz && iwfr < RadAccessData.nwfr) + { + //int PolCom = RadExtract.PolarizCompon; + + //bool allStokesReq = (PolCom == -5); //OC18042020 + + float* pI = 0, * pI1 = 0, * pI2 = 0, * pI3 = 0; //OC17042020 + double* pId = 0, * pI1d = 0, * pI2d = 0, * pI3d = 0; + long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; + //float *pI = 0; + //DOUBLE *pId = 0; + //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + long long nxnz = 
((long long)nx) * ((long long)nz); + if (Int_or_ReE != 2) + { + pI = RadExtract.pExtractedData; + if (allStokesReq) //OC17042020 + { + pI1 = pI + nxnz; pI2 = pI1 + nxnz; pI3 = pI2 + nxnz; + } + } + else + { + pId = RadExtract.pExtractedDataD; + if (allStokesReq) //OC17042020 + { + pI1d = pId + nxnz; pI2d = pI1d + nxnz; pI3d = pI2d + nxnz; + } + } + + float* pEx0 = RadAccessData.pBaseRadX; + float* pEz0 = RadAccessData.pBaseRadZ; + + //long PerX = RadAccessData.ne << 1; + //long PerZ = PerX*RadAccessData.nx; + //long long PerX = RadAccessData.ne << 1; + //long long PerZ = PerX*RadAccessData.nx; + long long PerX = ((long long)ne) << 1; //OC18042020 + long long PerZ = PerX * nx; + long long PerWfr = PerZ * nz; + + //bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (ne > 1); //OC18042020 + double resInt, resInt1, resInt2, resInt3; + double ConstPhotEnInteg = 1.; + long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 + long ie; + + long offset = iwfr * PerWfr + iz * PerZ + ix * PerX; + long offsetDiv2 = offset >> 1; + + float* pEx_StartForX = pEx0 + offset; + float* pEz_StartForX = pEz0 + offset; + if (pI != 0) + { + pI += offsetDiv2; + if (allStokesReq) + { + pI1 += offsetDiv2; + pI2 += offsetDiv2; + pI3 += offsetDiv2; + } + } + + if (pId != 0) + { + pId += offsetDiv2; + if (allStokesReq) + { + pI1d += offsetDiv2; + pI2d += offsetDiv2; + pI3d += offsetDiv2; + } + } + + //long ixPerX = 0; + + float* pEx_St = pEx_StartForX + Two_ie0; + float* pEz_St = pEz_StartForX + Two_ie0; + float* pEx_Fi = pEx_StartForX + Two_ie1; + float* pEz_Fi = pEz_StartForX + Two_ie1; + + if (intOverEnIsRequired) //OC140813 + {//integrate over photon energy / time + double* tInt = arAuxInt; + float* pEx_StAux = pEx_St; + float* pEz_StAux = pEz_St; + + if (!allStokesReq) //OC17042020 + { + for (ie = 0; ie < ne; ie++) //OC18042020 + //for(int ie=0; ieIntensityComponent(pEx_StAux, pEz_StAux, PolCom, Int_or_ReE); + pEx_StAux += 2; + pEz_StAux += 2; + } + resInt = 
ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); //OC18042020 + //resInt = ConstPhotEnInteg*CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, RadAccessData.ne, RadAccessData.eStep); + } + else + { + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -1, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -2, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt1 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -3, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt2 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -4, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt3 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + } + } + else + { + if (!allStokesReq) //OC18042020 + { + resInt = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, PolCom, Int_or_ReE); + } + else //OC18042020 + { + resInt = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -1, Int_or_ReE); + resInt1 = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -2, Int_or_ReE); + resInt2 = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -3, Int_or_ReE); + resInt3 = 
obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -4, Int_or_ReE); + } + } + //OC140813 + if (pI != 0) *pI = (float)resInt; + if (pId != 0) *pId = resInt; //OC18042020 + //if(pId != 0) *(pId++) = (double)resInt; + if (allStokesReq) //OC18042020 + { + if (RadExtract.pExtractedData != 0) + { + *pI1 = (float)resInt1; *pI2 = (float)resInt2; *pI3 = (float)resInt3; + } + else + { + *pI1d = resInt1; *pI2d = resInt2; *pI3d = resInt3; + } + } + } +} + +template +static inline void ExtractSingleElecIntensity2DvsXZ_GPUSub(dim3 &blocks, dim3 &threads, srTRadExtract RadExtract, srTSRWRadStructAccessData RadAccessData, srTRadGenManip *local_copy, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, int Int_or_ReE) +{ + switch(RadExtract.PolarizCompon) + { + case 5: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 4: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 3: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 2: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 1: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 0: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -1: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -2: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 
-3: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -4: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + default: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + } +} + +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtract, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, gpuUsageArg *pGpuUsage) +{ + srTSRWRadStructAccessData& RadAccessData = *((srTSRWRadStructAccessData*)(hRadAccessData.ptr())); + + const int bs = 256; + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz, RadAccessData.nwfr); + dim3 threads(bs, 1); + + if (RadAccessData.pBaseRadX != NULL) + { + RadAccessData.pBaseRadX = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadX); + } + if (RadAccessData.pBaseRadZ != NULL) + { + RadAccessData.pBaseRadZ = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadZ); + } + + srTRadGenManip *local_copy = (srTRadGenManip*)AuxGpu::ToDevice(pGpuUsage, this, sizeof(srTRadGenManip)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, local_copy); + + arAuxInt = (double*)AuxGpu::ToDevice(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, arAuxInt); + + bool allStokesReq = (RadExtract.PolarizCompon == -5); + bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (RadAccessData.ne > 1); + + int Int_or_ReE = RadExtract.Int_or_Phase; + if 
(Int_or_ReE == 7) Int_or_ReE = 0; //OC150813: time/phot. energy integrated single-e intensity requires "normal" intensity here + + if (allStokesReq) + if (intOverEnIsRequired) + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + if (intOverEnIsRequired) + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + + AuxGpu::ToHostAndFree(pGpuUsage, local_copy, sizeof(srTRadGenManip), true); + AuxGpu::ToHostAndFree(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double), true); + AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadX, true, false); + AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadZ, true, false); + +#ifndef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadX); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadX, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadZ, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); 
+#endif + return 0; +} + +template +__global__ void ExtractSingleElecMutualIntensityVsXZ_Kernel(const float* __restrict__ pEx0, const float* __restrict__ pEz0, float* __restrict__ pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter0) +{ + //Calculate coordinates as the typical triangular matrix + int i0 = (blockIdx.x * blockDim.x + threadIdx.x); //<=nxnz range + int it0_0 = (blockIdx.y * blockDim.y + threadIdx.y); //nxnz/(2*itPerBlk) range + long iter = iter0; + + if (i0 > nxnz) return; + if (it0_0 > nxnz / 2) return; + + for (int it0 = it0_0 * itPerBlk; it0 < it0_0 * itPerBlk + itPerBlk; it0++) + { + long it = it0; + long i = i0; + if (i0 > it0) //If the coordinates are past the triangular bounds, switch to the lower half of the triangle + { + it = nxnz - it0 - 1; + i = i0 - (it0 + 1); + } + + if (it >= itEnd) { + return; + } + + //float* pMI = pMI0 + it0 * (nxnz << 1) + (i0 << 1); //Compact representation coordinates + float* pMI = pMI0 + (it - itStart) * (nxnz << 1) + (i << 1); //Full representation coordinates + const float* pEx = pEx0 + i * PerX; + const float* pEz = pEz0 + i * PerX; + const float* pExT = pEx0 + (it - itStart) * PerX; + const float* pEzT = pEz0 + (it - itStart) * PerX; + + float ExRe = 0., ExIm = 0., EzRe = 0., EzIm = 0.; + float ExReT = 0., ExImT = 0., EzReT = 0., EzImT = 0.; + + { + if (EhOK) + { + ExRe = *pEx; ExIm = *(pEx + 1); + if (i != (it - itStart)) { + ExReT = *pExT; ExImT = *(pExT + 1); + } + else { + ExReT = ExRe; + ExImT = ExIm; + } + } + if (EvOK) { + EzRe = *pEz; EzIm = *(pEz + 1); + if (i != (it - itStart)) { + EzReT = *pEzT; EzImT = *(pEzT + 1); + } + else { + EzReT = EzRe; + EzImT = EzIm; + } + } + } + float ReMI = 0., ImMI = 0.; + + switch (PolCom) + { + case 0: // Lin. Hor. + { + ReMI = ExRe * ExReT + ExIm * ExImT; + ImMI = ExIm * ExReT - ExRe * ExImT; + break; + } + case 1: // Lin. Vert. + { + ReMI = EzRe * EzReT + EzIm * EzImT; + ImMI = EzIm * EzReT - EzRe * EzImT; + break; + } + case 2: // Linear 45 deg. 
+ { + float ExRe_p_EzRe = ExRe + EzRe, ExIm_p_EzIm = ExIm + EzIm; + float ExRe_p_EzReT = ExReT + EzReT, ExIm_p_EzImT = ExImT + EzImT; + ReMI = 0.5f * (ExRe_p_EzRe * ExRe_p_EzReT + ExIm_p_EzIm * ExIm_p_EzImT); + ImMI = 0.5f * (ExIm_p_EzIm * ExRe_p_EzReT - ExRe_p_EzRe * ExIm_p_EzImT); + break; + } + case 3: // Linear 135 deg. + { + float ExRe_mi_EzRe = ExRe - EzRe, ExIm_mi_EzIm = ExIm - EzIm; + float ExRe_mi_EzReT = ExReT - EzReT, ExIm_mi_EzImT = ExImT - EzImT; + ReMI = 0.5f * (ExRe_mi_EzRe * ExRe_mi_EzReT + ExIm_mi_EzIm * ExIm_mi_EzImT); + ImMI = 0.5f * (ExIm_mi_EzIm * ExRe_mi_EzReT - ExRe_mi_EzRe * ExIm_mi_EzImT); + break; + } + case 5: // Circ. Left //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 4: // Circ. Right + { + float ExRe_mi_EzIm = ExRe - EzIm, ExIm_p_EzRe = ExIm + EzRe; + float ExRe_mi_EzImT = ExReT - EzImT, ExIm_p_EzReT = ExImT + EzReT; + ReMI = 0.5f * (ExRe_mi_EzIm * ExRe_mi_EzImT + ExIm_p_EzRe * ExIm_p_EzReT); + ImMI = 0.5f * (ExIm_p_EzRe * ExRe_mi_EzImT - ExRe_mi_EzIm * ExIm_p_EzReT); + break; + } + case 4: // Circ. Right //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 5: // Circ. 
Left + { + float ExRe_p_EzIm = ExRe + EzIm, ExIm_mi_EzRe = ExIm - EzRe; + float ExRe_p_EzImT = ExReT + EzImT, ExIm_mi_EzReT = ExImT - EzReT; + ReMI = 0.5f * (ExRe_p_EzIm * ExRe_p_EzImT + ExIm_mi_EzRe * ExIm_mi_EzReT); + ImMI = 0.5f * (ExIm_mi_EzRe * ExRe_p_EzImT - ExRe_p_EzIm * ExIm_mi_EzReT); + break; + } + case -1: // s0 + { + ReMI = ExRe * ExReT + ExIm * ExImT + EzRe * EzReT + EzIm * EzImT; + ImMI = ExIm * ExReT - ExRe * ExImT + EzIm * EzReT - EzRe * EzImT; + break; + } + case -2: // s1 + { + ReMI = ExRe * ExReT + ExIm * ExImT - (EzRe * EzReT + EzIm * EzImT); + ImMI = ExIm * ExReT - ExRe * ExImT - (EzIm * EzReT - EzRe * EzImT); + break; + } + case -3: // s2 + { + ReMI = ExImT * EzIm + ExIm * EzImT + ExReT * EzRe + ExRe * EzReT; + ImMI = ExReT * EzIm - ExRe * EzImT - ExImT * EzRe + ExIm * EzReT; + break; + } + case -4: // s3 + { + ReMI = ExReT * EzIm + ExRe * EzImT - ExImT * EzRe - ExIm * EzReT; + ImMI = ExIm * EzImT - ExImT * EzIm - ExReT * EzRe + ExRe * EzReT; + break; + } + default: // total mutual intensity, same as s0 + { + ReMI = ExRe * ExReT + ExIm * ExImT + EzRe * EzReT + EzIm * EzImT; + ImMI = ExIm * ExReT - ExRe * ExImT + EzIm * EzReT - EzRe * EzImT; + break; + //return CAN_NOT_EXTRACT_MUT_INT; + } + } + + if (gt1_iter > 0) + { + pMI[0] = (pMI[0] * iter + (float)ReMI) / (float)(iter + 1.); + pMI[1] = (pMI[1] * iter + (float)ImMI) / (float)(iter + 1.); + } + else if (gt1_iter == 0) + { + pMI[0] = (float)ReMI; + pMI[1] = (float)ImMI; + } + else + { + pMI[0] += (float)ReMI; + pMI[1] += (float)ImMI; + } + } +} + +template +int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +{ + const int itPerBlk = 1; + dim3 threads = dim3(48, 16, 1); + dim3 grid = dim3((nxnz + 1) / threads.x + (threads.x > 1), (nxnz / 2) / (threads.y * itPerBlk) + (threads.y > 1), 1); + + pEx = (float*)AuxGpu::ToDevice(pGpuUsage, pEx, nxnz * 2 * 
sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEx); + + pEz = (float*)AuxGpu::ToDevice(pGpuUsage, pEz, nxnz * 2 * sizeof(float)); + AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEz); + + pMI0 = (float*)AuxGpu::ToDevice(pGpuUsage, pMI0, (itEnd - itStart) * nxnz * 2 * sizeof(float)); + + if (EhOK) + { + if (EvOK) ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + } + else + { + if (EvOK) ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + } + + pEx = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEx, nxnz * 2 * sizeof(float), true); + pEz = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEz, nxnz * 2 * sizeof(float), true); + + AuxGpu::MarkUpdated(pGpuUsage, pMI0, true, false); + +#ifdef _DEBUG + if (pMI0 != NULL) + pMI0 = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pMI0, (itEnd - itStart) * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * 2 * sizeof(float)); + + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif + return 0; +} + +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ_GPU(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, int PolCom, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +{ + if (iter > 0) + { + switch (PolCom) + { + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, 
EhOK, EvOK, pGpuUsage); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + } + } + else if (iter == 0) + { + switch (PolCom) + { + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 
5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + } + } +} + +#endif \ No newline at end of file diff --git a/cpp/src/core/srradstr.h b/cpp/src/core/srradstr.h index f9b78ba8..def0ffb0 100644 --- a/cpp/src/core/srradstr.h +++ b/cpp/src/core/srradstr.h @@ -32,6 +32,10 @@ #include "srigorre.h" #endif +#ifdef _OFFLOAD_GPU //OC28072023 +#include "auxgpu.h" //HG +#endif + #include "srobject.h" //************************************************************************* @@ -72,8 +76,8 @@ class srTSRWRadStructAccessData : public CGenObject { waveHndl wRad, wRadX, wRadZ; int hStateRadX, hStateRadZ; double eStep, eStart, xStep, xStart, zStep, zStart; - long ne, nx, nz; - //long long ne, nx, nz; //OC26042019 + long ne, nx, nz; //OC03082023 (rolled back) + //long long ne, nx, nz; //HG //OC26042019 double xStartTr, zStartTr; bool UseStartTrToShiftAtChangingRepresToCoord; @@ -242,7 +246,16 @@ class srTSRWRadStructAccessData : public CGenObject { void CheckAndSubtractPhaseTermsLin(double newXc, double newZc); void CheckAndResetPhaseTermsLin(); void EstimateOversamplingFactors(double& estimOverSampX, double& estimOverSampZ); - void MirrorFieldData(int sx, int sz); + + void MirrorFieldData(int sx, int sz, void* pvGPU=0); //OC28072023 + //void 
MirrorFieldData(int sx, int sz); + +#ifdef _OFFLOAD_GPU + void MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg* pGPU); //OC03082023 + //void MirrorFieldData_GPU(int sx, int sz, void* pGpuUsage); //HG28072023 + void MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, TGPUUsageArg* pGPU); //OC03082023 + //void MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, void* pGpuUsage); //HG28072023 +#endif int SetupWfrEdgeCorrData(float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr); void MakeWfrEdgeCorrection(float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs); @@ -491,12 +504,32 @@ class srTSRWRadStructAccessData : public CGenObject { } } - void MultiplyElFieldByPhaseLin(double xMult, double zMult) + void MultiplyElFieldByPhaseLin(double xMult, double zMult, void* pvGPU=0) //OC28072023 + //void MultiplyElFieldByPhaseLin(double xMult, double zMult) { bool RadXisDefined = (pBaseRadX != 0); bool RadZisDefined = (pBaseRadZ != 0); if((!RadXisDefined) && (!RadZisDefined)) return; +#ifdef _OFFLOAD_GPU //OC28072023 + //TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + //GPU_COND(pvGPU, + //{ + // MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + // //MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + // return; + //} + + if(pvGPU != 0) + { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if(CAuxGPU::GPUEnabled(pGPU)) + { + MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + } + } +#endif + float *tEx = pBaseRadX; float *tEz = pBaseRadZ; diff --git a/cpp/src/core/srradstr_gpu.cu b/cpp/src/core/srradstr_gpu.cu new file mode 100644 index 00000000..5890cbf8 --- /dev/null +++ b/cpp/src/core/srradstr_gpu.cu @@ -0,0 +1,330 @@ +/************************************************************************//** + * File: srradstr_gpu.cu + * Description: Auxiliary structures for various SR calculation methods (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven 
National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" +#include +#include +#include +#include "srradstr.h" + +__global__ void MultiplyElFieldByPhaseLin_Kernel(double xMult, double zMult, float* pBaseRadX, float* pBaseRadZ, int nz, int nx, int ne, float zStart, float zStep, float xStart, float xStep) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < nx && iz < nz) + { + bool RadXisDefined = (pBaseRadX != 0); + bool RadZisDefined = (pBaseRadZ != 0); + + double z = zStart + iz * zStep; + double x = xStart + ix * xStep; + double dPhZ = zMult * z; + double dPh = dPhZ + xMult * x; + double cosPh, sinPh; + sincos(dPh, &sinPh, &cosPh); + + long long offset = iz * nx * ne * 2 + ix * ne * 2; + float* tEx = pBaseRadX + offset; + float* tEz = pBaseRadZ + offset; + for (int ie = 0; ie < ne; ie++) + { + if (RadXisDefined) + { + //*(tEx++) *= a; *(tEx++) *= a; + double newReEx = (*tEx) * cosPh - (*(tEx + 1)) * sinPh; + double newImEx = (*tEx) * sinPh + (*(tEx + 1)) * cosPh; + *(tEx++) = (float)newReEx; *(tEx++) = (float)newImEx; + } + if (RadZisDefined) + { + //*(tEz++) *= a; *(tEz++) *= a; + double newReEz = (*tEz) * cosPh - (*(tEz + 1)) * sinPh; + double newImEz = (*tEz) * sinPh + (*(tEz + 1)) * cosPh; + *(tEz++) = (float)newReEz; *(tEz++) = (float)newImEz; + } + } + } +} + +void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, TGPUUsageArg* pGPU) //OC03082023 +//void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, void* pGpuUsage) +{ + //TGPUUsageArg *pGpuUsage_ = (TGPUUsageArg*)pGpuUsage; //OC03082023 (commented-out) + if (pBaseRadX != NULL) + { + pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, 
pBaseRadX, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadX = (float*)CAuxGPU::ToDevice(pGpuUsage_, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBaseRadX); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pBaseRadX); + } + if (pBaseRadZ != NULL) + { + pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadZ = (float*)CAuxGPU::ToDevice(pGpuUsage_, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBaseRadZ); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pBaseRadZ); + } + + const int bs = 256; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); + dim3 threads(bs, 1); + MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, (float)zStart, (float)zStep, (float)xStart, (float)xStep); + //MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, zStart, zStep, xStart, xStep); + + if (pBaseRadX != NULL) + CAuxGPU::MarkUpdated(pGPU, pBaseRadX, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pBaseRadX, true, false); + if (pBaseRadZ != NULL) + CAuxGPU::MarkUpdated(pGPU, pBaseRadZ, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pBaseRadZ, true, false); + +#ifdef _DEBUG + if (pBaseRadX != NULL) + pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); + if (pBaseRadZ != NULL) + pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); + cudaStreamSynchronize(0); + //auto err = cudaGetLastError(); + //printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +template __global__ void 
MirrorFieldData_Kernel(long ne, long nx, long nz, float* pEX0, float* pEZ0) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < nx && iz < nz) + { + long long PerX = ne << 1; + long long PerZ = PerX * nx; + float buf; + + if (mode == 0) + { + if (ix >= (nx >> 1)) + return; + + long long nx_mi_1 = nx - 1; //OC26042019 + for (long long ie = 0; ie < ne; ie++) + { + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; //OC26042019 + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + else if (mode == 1) + { + if (iz >= (nz >> 1)) + return; + + long long nz_mi_1 = nz - 1; //OC26042019 + for (long long ie = 0; ie < ne; ie++) + { + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long rev_izPerZ = (nz_mi_1 - iz)*PerZ; + long long rev_izPerZ = (nz_mi_1 - iz) * PerZ; + float* rev_pEX_StartForX = pEX0 + rev_izPerZ; + float* rev_pEZ_StartForX = pEZ0 + rev_izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + 
float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + float* rev_pEX = rev_pEX_StartForX + ixPerX_p_Two_ie; + float* rev_pEZ = rev_pEZ_StartForX + ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + else if (mode == 2) + { + if (iz >= (nz >> 1)) + return; + + long long nx_mi_1 = nx - 1; //OC26042019 + long long nz_mi_1 = nz - 1; + for (long long ie = 0; ie < ne; ie++) //OC26042019 + //for(long ie=0; ie> 1); iz++) + long long Two_ie = ie << 1; //OC26042019 + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long rev_izPerZ = (nz_mi_1 - iz)*PerZ; + long long rev_izPerZ = (nz_mi_1 - iz) * PerZ; + float* rev_pEX_StartForX = pEX0 + rev_izPerZ; + float* rev_pEZ_StartForX = pEZ0 + rev_izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = rev_pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = rev_pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + + if (((nz >> 1) << 1) != nz) + { + //long izPerZ = ((nz >> 1) + 1)*PerZ; + long long izPerZ = ((nz >> 1) + 1) * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long ixPerX_p_Two_ie = 
ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + } + } +} + +void srTSRWRadStructAccessData::MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg* pGPU) //OC03082023 +//void srTSRWRadStructAccessData::MirrorFieldData_GPU(int sx, int sz, void* pGpuUsage) +{ + //TGPUUsageArg *pGpuUsage_ = (TGPUUsageArg*)pGpuUsage; //OC03082023 (commented-out) + float *pEX0 = pBaseRadX; + float *pEZ0 = pBaseRadZ; + + if (pEX0 != NULL) + { + pEX0 = (float*)CAuxGPU::ToDevice(pGPU, pEX0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEX0 = (float*)CAuxGPU::ToDevice(pGpuUsage_, pEX0, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEX0); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pEX0); + } + if (pEZ0 != NULL) + { + pEZ0 = (float*)CAuxGPU::ToDevice(pGPU, pEZ0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEZ0 = (float*)CAuxGPU::ToDevice(pGpuUsage_, pEZ0, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEZ0); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pEZ0); + } + + const int bs = 256; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); + dim3 threads(bs, 1); + + if ((sx > 0) && (sz > 0)) + return; + else if ((sx < 0) && (sz > 0)) + MirrorFieldData_Kernel<0> <<>>(ne, nx, nz, pEX0, pEZ0); + else if ((sx > 0) && (sz < 0)) + MirrorFieldData_Kernel<1> <<>> (ne, nx, nz, pEX0, pEZ0); 
+ else + MirrorFieldData_Kernel<2> <<>> (ne, nx, nz, pEX0, pEZ0); + + if (pEX0 != NULL) + CAuxGPU::MarkUpdated(pGPU, pEX0, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pEX0, true, false); + if (pEZ0 != NULL) + CAuxGPU::MarkUpdated(pGPU, pEZ0, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pEZ0, true, false); + +#ifdef _DEBUG + if (pEX0 != NULL) + pEX0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pEX0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEX0 = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pEX0, nz * nx * ne * 2 * sizeof(float)); + if (pEZ0 != NULL) + pEZ0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pEZ0, nz * nx * ne * 2 * sizeof(float)); + //pEZ0 = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pEZ0, nz * nx * ne * 2 * sizeof(float)); + cudaStreamSynchronize(0); + //auto err = cudaGetLastError(); + //printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +#endif \ No newline at end of file diff --git a/cpp/src/lib/auxgpu.cpp b/cpp/src/lib/auxgpu.cpp new file mode 100644 index 00000000..02972cd3 --- /dev/null +++ b/cpp/src/lib/auxgpu.cpp @@ -0,0 +1,368 @@ +/************************************************************************//** + * File: auxgpu.cpp + * Description: Auxiliary utilities to manage GPU usage + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#include +#include +#include + +#ifdef _OFFLOAD_GPU +#include +#endif + +#include "auxgpu.h" + +static bool isGPUAvailable = false; +static bool isGPUEnabled = false; +static bool GPUAvailabilityTested = false; +static bool deviceOffloadInitialized = false; +static int deviceCount = 0; + +#ifdef _OFFLOAD_GPU +typedef struct +{ + void *devicePtr; + void *hostPtr; + size_t size; + bool HostToDevUpdated; + bool DevToHostUpdated; + cudaEvent_t h2d_event; + cudaEvent_t 
d2h_event; +} memAllocInfo_t; +static std::map gpuMap; +static cudaStream_t memcpy_stream; +static bool memcpy_stream_initialized = false; +static int current_device = -1; +#endif + +static void CheckGPUAvailability() +{ +#ifdef _OFFLOAD_GPU + if (!GPUAvailabilityTested) + { + isGPUAvailable = false; + GPUAvailabilityTested = true; + int deviceCount = 0; + if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) + return; + + if (deviceCount < 1) + return; + + isGPUAvailable = true; + } +#else + isGPUAvailable = false; + isGPUEnabled = false; + GPUAvailabilityTested = true; +#endif +} + +bool CAuxGPU::GPUAvailable() +{ + CheckGPUAvailability(); + return isGPUAvailable; +} + +bool CAuxGPU::GPUEnabled(TGPUUsageArg *arg) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return false; + if (arg->deviceIndex > 0) { + if (arg->deviceIndex <= deviceCount) + { + if (memcpy_stream_initialized && current_device != arg->deviceIndex) + { + cudaStreamDestroy(memcpy_stream); + memcpy_stream_initialized = false; + } + cudaSetDevice(arg->deviceIndex - 1); + if (!memcpy_stream_initialized) + cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking); + current_device = arg->deviceIndex; + memcpy_stream_initialized = true; + } + //TODO: Add warning that GPU isn't available + return GPUAvailable(); + } +#endif + return false; +} + +void CAuxGPU::SetGPUStatus(bool enabled) +{ + isGPUEnabled = enabled && GPUAvailable(); +} + +int CAuxGPU::GetDevice(TGPUUsageArg* arg) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return cudaCpuDeviceId; + + int curDevice = 0; + cudaGetDevice(&curDevice); + return curDevice; +#else + return 0; +#endif +} + +void* CAuxGPU::ToDevice(TGPUUsageArg* arg, void* hostPtr, size_t size, bool dontCopy) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return hostPtr; + if (arg->deviceIndex == 0) + return hostPtr; + if (hostPtr == NULL) + return hostPtr; + if (size == 0) + return hostPtr; + if (!GPUEnabled(arg)) + return hostPtr; + if (gpuMap.find(hostPtr) != gpuMap.end()){ 
+ memAllocInfo_t info = gpuMap[hostPtr]; + void* devPtr = info.devicePtr; + hostPtr = info.hostPtr; + if (gpuMap[devPtr].HostToDevUpdated && !dontCopy){ + cudaMemcpyAsync(devPtr, hostPtr, size, cudaMemcpyHostToDevice, memcpy_stream); + cudaEventRecord(gpuMap[devPtr].h2d_event, memcpy_stream); + } +//#if _DEBUG +// printf("ToDevice: %p -> %p, %d, D2H: %d, H2D: %d\n", hostPtr, devPtr, size, gpuMap[devPtr].DevToHostUpdated, gpuMap[devPtr].HostToDevUpdated); //HG28072023 +//#endif + gpuMap[devPtr].HostToDevUpdated = false; + return devPtr; + } + + void *devicePtr = NULL; + cudaError_t err = cudaMalloc(&devicePtr, size); + if (err != cudaSuccess) // Try again after freeing up some memory HG24072023 + { + cudaStreamSynchronize(0); + err = cudaMalloc(&devicePtr, size); + } + if (err != cudaSuccess) + return NULL; +//#if _DEBUG +// printf("ToDevice: %p -> %p, %d\n", hostPtr, devicePtr, size); //HG28072023 +//#endif + memAllocInfo_t info; + info.devicePtr = devicePtr; + info.hostPtr = hostPtr; + info.DevToHostUpdated = false; + info.HostToDevUpdated = false; + cudaEventCreateWithFlags(&info.h2d_event, cudaEventDisableTiming); + cudaEventCreateWithFlags(&info.d2h_event, cudaEventDisableTiming); + if (!dontCopy){ + cudaMemcpyAsync(devicePtr, hostPtr, size, cudaMemcpyHostToDevice, memcpy_stream); + cudaEventRecord(info.h2d_event, memcpy_stream); + } + info.size = size; + gpuMap[hostPtr] = info; + gpuMap[devicePtr] = info; + return devicePtr; +#else + return hostPtr; +#endif +} + +void CAuxGPU::EnsureDeviceMemoryReady(TGPUUsageArg* arg, void* hostPtr) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return; + if (arg->deviceIndex == 0) + return; + if (hostPtr == NULL) + return; + if (!GPUEnabled(arg)) + return; + if (gpuMap.find(hostPtr) != gpuMap.end()){ + void* devPtr = gpuMap[hostPtr].devicePtr; + if (gpuMap[devPtr].HostToDevUpdated){ + cudaStreamWaitEvent(0, gpuMap[devPtr].h2d_event); + } +//#if _DEBUG +// printf("EnsureDeviceMemoryReady: %p -> %p, %d, D2H: %d, H2D: %d\n", 
hostPtr, devPtr, gpuMap[devPtr].size, gpuMap[devPtr].DevToHostUpdated, gpuMap[devPtr].HostToDevUpdated); //HG28072023 +//#endif + } +#endif +} + +void* CAuxGPU::GetHostPtr(TGPUUsageArg* arg, void* devicePtr) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return devicePtr; + if (arg->deviceIndex == 0) + return devicePtr; + if (devicePtr == NULL) + return devicePtr; + if (!GPUEnabled(arg)) + return devicePtr; + memAllocInfo_t info; + if (gpuMap.find(devicePtr) == gpuMap.end()) + return devicePtr; + info = gpuMap[devicePtr]; +//#if _DEBUG +// printf("GetHostPtr: %p -> %p\n", devicePtr, info.hostPtr); //HG28072023 +//#endif + return info.hostPtr; +#else + return devicePtr; +#endif +} + +void* CAuxGPU::ToHostAndFree(TGPUUsageArg* arg, void* devicePtr, size_t size, bool dontCopy) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return devicePtr; + if (arg->deviceIndex == 0) + return devicePtr; + if (devicePtr == NULL) + return devicePtr; + if (size == 0) + return devicePtr; + if (!GPUEnabled(arg)) + return devicePtr; + memAllocInfo_t info; + if (gpuMap.find(devicePtr) == gpuMap.end()) + return devicePtr; + info = gpuMap[devicePtr]; + devicePtr = info.devicePtr; + void *hostPtr = info.hostPtr; + if (!dontCopy && info.DevToHostUpdated) + { + cudaStreamWaitEvent(memcpy_stream, info.d2h_event, 0); + cudaMemcpyAsync(hostPtr, devicePtr, size, cudaMemcpyDeviceToHost, memcpy_stream); + cudaEventRecord(info.d2h_event); + cudaEventSynchronize(info.d2h_event); // we can't treat host memory as valid until the copy is complete + } +//#if _DEBUG +// printf("ToHostAndFree: %p -> %p, %d\n", devicePtr, hostPtr, size); //HG28072023 +//#endif + cudaStreamWaitEvent(0, info.h2d_event); + cudaStreamWaitEvent(0, info.d2h_event); + cudaFreeAsync(devicePtr, 0); + cudaEventDestroy(info.h2d_event); + cudaEventDestroy(info.d2h_event); + gpuMap.erase(devicePtr); + gpuMap.erase(hostPtr); + return hostPtr; +#else + return devicePtr; +#endif +} + +void CAuxGPU::FreeHost(void* ptr) +{ +#ifdef _OFFLOAD_GPU + 
if (ptr == NULL) + return; + if (gpuMap.find(ptr) == gpuMap.end()) + return; + memAllocInfo_t info = gpuMap[ptr]; + void *hostPtr = info.hostPtr; + void *devicePtr = info.devicePtr; +//#if _DEBUG +// printf("FreeHost: %p, %p\n", devicePtr, hostPtr); +//#endif + //cudaStreamWaitEvent(0, info.h2d_event); + //cudaStreamWaitEvent(0, info.d2h_event); + cudaFreeAsync(devicePtr, 0); + //cudaEventDestroy(info.h2d_event); + //cudaEventDestroy(info.d2h_event); + std::free(hostPtr); //OC02082023 + //CAuxGPU::free(hostPtr); + gpuMap.erase(devicePtr); + gpuMap.erase(hostPtr); +#endif + return; +} + +void CAuxGPU::MarkUpdated(TGPUUsageArg* arg, void* ptr, bool devToHost, bool hostToDev) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return; + if (arg->deviceIndex == 0) + return; + if (ptr == NULL) + return; + if (!GPUEnabled(arg)) + return; + if (gpuMap.find(ptr) == gpuMap.end()) + return; + void* devPtr = gpuMap[ptr].devicePtr; + void* hostPtr = gpuMap[ptr].hostPtr; + gpuMap[devPtr].DevToHostUpdated = devToHost; + gpuMap[devPtr].HostToDevUpdated = hostToDev; + gpuMap[hostPtr].DevToHostUpdated = devToHost; + gpuMap[hostPtr].HostToDevUpdated = hostToDev; + if (devToHost) + cudaEventRecord(gpuMap[devPtr].d2h_event, 0); +//#if _DEBUG +// printf("MarkUpdated: %p -> %p, D2H: %d, H2D: %d\n", ptr, devPtr, devToHost, hostToDev); +//#endif +#endif +} + +void CAuxGPU::Init() { + deviceOffloadInitialized = true; +#ifdef _OFFLOAD_GPU + cudaGetDeviceCount(&deviceCount); + cudaDeviceSynchronize(); +#endif +} + +void CAuxGPU::Fini() { +#ifdef _OFFLOAD_GPU + // Copy back all updated data + bool updated = false; + bool freed = false; + for (std::map::const_iterator it = gpuMap.cbegin(); it != gpuMap.cend(); it++) + { + if (it->second.DevToHostUpdated){ + cudaStreamWaitEvent(memcpy_stream, it->second.d2h_event, 0); + cudaMemcpyAsync(it->second.hostPtr, it->second.devicePtr, it->second.size, cudaMemcpyDeviceToHost, memcpy_stream); +//#if _DEBUG +// printf("Fini: %p -> %p, %d\n", 
it->second.devicePtr, it->second.hostPtr, it->second.size); +//#endif + updated = true; + gpuMap[it->second.hostPtr].DevToHostUpdated = false; + gpuMap[it->second.devicePtr].DevToHostUpdated = false; + } + } + for (std::map::const_iterator it = gpuMap.cbegin(); it != gpuMap.cend(); it++) + { + if (it->first == it->second.devicePtr) + { + cudaStreamWaitEvent(0, it->second.h2d_event); + cudaStreamWaitEvent(0, it->second.d2h_event); + cudaFreeAsync(it->second.devicePtr, 0); + freed = true; + cudaEventDestroy(it->second.h2d_event); + cudaEventDestroy(it->second.d2h_event); + } + } + if (updated | freed) + cudaStreamSynchronize(0); + gpuMap.clear(); +//#if _DEBUG +// printf("Fini: %d\n", gpuMap.size()); +//#endif +#endif +} \ No newline at end of file diff --git a/cpp/src/lib/auxgpu.h b/cpp/src/lib/auxgpu.h new file mode 100644 index 00000000..9d64d450 --- /dev/null +++ b/cpp/src/lib/auxgpu.h @@ -0,0 +1,62 @@ +/************************************************************************//** + * File: auxgpu.h + * Description: Auxiliary utilities to manage GPU usage + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifndef __UTIGPU_H +#define __UTIGPU_H + +#include +#include + +#ifdef _OFFLOAD_GPU +#include +#include +//#if CUDART_VERSION < 11020 +//#error CUDA version too low, need at least 11.2 +//#endif +#endif + +typedef struct +{ + int deviceIndex; // -1 means no device, TODO +} TGPUUsageArg; + +#ifdef _OFFLOAD_GPU +#define GPU_COND(arg, code) if (arg && CAuxGPU::GPUEnabled((TGPUUsageArg*)arg)) { code } +//#define GPU_COND(arg, code) if (arg && CAuxGPU::GPUEnabled(arg)) { code } +#define GPU_PORTABLE __device__ __host__ +#else +#define GPU_COND(arg, code) if(0) { } +#define GPU_PORTABLE +#endif + + 
//************************************************************************* +class CAuxGPU +{ +private: +public: + static void Init(); + static void Fini(); + static bool GPUAvailable(); //CheckGPUAvailable etc + static bool GPUEnabled(TGPUUsageArg *arg); + static void SetGPUStatus(bool enabled); + static int GetDevice(TGPUUsageArg* arg); + static void* ToDevice(TGPUUsageArg* arg, void* hostPtr, size_t size, bool dontCopy = false); + static void* GetHostPtr(TGPUUsageArg* arg, void* devicePtr); + static void* ToHostAndFree(TGPUUsageArg* arg, void* devicePtr, size_t size, bool dontCopy = false); + static void EnsureDeviceMemoryReady(TGPUUsageArg* arg, void* devicePtr); + static void FreeHost(void* ptr); + static void MarkUpdated(TGPUUsageArg* arg, void* ptr, bool devToHost, bool hostToDev); +}; + +//************************************************************************* +#endif \ No newline at end of file diff --git a/cpp/src/lib/srwlib.cpp b/cpp/src/lib/srwlib.cpp index c36043c7..5bc9a324 100644 --- a/cpp/src/lib/srwlib.cpp +++ b/cpp/src/lib/srwlib.cpp @@ -29,6 +29,9 @@ #include "srisosrc.h" #include "srmatsta.h" +#ifdef _OFFLOAD_GPU +#include "auxgpu.h" //OC27072023 +#endif //#include //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP //------------------------------------------------------------------------- @@ -751,7 +754,8 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr //------------------------------------------------------------------------- -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj, void* pGPU) //OC26072023 +//EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double *pMeth) //OC16122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, int *pMeth) //OC13122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y) @@ -994,7 +998,8 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr) //------------------------------------------------------------------------- -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pGPU) //OC26072023 (from HG) +//EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt) { if((pWfr == 0) || (pOpt == 0)) return 
SRWL_INCORRECT_PARAM_FOR_WFR_PROP; @@ -1047,7 +1052,8 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* //------------------------------------------------------------------------- -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU) //OC26072023 +//EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) { if((pcData == 0) || (arMesh == 0) || ((typeData != 'f') && (typeData != 'd')) || (nMesh < 3) || (dir == 0)) return SRWL_INCORRECT_PARAM_FOR_FFT; //OC31012019 @@ -1538,6 +1544,53 @@ EXP int CALL srwlPropagRadMultiE(SRWLStokes* pStokes, SRWLWfr* pWfr0, SRWLOptC* return 0; } +//------------------------------------------------------------------------- +#ifdef _OFFLOAD_GPU //OC30102023 + +EXP bool CALL srwlUtiGPUAvailable() //OC27072023 +//EXP bool CALL srwlAuxGpuAvailable() //HG +{ + return CAuxGPU::GPUAvailable(); //OC05092023 + //return AuxGpu::GPUAvailable(); +} + +//------------------------------------------------------------------------- + +EXP bool CALL srwlUtiGPUEnabled() //OC27072023 +//EXP bool CALL srwlAuxGpuEnabled() //HG +{ + return CAuxGPU::GPUEnabled(nullptr); //OC05092023 + //return AuxGpu::GPUEnabled(nullptr); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUSetStatus(bool enable) //OC27072023 +//EXP void CALL srwlAuxGpuSetStatus(bool enable) //HG +{ + CAuxGPU::SetGPUStatus(enable); //OC05092023 + //AuxGpu::SetGPUStatus(enable); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUInit() //OC27072023 +//EXP void CALL srwlAuxGpuInit() //HG +{ + CAuxGPU::Init(); //OC05092023 (why void?) 
+ //AuxGpu::Init(); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUFini() //OC27072023 +//EXP void CALL srwlAuxGpuFini() //HG +{ + CAuxGPU::Fini(); //OC05092023 (why void?) + //AuxGpu::Fini(); +} + +#endif //------------------------------------------------------------------------- //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: /* diff --git a/cpp/src/lib/srwlib.h b/cpp/src/lib/srwlib.h index aa448d31..ff81e0ff 100644 --- a/cpp/src/lib/srwlib.h +++ b/cpp/src/lib/srwlib.h @@ -732,7 +732,8 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0, void* pGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y); @@ -802,7 +803,8 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr); * @return integer error (>0) or warnig (<0) code * @see ... 
*/ -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt); /** TEST @@ -849,7 +851,8 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); /** * Convolves real data with 1D or 2D Gaussian (depending on arguments) @@ -964,6 +967,43 @@ EXP int CALL srwlUtiUndFromMagFldTab(SRWLMagFldC* pUndCnt, SRWLMagFldC* pMagCnt, */ EXP int CALL srwlUtiUndFindMagFldInterpInds(int* arResInds, int* pnResInds, double* arGaps, double* arPhases, int nVals, double arPrecPar[5]); +/** + * Checks if GPU offloading is available + * @return true if available + * @see ... + */ +EXP bool CALL srwlUtiGPUAvailable(); //OC26072023 +//EXP bool CALL srwlAuxGpuAvailable(); //HG + +/** + * Checks if GPU offloading is enabled + * @return true if enabled + * @see ... + */ +EXP bool CALL srwlUtiGPUEnabled(); //OC26072023 +//EXP bool CALL srwlAuxGpuEnabled(); //HG + +/** + * Enable/Disable GPU offloading + * @see ... + */ +EXP void CALL srwlUtiGPUSetStatus(bool enable); +//EXP void CALL srwlAuxGpuSetStatus(bool enable); //HG + +/** + * Initialize device offloading + * @see ... 
+ */ +EXP void CALL srwlUtiGPUInit(); //OC26072023 +//EXP void CALL srwlAuxGpuInit(); //HG + +/** + * Finalize device offloading + * @see ... + */ +EXP void CALL srwlUtiGPUFini(); //OC26072023 +//EXP void CALL srwlAuxGpuFini(); //HG + /** * These functions were added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP EXP void CALL srwlPrintTime(const char* str, double* start); From 965482631b424261a7dad5866e24d9cb25830d61 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Mon, 4 Dec 2023 02:04:11 -0500 Subject: [PATCH 2/9] Port over most previous GPU related changes. --- cpp/gcc/Makefile | 41 + cpp/py/setup.py | 6 +- cpp/src/core/gmfft.cpp | 1555 -------------------- cpp/src/core/gmfft.h | 1042 ------------- cpp/src/core/gmfft_gpu.h | 43 - cpp/src/core/sroptang.h | 12 +- cpp/src/core/sroptapt.h | 24 +- cpp/src/core/sroptcnt.cpp | 62 +- cpp/src/core/sroptcnt.h | 9 +- cpp/src/core/sroptcryst.h | 9 +- cpp/src/core/sroptdrf.cpp | 84 +- cpp/src/core/sroptdrf.h | 88 +- cpp/src/core/sroptdrf_gpu.cu | 29 + cpp/src/core/sroptel2.cpp | 3 +- cpp/src/core/sroptelm.cpp | 380 +++-- cpp/src/core/sroptelm.h | 74 +- cpp/src/core/sroptelm_gpu.cu | 587 ++++++++ cpp/src/core/sroptelm_gpu.h | 123 ++ cpp/src/core/sroptgtr.cpp | 4 +- cpp/src/core/sroptgtr.h | 203 ++- cpp/src/core/sroptgtr_gpu.cu | 32 + cpp/src/core/srradmnp.cpp | 323 ++-- cpp/src/core/srradmnp.h | 66 +- cpp/src/core/srradmnp_gpu.cu | 112 +- cpp/src/core/srradstr.cpp | 11 +- cpp/src/core/srradstr.h | 9 +- cpp/src/core/srradstr_gpu.cu | 12 +- cpp/src/core/srstraux.h | 23 + cpp/src/ext/genmath/gmfft.cpp | 1204 +++++++++++---- cpp/src/ext/genmath/gmfft.h | 75 +- cpp/src/{core => ext/genmath}/gmfft_gpu.cu | 307 ++-- cpp/src/ext/genmath/gmmeth.h | 8 + cpp/src/ext/utils/utidev.cpp | 97 -- cpp/src/ext/utils/utidev.h | 71 - cpp/src/lib/auxgpu.cpp | 2 + cpp/src/lib/srwlib.cpp | 46 +- cpp/src/lib/srwlib.h | 42 +- cpp/vc/SRW.sln | 50 +- cpp/vc/SRWLClientPython.vcxproj | 2 +- cpp/vc/SRWLIB.vcxproj | 213 ++- 
cpp/vc/SRWLIB.vcxproj.filters | 32 +- 41 files changed, 3182 insertions(+), 3933 deletions(-) delete mode 100644 cpp/src/core/gmfft.cpp delete mode 100644 cpp/src/core/gmfft.h delete mode 100644 cpp/src/core/gmfft_gpu.h create mode 100644 cpp/src/core/sroptdrf_gpu.cu create mode 100644 cpp/src/core/sroptelm_gpu.cu create mode 100644 cpp/src/core/sroptelm_gpu.h create mode 100644 cpp/src/core/sroptgtr_gpu.cu rename cpp/src/{core => ext/genmath}/gmfft_gpu.cu (54%) delete mode 100644 cpp/src/ext/utils/utidev.cpp delete mode 100644 cpp/src/ext/utils/utidev.h diff --git a/cpp/gcc/Makefile b/cpp/gcc/Makefile index 69864471..23797d0f 100644 --- a/cpp/gcc/Makefile +++ b/cpp/gcc/Makefile @@ -27,6 +27,12 @@ else endif endif +# HG30112023 +CUDA_PATH ?= /usr/local/cuda +CUDA_MATHLIBS_PATH ?= /usr/local/cuda +NVCC = $(CUDA_PATH)/bin/nvcc +NVCXX = $(CUDA_PATH)/bin/nvc++ + SRW_SRC_DEF= -D_GNU_SOURCE -D__USE_XOPEN2K8 -DFFTW_ENABLE_FLOAT -D_GM_WITHOUT_BASE -DSRWLIB_STATIC -DNO_TIMER -DANSI_DECLARATORS -DTRILIBRARY $(OSFLAG) SRW_INCLUDES= -I$(SRW_SRC_GEN_DIR) -I$(SRW_SRC_LIB_DIR) -I$(SH_SRC_PARSE_DIR) -I$(SH_SRC_GEN_MATH_DIR) $(SRW_SRC_DEF) SRW_CFLAGS= -O3 -fPIC @@ -35,6 +41,17 @@ LDFLAGS=-L$(LIB_DIR) -lm ifeq ($(MODE), omp) SRW_CFLAGS+= -D_WITH_OMP -fopenmp -Wno-write-strings LDFLAGS+= -lfftw +else #HG30112023 +ifeq ($(MODE), cuda) +CUDA_INCLUDES = -I$(CUDA_PATH)/include -I$(CUDA_MATHLIBS_PATH)/include +CUDA_LIBS = -L$(CUDA_PATH)/lib64 -L$(CUDA_MATHLIBS_PATH)/lib64 + +SRW_SRC_DEF += -D_OFFLOAD_GPU -DUSE_CUDA -D_FFTW3 +SRW_INCLUDES += $(CUDA_INCLUDES) +SRW_CFLAGS += -std=c++17 +LDFLAGS += $(CUDA_LIBS) -lcudart_static -lcudadevrt -lcufft -lrt +NVCFLAGS = -O3 -arch=sm_80 -dlto -rdc=true +CUDA_OBJ=gmfft_gpu.o srradstr_gpu.o sroptelm_gpu.o sroptdrf_gpu.o sroptgtr_gpu.o srradmnp_gpu.o else ifeq ($(MODE), 0) SRW_CFLAGS+= -D_FFTW3 @@ -43,6 +60,7 @@ else $(error Unknown SRW compilation option) endif endif +endif PYFLAGS=-I$(shell python -c "from __future__ import print_function; from 
sysconfig import get_paths as gp; print(gp()['include'])") PYFLAGS+=-L$(shell python -c "from __future__ import print_function; from sysconfig import get_paths as gp; import os; print(os.path.join(gp()['stdlib'], '../libs'))") @@ -71,6 +89,10 @@ OBJ += timerec.o track.o srerror.o # src/lib OBJ += srwlib.o +# HG30112023 +ifeq ($(MODE), cuda) +OBJ += auxgpu.o +endif PRG= libsrw.a @@ -89,6 +111,24 @@ PRG= libsrw.a %.o: $(SRW_SRC_GENESIS_DIR)/%.c $(CC) $(CFLAGS) -c $< +# HG30112023 +ifeq ($(MODE), cuda) +lib: $(CUDA_OBJ) $(OBJ) + $(NVCC) $(NVCFLAGS) -Xcompiler="$(SRW_CFLAGS)" -dlink -o srwl_link.o *.o $(LDFLAGS) + ar -cvq $(PRG) *.o + #cp $(PRG) $(LIB_DIR)/ + rm -f *.o + +%.o: $(SRW_SRC_LIB_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +%.o: $(SH_SRC_GEN_MATH_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +%.o: $(SRW_SRC_GEN_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +else lib: $(OBJ) ar -cvq $(PRG) *.o #cp $(PRG) $(LIB_DIR)/ @@ -102,6 +142,7 @@ lib: $(OBJ) %.o: $(SRW_SRC_GEN_DIR)/%.cu $(NVCC) -x=c++ -Xcompiler="$(CFLAGS)" -c $< +endif pylib: $(CXX) -shared $(CFLAGS) $(PYFLAGS) -o srwlpy.so $(SRW_SRC_DIR)/clients/python/srwlpy.cpp libsrw.a $(LDFLAGS) diff --git a/cpp/py/setup.py b/cpp/py/setup.py index 013e8824..075520e1 100644 --- a/cpp/py/setup.py +++ b/cpp/py/setup.py @@ -20,7 +20,11 @@ if 'MODE' in os.environ: sMode = str(os.environ['MODE']) - if sMode == 'omp': + if sMode == 'cuda': # HG30112023 + ext_kwargs.update({'libraries': ['srw', 'm', 'cudart_static', 'cudadevrt', 'cufft', 'fftw3f', 'fftw3', 'rt'], 'extra_compile_args': ['-O3', '-mavx2', '-fno-math-errno']}) + ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_PATH'])) + ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_MATHLIBS_PATH'])) + elif sMode == 'omp': #ext_kwargs.update({'extra_link_args': ['-fopenmp'], 
ext_kwargs.update({'libraries': ['srw', 'm', 'fftw'], #OC07022019 'extra_link_args': ['-fopenmp'], diff --git a/cpp/src/core/gmfft.cpp b/cpp/src/core/gmfft.cpp deleted file mode 100644 index 6e59db8a..00000000 --- a/cpp/src/core/gmfft.cpp +++ /dev/null @@ -1,1555 +0,0 @@ -/************************************************************************//** - * File: gmfft.cpp - * Description: Auxiliary utilities to work with FFTW library - * Project: - * First release: 2000 - * - * Copyright (C) European Synchrotron Radiation Facility, Grenoble, France - * All Rights Reserved - * - * @author O.Chubar, P.Elleaume - * @author S. Yakubov (E-XFEL) - noticed issue and suggested fix in FFT1D - * @version 1.1 - ***************************************************************************/ - -#include "gmfft.h" - -#ifdef _OFFLOAD_GPU -#include "gmfft_gpu.h" -#endif - -//#include "srwlib.h" //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP - -#ifdef _WITH_OMP //OC27102018 -//SY: adopted for OpenMP -#include "omp.h" -#endif - -//************************************************************************* - -long CGenMathFFT::GoodNumbers[] = { - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 36, 40, 42, 44, - 48, 50, 52, 54, 56, 60, 64, 66, 70, 72, 78, 80, 84, 88, 90, 96, 98, 100, 104, - 108, 110, 112, 120, 126, 128, 130, 132, 140, 144, 150, 154, 156, 160, 162, - 168, 176, 180, 182, 192, 196, 198, 200, 208, 210, 216, 220, 224, 234, 240, - 250, 252, 256, 260, 264, 270, 280, 286, 288, 294, 300, 308, 312, 320, 324, - 330, 336, 350, 352, 360, 364, 378, 384, 390, 392, 396, 400, 416, 420, 432, - 440, 448, 450, 462, 468, 480, 486, 490, 500, 504, 512, 520, 528, 540, 546, - 550, 560, 572, 576, 588, 594, 600, 616, 624, 630, 640, 648, 650, 660, 672, - 686, 700, 702, 704, 720, 728, 750, 756, 768, 770, 780, 784, 792, 800, 810, - 832, 840, 858, 864, 880, 882, 896, 900, 910, 924, 936, 960, 972, 980, 990, - 1000, 1008, 1024, 1040, 1050, 1056, 1078, 1080, 1092, 1100, 1120, 
1134, 1144, - 1152, 1170, 1176, 1188, 1200, 1232, 1248, 1250, 1260, 1274, 1280, 1296, 1300, - 1320, 1344, 1350, 1372, 1386, 1400, 1404, 1408, 1430, 1440, 1456, 1458, 1470, - 1500, 1512, 1536, 1540, 1560, 1568, 1584, 1600, 1620, 1638, 1650, 1664, 1680, - 1716, 1728, 1750, 1760, 1764, 1782, 1792, 1800, 1820, 1848, 1872, 1890, 1920, - 1944, 1950, 1960, 1980, 2000, 2002, 2016, 2048, 2058, 2080, 2100, 2106, 2112, - 2156, 2160, 2184, 2200, 2240, 2250, 2268, 2288, 2304, 2310, 2340, 2352, 2376, - 2400, 2430, 2450, 2464, 2496, 2500, 2520, 2548, 2560, 2574, 2592, 2600, 2640, - 2646, 2688, 2700, 2730, 2744, 2750, 2772, 2800, 2808, 2816, 2860, 2880, 2912, - 2916, 2940, 2970, 3000, 3024, 3072, 3080, 3120, 3136, 3150, 3168, 3200, 3234, - 3240, 3250, 3276, 3300, 3328, 3360, 3402, 3430, 3432, 3456, 3500, 3510, 3520, - 3528, 3564, 3584, 3600, 3640, 3696, 3744, 3750, 3780, 3822, 3840, 3850, 3888, - 3900, 3920, 3960, 4000, 4004, 4032, 4050, 4096, 4116, 4158, 4160, 4200, 4212, - 4224, 4290, 4312, 4320, 4368, 4374, 4400, 4410, 4480, 4500, 4536, 4550, 4576, - 4608, 4620, 4680, 4704, 4752, 4800, 4802, 4860, 4900, 4914, 4928, 4950, 4992, - 5000, 5040, 5096, 5120, 5148, 5184, 5200, 5250, 5280, 5292, 5346, 5376, - 5390, 5400, 5460, 5488, 5500, 5544, 5600, 5616, 5632, 5670, 5720, 5760, 5824, - 5832, 5850, 5880, 5940, 6000, 6006, 6048, 6144, 6160, 6174, 6240, 6250, 6272, - 6300, 6318, 6336, 6370, 6400, 6468, 6480, 6500, 6552, 6600, 6656, 6720, 6750, - 6804, 6860, 6864, 6912, 6930, 7000, 7020, 7040, 7056, 7128, 7150, 7168, 7200, - 7280, 7290, 7350, 7392, 7488, 7500, 7546, 7560, 7644, 7680, 7700, 7722, 7776, - 7800, 7840, 7920, 7938, 8000, 8008, 8064, 8100, 8190, 8192, 8232, 8250, 8316, - 8320, 8400, 8424, 8448, 8580, 8624, 8640, 8736, 8748, 8750, 8800, 8820, 8910, - 8918, 8960, 9000, 9072, 9100, 9152, 9216, 9240, 9360, 9408, 9450, 9504, 9600, - 9604, 9702, 9720, 9750, 9800, 9828, 9856, 9900, 9984, 10000, 10010, 10080, - 10192, 10206, 10240, 10290, 10296, 10368, 10400, 10500, 10530, 10560, 
10584, - 10692, 10752, 10780, 10800, 10920, 10976, 11000, 11088, 11200, 11232, 11250, - 11264, 11340, 11440, 11466, 11520, 11550, 11648, 11664, 11700, 11760, 11880, - 12000, 12012, 12096, 12150, 12250, 12288, 12320, 12348, 12474, 12480, 12500, - 12544, 12600, 12636, 12672, 12740, 12800, 12870, 12936, 12960, 13000, 13104, - 13122, 13200, 13230, 13312, 13440, 13500, 13608, 13650, 13720, 13728, 13750, - 13824, 13860, 14000, 14014, 14040, 14080, 14112, 14256, 14300, 14336, 14400, - 14406, 14560, 14580, 14700, 14742, 14784, 14850, 14976, 15000, 15092, 15120, - 15288, 15360, 15400, 15444, 15552, 15600, 15680, 15750, 15840, 15876, 16000, - 16016, 16038, 16128, 16170, 16200, 16250, 16380, 16384, 16464, 16500, 16632, - 16640, 16800, 16848, 16896, 17010, 17150, 17160, 17248, 17280, 17472, 17496, - 17500, 17550, 17600, 17640, 17820, 17836, 17920, 18000, 18018, 18144, 18200, - 18304, 18432, 18480, 18522, 18720, 18750, 18816, 18900, 18954, 19008, 19110, - 19200, 19208, 19250, 19404, 19440, 19500, 19600, 19656, 19712, 19800, 19968, - 20000, 20020, 20160, 20250, 20384, 20412, 20480, 20580, 20592, 20736, 20790, - 20800, 21000, 21060, 21120, 21168, 21384, 21450, 21504, 21560, 21600, 21840, - 21870, 21952, 22000, 22050, 22176, 22400, 22464, 22500, 22528, 22638, 22680, - 22750, 22880, 22932, 23040, 23100, 23166, 23296, 23328, 23400, 23520, 23760, - 23814, 24000, 24010, 24024, 24192, 24300, 24500, 24570, 24576, 24640, 24696, - 24750, 24948, 24960, 25000, 25088, 25200, 25272, 25344, 25480, 25600, 25740, - 25872, 25920, 26000, 26208, 26244, 26250, 26400, 26460, 26624, 26730, 26754, - 26880, 26950, 27000, 27216, 27300, 27440, 27456, 27500, 27648, 27720, 28000, - 28028, 28080, 28160, 28224, 28350, 28512, 28600, 28672, 28800, 28812, 29106, - 29120, 29160, 29250, 29400, 29484, 29568, 29700, 29952, 30000, 30030, 30184, - 30240, 30576, 30618, 30720, 30800, 30870, 30888, 31104, 31200, 31250, 31360, - 31500, 31590, 31680, 31752, 31850, 32000, 32032, 32076, 32256, 32340, 32400, - 32500, 32760, 
32768, 32928, 33000, 33264, 33280, 33600, 33614, 33696, 33750, - 33792, 34020, 34300, 34320, 34398, 34496, 34560, 34650, 34944, 34992, 35000, - 35100, 35200, 35280, 35640, 35672, 35750, 35840, 36000, 36036, 36288, 36400, - 36450, 36608, 36750, 36864, 36960, 37044, 37422, 37440, 37500, 37632, 37730, - 37800, 37908, 38016, 38220, 38400, 38416, 38500, 38610, 38808, 38880, 39000, - 39200, 39312, 39366, 39424, 39600, 39690, 39936, 40000, 40040, 40320, 40500, - 40768, 40824, 40950, 40960, 41160, 41184, 41250, 41472, 41580, 41600, 42000, - 42042, 42120, 42240, 42336, 42768, 42900, 43008, 43120, 43200, 43218, 43680, - 43740, 43750, 43904, 44000, 44100, 44226, 44352, 44550, 44590, 44800, 44928, - 45000, 45056, 45276, 45360, 45500, 45760, 45864, 46080, 46200, 46332, 46592, - 46656, 46800, 47040, 47250, 47520, 47628, 48000, 48020, 48048, 48114, 48384, - 48510, 48600, 48750, 49000, 49140, 49152, 49280, 49392, 49500, 49896, 49920, - 50000, 50050, 50176, 50400, 50544, 50688, 50960, 51030, 51200, 51450, 51480, - 51744, 51840, 52000, 52416, 52488, 52500, 52650, 52800, 52822, 52920, 53248, - 53460, 53508, 53760, 53900, 54000, 54054, 54432, 54600, 54880, 54912, 55000, - 55296, 55440, 55566, 56000, 56056, 56160, 56250, 56320, 56448, 56700, 56862, - 57024, 57200, 57330, 57344, 57600, 57624, 57750, 58212, 58240, 58320, 58500, - 58800, 58968, 59136, 59400, 59904, 60000, 60060, 60368, 60480, 60750, 61152, - 61236, 61250, 61440, 61600, 61740, 61776, 62208, 62370, 62400, 62426, 62500, - 62720, 63000, 63180, 63360, 63504, 63700, 64000, 64064, 64152, 64350, 64512, - 64680, 64800, 65000, 65520, 65536, 65610, 65856, 66000, 66150, 66528, 66560, - 67200, 67228, 67392, 67500, 67584, 67914, 68040, 68250, 68600, 68640, 68750, - 68796, 68992, 69120, 69300, 69498, 69888, 69984, 70000, 70070, 70200, 70400, - 70560, 71280, 71344, 71442, 71500, 71680, 72000, 72030, 72072, 72576, 72800, - 72900, 73216, 73500, 73710, 73728, 73920, 74088, 74250, 74844, 74880, 75000, - 75264, 75460, 75600, 75816, 76032, 
76440, 76800, 76832, 77000, 77220, 77616, - 77760, 78000, 78400, 78624, 78732, 78750, 78848, 79200, 79380, 79872, 80000, - 80080, 80190, 80262, 80640, 80850, 81000, 81250, 81536, 81648, 81900, 81920, - 82320, 82368, 82500, 82944, 83160, 83200, 84000, 84084, 84240, 84480, 84672, - 85050, 85536, 85750, 85800, 86016, 86240, 86400, 86436, 87318, 87360, 87480, - 87500, 87750, 87808, 88000, 88200, 88452, 88704, 89100, 89180, 89600, 89856, - 90000, 90090, 90112, 90552, 90720, 91000, 91520, 91728, 91854, 92160, 92400, - 92610, 92664, 93184, 93312, 93600, 93750, 94080, 94500, 94770, 95040, 95256, - 95550, 96000, 96040, 96096, 96228, 96250, 96768, 97020, 97200, 97500, 98000, - 98098, 98280, 98304, 98560, 98784, 99000, 99792, 99840, 100000 -}; -long CGenMathFFT::LenGoodNumbers = 1151; //637; - -long CGenMathFFT::GoodNum100s[] = { 0,37,61,79,95,107,120,130,142,151,159 }; -long CGenMathFFT::LenGoodNum100s = 11; - -long CGenMathFFT::GoodNum1000s[] = { 0,159,228,279,318,354,383,410,435,459,479 }; -long CGenMathFFT::LenGoodNum1000s = 11; - -long CGenMathFFT::GoodNum10000s[] = { 0,479,636,743,830,900,960,1017,1064,1109,1150 }; -long CGenMathFFT::LenGoodNum10000s = 11; - -#ifdef _OFFLOAD_GPU -long CGenMathFFT1D::PlanLen; -long CGenMathFFT1D::dPlanLen; -long CGenMathFFT1D::HowMany; -long CGenMathFFT1D::dHowMany; -cufftHandle CGenMathFFT1D::Plan1DFFT_cu; -cufftHandle CGenMathFFT1D::dPlan1DFFT_cu; -#endif - -#ifdef _OFFLOAD_GPU -long CGenMathFFT2D::PlanNx; -long CGenMathFFT2D::PlanNy; -long CGenMathFFT2D::HowMany; -long CGenMathFFT2D::dPlanNx; -long CGenMathFFT2D::dPlanNy; -long CGenMathFFT2D::dHowMany; -cufftHandle CGenMathFFT2D::Plan2DFFT_cu; -cufftHandle CGenMathFFT2D::dPlan2DFFT_cu; -#endif -//************************************************************************* - -void CGenMathFFT::NextCorrectNumberForFFT(long& n) -//void CGenMathFFT::NextCorrectNumberForFFT(long long& n) //OC26042019 -{ - if(n < 4) - { - n = 4; return; - } - if(n < 100001) - { - long *pGoodPrev, *pGoodNext; - 
- long n_d_10000 = long(n*0.0001); - if(n_d_10000 > 0) pGoodPrev = GoodNumbers + GoodNum10000s[n_d_10000] - 1; - else - { - long n_d_1000 = long(n*0.001); - if(n_d_1000 > 0) pGoodPrev = GoodNumbers + GoodNum1000s[n_d_1000] - 1; - else - { - long n_d_100 = long(n*0.01); - if(n_d_100 > 0) pGoodPrev = GoodNumbers + GoodNum100s[n_d_100] - 1; - else pGoodPrev = GoodNumbers; - } - } - pGoodNext = pGoodPrev + 1; - for(;;) - { - if((n > *(pGoodPrev++)) && (n <= *pGoodNext)) - { - n = *pGoodNext; return; - } - pGoodNext++; - } - } - else - { - //OC23072020: sorted multiplies by ratios of power of first prime numbers bw 1 and 2 - const double arTestMults[] = {10./9., 9./8., 6./5., 5./4., 4./3., 3./2., 8./5., 5./3., 16./9., 15./8.}; - const int nTestMults = 10; - - //long k = 16384; - //long k = 65536; - long k = 99000; //OC23072020 (make sure this number is < 100001, and divides by 9,8,5) - - for(int j=0; j<100; j++) - { - //OC23072020 (added tests of intermed numbers obtained by multiplying k by a factor bw 1 and 2) - bool intermedNumFound = false; - for(int m=0; m= 0.5) kTest++; - if(n <= kTest) - { - n = kTest; - intermedNumFound = true; - break; - } - } - if(intermedNumFound) break; - - k <<= 1; - if(n <= k) - { - n = k; break; - } - } - } -} - -//************************************************************************* -//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) -//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG18072022 -int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 -{ -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, - { - //HG03082022 GPU can do an inplace fft without being given a temporary buffer - FFT1DInfo.pOutData = FFT1DInfo.pInData; - int result; - if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 - //if (result = 
Make1DFFT(FFT1DInfo, pGpuUsage)) return result; - }) - else -#endif - { - //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; - long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); - float* AuxDataCont = new float[TotAmOfPo]; - if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; - FFT1DInfo.pOutData = AuxDataCont; - - int result; - if(result = Make1DFFT(FFT1DInfo)) return result; - - float *tOut = FFT1DInfo.pInData, *t = AuxDataCont; - for(int ix=0; ix RelShiftTol*xStepNx); - NeedsShiftAfterY = (::fabs(y0_After) > RelShiftTol*yStepNy); - - double xStartTr = -0.5/FFT2DInfo.xStep; - double yStartTr = -0.5/FFT2DInfo.yStep; - - NeedsShiftBeforeX = NeedsShiftBeforeY = 0; - double x0_Before = 0., y0_Before = 0.; - if(FFT2DInfo.UseGivenStartTrValues) - { - x0_Before = (FFT2DInfo.xStartTr - xStartTr); // Sign should be probably reversed here: check!!! - y0_Before = (FFT2DInfo.yStartTr - yStartTr); // Sign should be probably reversed here: check!!! - - NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); - NeedsShiftBeforeY = (::fabs(y0_Before) > RelShiftTol*(::fabs(yStartTr))); - } - - //ArrayShiftX = 0; ArrayShiftY = 0; - m_ArrayShiftX = 0; m_ArrayShiftY = 0; //OC02022019 - m_dArrayShiftX = 0; m_dArrayShiftY = 0; - if (FFT2DInfo.pData != 0) - { - if (NeedsShiftBeforeX || NeedsShiftAfterX) - { - //ArrayShiftX = new float[Nx << 1]; - //if(ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - } - if (NeedsShiftBeforeY || NeedsShiftAfterY) - { - //ArrayShiftY = new float[Ny << 1]; - //if(ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; - m_ArrayShiftY = new float[Ny << 1]; - if (m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; - } - } - else if (FFT2DInfo.pdData != 0) - { - if (NeedsShiftBeforeX || NeedsShiftAfterX) - { - m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return 
MEMORY_ALLOCATION_FAILURE; - } - if (NeedsShiftBeforeY || NeedsShiftAfterY) - { - m_dArrayShiftY = new double[Ny << 1]; - if (m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; - } - } - -#ifdef _FFTW3 - fftwf_plan Plan2DFFT; - fftw_plan dPlan2DFFT; - fftwf_complex* DataToFFT = 0; - fftw_complex* dDataToFFT = 0; -#endif - -//HG18072022 -//#ifdef _DEBUG -// if (pGpuUsage != NULL) -// printf ("GPU: Make2DFFT\n"); -//#endif - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG02112021 - { - if(FFT2DInfo.pData != 0) - { - DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(float)); //OC06092023 - //DataToFFT = (fftwf_complex*)AuxGPU::ToDevice(pGpuUsage, FFT2DInfo.pData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(float)); - } - else if(FFT2DInfo.pdData != 0) - { - dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); //OC06092023 - //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT2DInfo.pdData, FFT2DInfo.Nx * FFT2DInfo.Ny * FFT2DInfo.howMany * 2 * sizeof(double)); - } - }) - else -#endif - { -#if _FFTW3 //OC28012019 - if (FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); - else if (FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 - -#else - fftwnd_plan Plan2DFFT; - FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT2DInfo.pData); -#endif - } - - char t0SignMult = (FFT2DInfo.Dir > 0)? 
-1 : 1; - - //if(NeedsShiftBeforeX) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep); - //if(NeedsShiftBeforeY) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep); - if(NeedsShiftBeforeX) - {//OC02022019 - if(m_ArrayShiftX != 0) - FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) - FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); - } - if(NeedsShiftBeforeY) - {//OC02022019 - if(m_ArrayShiftY != 0) - FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) - FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); - //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); - //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); -#endif - - if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, { //OC06092023 - //GPU_COND(pGpuUsage, { - TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { - m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); - //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); - //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, 
m_ArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); - TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_ArrayShiftX, m_ArrayShiftY); - m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); - //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - } - else if (dDataToFFT != 0) { - m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); - //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); - //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); - TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftBeforeX, NeedsShiftBeforeY, m_dArrayShiftX, m_dArrayShiftY); - m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); - //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - } - }) - else -#endif - { - 
if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 -#endif - } - } - - bool alreadyNormalized = false; //HG17032022 - //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; - double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 - if (FFT2DInfo.Dir > 0) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG02112021 - { - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) - { - if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 - //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) - { - if (Plan2DFFT_cu != NULL) - { - cufftDestroy(Plan2DFFT_cu); - Plan2DFFT_cu = NULL; - } - - PlanNx = Nx; - PlanNy = Ny; - HowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); - //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); - } - } - else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; - - auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); -// if (res != CUFFT_SUCCESS) -// printf("CUFFT Error: %d\r\n", res); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) - { - if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 - //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) - { - if (dPlan2DFFT_cu != NULL) - { - cufftDestroy(dPlan2DFFT_cu); - dPlan2DFFT_cu = NULL; - } - - dPlanNx = Nx; - dPlanNy = Ny; - HowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - 
cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); - //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); - } - } - else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; - - cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); - } - }) - else -#endif - { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#if _FFTW3 //OC28012019 - - for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) - { - long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - - fftwf_execute(Plan2DFFT); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; - - fftw_execute(dPlan2DFFT); - } - } - -#else - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); -#endif - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - if (DataToFFT != 0) - { - //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, 
FFT2DInfo.howMany, (float)Mult); //OC06092023 - } - else if (dDataToFFT != 0) - { - //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - } - alreadyNormalized = true; - }) - else -#endif - { - if (DataToFFT != 0) - { - RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - } - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) - { - RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - } -#endif - } - } - else - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) { - if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 - //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) - { - if (Plan2DFFT_cu != NULL){ - cufftDestroy(Plan2DFFT_cu); - Plan2DFFT_cu = NULL; - } - - PlanNx = Nx; - PlanNy = Ny; - HowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); - //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); - } - } - else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); - cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) { - if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && 
dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 - //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) - { - if (dPlan2DFFT_cu != NULL){ - cufftDestroy(dPlan2DFFT_cu); - dPlan2DFFT_cu = NULL; - } - - dPlanNx = Nx; - dPlanNy = Ny; - dHowMany = FFT2DInfo.howMany; - int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; - cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); - //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); - } - } - else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); - cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_INVERSE); - } - }) - else -#endif - { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) - { - long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - RepairSignAfter2DFFT(DataToFFT, FFT2DInfo.howMany); - fftwf_execute(Plan2DFFT); - } - else if (dDataToFFT != 0) - { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - RepairSignAfter2DFFT(dDataToFFT, FFT2DInfo.howMany); - fftw_execute(dPlan2DFFT); - } - } -#else 
- if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(DataToFFT); - RepairSignAfter2DFFT(DataToFFT); - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); -#endif - } - } - - if (!alreadyNormalized){ -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - if (DataToFFT != 0) - NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - else if (dDataToFFT != 0) - NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - }) - else -#endif - { - if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult, FFT2DInfo.howMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult, FFT2DInfo.howMany); -#endif - } - } - - //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); - //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); - - if (NeedsShiftAfterX) - {//OC02022019 - if (m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); - } - if (NeedsShiftAfterY) - {//OC02022019 - if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); - else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); - } - if (NeedsShiftAfterX || NeedsShiftAfterY) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG18072022 - { - TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { - m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * 
sizeof(float), false); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); - //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); - //m_ArrayShiftY = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); - TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); - m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 - m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); - //m_ArrayShiftY = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); - } - else if (dDataToFFT != 0) { - m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); - CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); - //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); - //m_dArrayShiftY = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); - //AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); - TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, 
m_dArrayShiftX, m_dArrayShiftY); - m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 - m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); - //m_dArrayShiftY = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); - } - }) - else -#endif - { - if (DataToFFT != 0) TreatShifts(DataToFFT, FFT2DInfo.howMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT, FFT2DInfo.howMany); //OC02022019 -#endif - } - } - - //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") - //fftwnd_destroy_plan(Plan2DFFT); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG02112021 - { - if (FFT2DInfo.pData != 0) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, DataToFFT, true, false); - } - else if (FFT2DInfo.pdData != 0) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, dDataToFFT, true, false); - } - }) - else -#endif - { -#if _FFTW3 //OC28012019 - if (DataToFFT != 0) - { - if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); - } - else if (dDataToFFT != 0) //OC03022019 - { - if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); - } -#else - if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); -#endif - } - - //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} - //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} - if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} - if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} - if 
(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 - if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} - - return 0; -} - -//************************************************************************* -//Forward FFT: Int f(x)*exp(-i*2*Pi*qx*x)dx -//Backward FFT: Int f(qx)*exp(i*2*Pi*qx*x)dqx -//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) -//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG20012022 -int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC05092023 -{// Assumes Nx, Ny even ! - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //double start; - //get_walltime (&start); - - const double RelShiftTol = 1.E-06; - - SetupLimitsTr(FFT1DInfo); - - double xStepNx = FFT1DInfo.Nx*FFT1DInfo.xStep; - double x0_After = FFT1DInfo.xStart + 0.5*xStepNx; - NeedsShiftAfterX = FFT1DInfo.ApplyAutoShiftAfter && (::fabs(x0_After) > RelShiftTol*xStepNx); - - double xStartTr = -0.5/FFT1DInfo.xStep; - - NeedsShiftBeforeX = 0; - double x0_Before = 0.; - - if(FFT1DInfo.UseGivenStartTrValue) - { - x0_Before = (FFT1DInfo.xStartTr - xStartTr); - NeedsShiftBeforeX = (::fabs(x0_Before) > RelShiftTol*(::fabs(xStartTr))); - } - - m_ArrayShiftX = 0; - m_dArrayShiftX = 0; - if (NeedsShiftBeforeX || NeedsShiftAfterX) - { - if (FFT1DInfo.pInData != 0) - { - m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - -#ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) 
- m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); - //m_ArrayShiftX = (float*)AuxGpu::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 -#endif - } - else if (FFT1DInfo.pdInData != 0) - { - m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; - -#ifdef _OFFLOAD_GPU //OC05092023 - m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); - //m_dArrayShiftX = (double*)AuxGpu::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //HG20012022 -#endif - } - } - -#ifdef _FFTW3 //OC28012019 - fftwf_plan Plan1DFFT; - fftwf_complex* DataToFFT = 0, * OutDataFFT = 0; //, *pOutDataFFT=0; - - fftw_plan dPlan1DFFT; - fftw_complex* dDataToFFT = 0, * dOutDataFFT = 0; //, *pdOutDataFFT=0; -#endif - -//HG20012022 -//#ifdef _DEBUG -// if (pGpuUsage != NULL) -// printf ("GPU: Make1DFFT\n"); -//#endif -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, //OC06092023 - //GPU_COND(pGpuUsage, //HG20012022 - { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 - OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); - //DataToFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); - //OutDataFFT = (fftwf_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); - } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) - { - dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * 
FFT1DInfo.HowMany * 2 * sizeof(double)); //OC06092023 - dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); - //dDataToFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); - //dOutDataFFT = (fftw_complex*)AuxGpu::ToDevice(pGpuUsage, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); - } - }) - else -#endif - { -#ifdef _FFTW3 //OC28012019 - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); - OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); - //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call - } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) - { - dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); - dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); - //pdOutDataFFT = dOutDataFFT; - } -#else - fftw_plan Plan1DFFT; - FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); - FFTW_COMPLEX* OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); - FFTW_COMPLEX* pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call - /** - Pointed-out by Sergey Yakubov (E-XFEL). - From FFTW 2.1.5 docs: - void fftw(fftw_plan plan, int howmany, - fftw_complex *in, int istride, int idist, - fftw_complex *out, int ostride, int odist); - ... - out, ostride and odist describe the output array(s). The format is the same as for the input array. - In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. - If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, - that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. 
- In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). - **/ -#endif - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); - //if (DataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); - //else if (dDataToFFT != 0) AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); -#endif - - char t0SignMult = (FFT1DInfo.Dir > 0) ? -1 : 1; - if (NeedsShiftBeforeX) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); - - if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); - }) - else -#endif - { - //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); - - if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); - -#ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); -#endif - } - } - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : before fft",&start); - - int flags = FFTW_ESTIMATE; //OC30012019 - bool alreadyNormalized = false; //HG17032022 - //double Mult = FFT1DInfo.xStep; - double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; - - if (FFT1DInfo.Dir > 0) //HG17112021 - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, - { - int arN[] = { (int)Nx }; //OC14052020 - if (DataToFFT != 0) - { - if (PlanLen != Nx) { - PlanLen = Nx; - if (Plan1DFFT_cu != NULL) - { - cufftDestroy(Plan1DFFT_cu); - Plan1DFFT_cu = NULL; - } - cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); - } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; - cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); - } - else if (dDataToFFT != 0) //OC02022019 - { - if (dPlanLen != Nx) { - if (dPlan1DFFT_cu != NULL) - { - cufftDestroy(dPlan1DFFT_cu); - dPlan1DFFT_cu = NULL; - } - dPlanLen = Nx; - cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); - } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; - cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); - } - }) - else -#endif - { - //int flags = FFTW_ESTIMATE; -#ifdef _FFTW3 //OC28012019 -#ifdef _WITH_OMP - //Still needs to be tested! 
- if (DataToFFT != 0) - { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if (dDataToFFT != 0) //OC02022019 - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } -#endif //ifndef _WITH_OMP - int arN[] = { (int)Nx }; //OC14052020 - //int arN[] = {Nx}; - if (DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if (dDataToFFT != 0) //OC02022019 - { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; - fftw_execute(dPlan1DFFT); - } - -#else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); - -#ifndef _WITH_OMP //OC27102018 - //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 -#else //OC27102018 - //SY: split one call into many (for OpenMP) -#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for (int i = 0; i < FFT1DInfo.HowMany; i++) - { - //SY: do not use OutDataFFT as scratch space if in-place - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); - else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); - } -#endif -#endif - } - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft dir>0",&start); - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - if (OutDataFFT != 0) - { - RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 - //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); - //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); - } - else if (dOutDataFFT != 0) - { - RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); - //RotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); - } - alreadyNormalized = true; - }) - else -#endif - { - if (OutDataFFT != 0) - { - RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - } -#ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) - { - RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(dOutDataFFT, 
FFT1DInfo.HowMany); - } -#endif - } - } - else - { - //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - int arN[] = { (int)Nx }; //OC14052020 - //int arN[] = {Nx}; - if (DataToFFT != 0) - { - if (PlanLen != Nx) { - PlanLen = Nx; - HowMany = FFT1DInfo.HowMany; - if (Plan1DFFT_cu != NULL) - { - cufftDestroy(Plan1DFFT_cu); - Plan1DFFT_cu = NULL; - } - cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); - } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); - RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); - cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); - } - else if (dDataToFFT != 0) //OC02022019 - { - if (dPlanLen != Nx) - { - dPlanLen = Nx; - dHowMany = FFT1DInfo.HowMany; - if (dPlan1DFFT_cu != NULL) - { - cufftDestroy(dPlan1DFFT_cu); - dPlan1DFFT_cu = NULL; - } - cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); - } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; - - RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); - RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); - cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_INVERSE); - } - }) - else -#endif - { -#ifdef _FFTW3 //OC28012019 -#ifdef _WITH_OMP - - //Still needs to be tested! 
- if (DataToFFT != 0) - { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if (dDataToFFT != 0) - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } - -#endif - int arN[] = { (int)Nx }; //OC14052020 - //int arN[] = {Nx}; - if (DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - - fftwf_execute(Plan1DFFT); - } - else if (dDataToFFT != 0) //OC02022019 - { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); - fftw_execute(dPlan1DFFT); - } -#else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); - - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); - - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : repair dir<0",&start); - -#ifndef _WITH_OMP //OC27102018 - //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 -#else //OC27102018 - //SY: split one call into many (for OpenMP) -#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for (int i = 0; i < FFT1DInfo.HowMany; i++) - { - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); - else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); - } -#endif -#endif //_FFTW3 - } - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft dir<0",&start); - } - - if (!alreadyNormalized) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, - { - if (OutDataFFT != 0) { - NormalizeDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - } - else if (dOutDataFFT != 0) - NormalizeDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); - }) - else -#endif - { - if (OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); -#ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); -#endif - } - } - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : NormalizeDataAfter1DFFT",&start); - - if (NeedsShiftAfterX) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, - { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); - - if (OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); - }) - else -#endif - { - //FillArrayShift(t0SignMult*x0_After, FFT1DInfo.xStepTr); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); - - if (OutDataFFT != 0) TreatShift(OutDataFFT, FFT1DInfo.HowMany); -#ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); -#endif - } - } - - if(FFT1DInfo.TreatSharpEdges) - { - int result = ProcessSharpEdges(FFT1DInfo); - if(result) return result; - } - -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - GPU_COND(pvGPU, - //GPU_COND(pGpuUsage, //HG20012022 - { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, OutDataFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, OutDataFFT, true, false); - } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) - { - CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dOutDataFFT, true, false); //OC06092023 - //AuxGpu::MarkUpdated(pGpuUsage, dOutDataFFT, true, false); - } - }) - else -#endif - { - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : ProcessSharpEdges",&start); - - //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") - //OC27102018: thread safety issue? -#ifdef _FFTW3 //OC29012019 - - if(DataToFFT != 0) fftwf_destroy_plan(Plan1DFFT); - else if(dDataToFFT != 0) fftw_destroy_plan(dPlan1DFFT); - -#ifdef _WITH_OMP - - if(DataToFFT != 0) fftwf_cleanup_threads(); //?? - else if(dDataToFFT != 0) fftw_cleanup_threads(); - -#endif -#else //ifndef _FFTW3 - - fftw_destroy_plan(Plan1DFFT); - -#endif - } - - if (m_ArrayShiftX != 0) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 - //m_ArrayShiftX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); -#endif - delete[] m_ArrayShiftX; - } - if (m_dArrayShiftX != 0) - { -#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 - //m_dArrayShiftX = (double*)AuxGpu::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); -#endif - delete[] m_dArrayShiftX; - } - - //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : after fft ",&start); - return 0; -} - -//************************************************************************* - -int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr, char dataType) -//int CGenMathFFT1D::SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxDataForSharpEdgeCorr) -{ - double Step = FFT1DInfo.xStep, Start = FFT1DInfo.xStart; - double AbsTol = 0.05*Step; - - double EdgeMinOffsetFromStart = FFT1DInfo.LeftSharpEdge - Start; - long iEdgeMinLower = long(EdgeMinOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! - double EdgeMinLowerMisfit = EdgeMinOffsetFromStart - iEdgeMinLower*Step; - - double EdgeMaxOffsetFromStart = FFT1DInfo.RightSharpEdge - Start; - long iEdgeMaxLower = long(EdgeMaxOffsetFromStart/Step + 1.E-04); // Steer: threr was a bug at 1.E-08 and less! 
- double EdgeMaxLowerMisfit = EdgeMaxOffsetFromStart - iEdgeMaxLower*Step; - - char EdgeMinIsBetweenMeshPoints = (EdgeMinLowerMisfit > AbsTol); - char EdgeMaxIsBetweenMeshPoints = (EdgeMaxLowerMisfit > AbsTol); - char EdgeMaxIsSmallerThanDataEnd = (::fabs((Start + FFT1DInfo.Nx*Step) - FFT1DInfo.RightSharpEdge) > AbsTol); - char EdgeCorrNeeded = (EdgeMinIsBetweenMeshPoints || EdgeMaxIsBetweenMeshPoints || EdgeMaxIsSmallerThanDataEnd); - - //float dSt = 0.; - //if(EdgeMinIsBetweenMeshPoints) dSt = (float)(Step - EdgeMinLowerMisfit); - //float dFi = 0.; - //if(EdgeMaxIsBetweenMeshPoints) dFi = (float)(Step - EdgeMaxLowerMisfit); - //else if(EdgeMaxIsSmallerThanDataEnd) dFi = (float)(0.5*Step); - - //OC02022019 - double dSt = 0.; - if(EdgeMinIsBetweenMeshPoints) dSt = Step - EdgeMinLowerMisfit; - double dFi = 0.; - if(EdgeMaxIsBetweenMeshPoints) dFi = Step - EdgeMaxLowerMisfit; - else if(EdgeMaxIsSmallerThanDataEnd) dFi = 0.5*Step; - - CGenMathFFT1DInfo FFT1DInfoLoc = FFT1DInfo; - FFT1DInfoLoc.UseGivenStartTrValue = 0; - CGenMathFFT1D FFT1D; - FFT1D.SetupLimitsTr(FFT1DInfoLoc); - - if(EdgeCorrNeeded) - { - AuxDataForSharpEdgeCorr.d = Step; - long TwoN = FFT1DInfo.Nx << 1; - - if(dSt != 0.) - { - if(dataType == 'f') - { - AuxDataForSharpEdgeCorr.ExpArrSt = new float[TwoN]; - if(AuxDataForSharpEdgeCorr.ExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; - } - else if(dataType == 'd') //OC02022019 - { - AuxDataForSharpEdgeCorr.dExpArrSt = new double[TwoN]; - if(AuxDataForSharpEdgeCorr.dExpArrSt == 0) return MEMORY_ALLOCATION_FAILURE; - } - - AuxDataForSharpEdgeCorr.dSt = dSt; - long jSt = iEdgeMinLower + 1; - AuxDataForSharpEdgeCorr.iSt = jSt; - - double ArgjSt = Start + jSt*Step; - SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrSt, FFT1DInfoLoc.Nx, ArgjSt, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); - } - if(dFi != 0.) 
- { - if(dataType == 'f') - { - AuxDataForSharpEdgeCorr.ExpArrFi = new float[TwoN]; - if(AuxDataForSharpEdgeCorr.ExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; - } - else if(dataType == 'd') - { - AuxDataForSharpEdgeCorr.dExpArrFi = new double[TwoN]; - if(AuxDataForSharpEdgeCorr.dExpArrFi == 0) return MEMORY_ALLOCATION_FAILURE; - } - - AuxDataForSharpEdgeCorr.dFi = dFi; - double ArgjFi = Start + iEdgeMaxLower*Step; - AuxDataForSharpEdgeCorr.iFi = iEdgeMaxLower; - - SetupSharpEdgeExpCorrArray(AuxDataForSharpEdgeCorr.ExpArrFi, FFT1DInfoLoc.Nx, ArgjFi, FFT1DInfoLoc.xStartTr, FFT1DInfoLoc.xStepTr); - } - AuxDataForSharpEdgeCorr.WasSetUp = 1; - } - return 0; -} - -//************************************************************************* - -void CGenMathFFT1D::MakeSharpEdgeCorr(CGenMathFFT1DInfo& FFT1DInfo, CGenMathAuxDataForSharpEdgeCorr1D& AuxData) -{ - double fSRe, fSIm, fFRe, fFIm; - double ExpStRe, ExpStIm, ExpFiRe, ExpFiIm, Re, Im; - long Two_i, Two_i_p_1; - - if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) - { - float *t = FFT1DInfo.pOutData; - float *tSt = FFT1DInfo.pInData + (AuxData.iSt << 1); - float *tFi = FFT1DInfo.pInData + (AuxData.iFi << 1); - fSRe = *tSt, fSIm = *(tSt + 1); - fFRe = *tFi, fFIm = *(tFi + 1); - - for(long i=0; i -#include - -#ifndef _GM_WITHOUT_BASE -#include "gmobj.h" -#endif - -#ifdef _WITH_OMP //OC31102018: Pre-processor definition for compiling SRW with OpenMP library -#include "omp.h" -#endif - -#ifndef MEMORY_ALLOCATION_FAILURE -#define MEMORY_ALLOCATION_FAILURE 8 + 10000 //in line with SRW -#endif -#ifndef ERROR_IN_FFT -#define ERROR_IN_FFT 40 + 10000 -#endif - -//************************************************************************* - -class CGenMathFFT //{ -#ifndef _GM_WITHOUT_BASE - : public CGenMathObj -#endif -{//OC01052013 - double a2c, a4c, a6c, a8c, a10c, a12c; - double a3s, a5s, a7s, a9s, a11s, a13s; - -protected: - - static long GoodNumbers[]; - static long LenGoodNumbers; - static long GoodNum100s[]; - 
static long LenGoodNum100s; - static long GoodNum1000s[]; - static long LenGoodNum1000s; - static long GoodNum10000s[]; - static long LenGoodNum10000s; - -public: - - double HalfPI, PI, TwoPI, ThreePIdTwo, One_dTwoPI; // Constants - - CGenMathFFT() - { - HalfPI = 1.5707963267949; - PI = 3.141592653590; - TwoPI = 6.2831853071796; - ThreePIdTwo = 4.7123889803847; - One_dTwoPI = 0.1591549430919; - a2c = -0.5; a4c = 0.041666666666667; a6c = -0.0013888888888889; a8c = 0.000024801587301587; a10c = -2.755731922E-07; - a3s = -0.16666666666667; a5s = 0.0083333333333333; a7s = -0.0001984126984127; a9s = 2.755731922E-06; a11s = -2.505210839E-08; - } - - void CosAndSin(double x, float& Cos, float& Sin) - { - x -= TwoPI*int(x*One_dTwoPI); - if(x < 0.) x += TwoPI; - - char ChangeSign=0; - if(x > ThreePIdTwo) x -= TwoPI; - else if(x > HalfPI) { x -= PI; ChangeSign = 1;} - - double xe2 = x*x; - Cos = float(1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c))))); - Sin = float(x*(1. + xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s)))))); - if(ChangeSign) { Cos = -Cos; Sin = -Sin;} - } - void CosAndSin(double x, double& Cos, double& Sin) //OC02022019 - { - //x -= TwoPI*int(x*One_dTwoPI); - x -= TwoPI*((long long)(x*One_dTwoPI)); - - if(x < 0.) x += TwoPI; - - char ChangeSign=0; - if(x > ThreePIdTwo) x -= TwoPI; - else if(x > HalfPI) { x -= PI; ChangeSign = 1;} - - double xe2 = x*x; - Cos = 1. + xe2*(a2c + xe2*(a4c + xe2*(a6c + xe2*(a8c + xe2*a10c)))); - Sin = x*(1. 
+ xe2*(a3s + xe2*(a5s + xe2*(a7s + xe2*(a9s + xe2*a11s))))); - if(ChangeSign) { Cos = -Cos; Sin = -Sin;} - } - - //void NextCorrectNumberForFFT(long long&); //OC26042019 - void NextCorrectNumberForFFT(long&); -}; - -//************************************************************************* - -struct CGenMathFFT2DInfo { - float* pData; - double* pdData; //OC31012019 - - char Dir; // >0: forward; <0: backward - double xStep, yStep, xStart, yStart; - double xStepTr, yStepTr, xStartTr, yStartTr; - long Nx, Ny; - //long long Nx, Ny; - - long howMany; //OC151014 - long iStride, iDist; //OC151014 - //From FFTW 2.1.5 Tutorial - //iStride and iDist describe the input array(s). - //There are howMany multi-dimensional input arrays; the first one is pointed to by in (= pData), - //the second one is pointed to by in + iDist, and so on, up to in + (howMany - 1) * iDist. - //Each multi-dimensional input array consists of complex numbers (see Section Data Types), - //stored in row-major format (see Section Multi-dimensional Array Format), which are not necessarily contiguous in memory. - //Specifically, in[0] is the first element of the first array, in[istride] is the second element of the first array, and so on. - //In general, the i-th element of the j-th input array will be in position in[i * istride + j * idist]. - //Note that, here, i refers to an index into the row-major format for the multi-dimensional array, rather than an index in any particular dimension. - //In-place transforms: For plans created with the FFTW_IN_PLACE option, the transform is computed in-place--the output is returned in the in array, - //using the same strides, etcetera, as were used in the input. 
- - char UseGivenStartTrValues; - double ExtraMult; //OC20112017 - - CGenMathFFT2DInfo() - { - howMany = 1; iStride = 1; iDist = 0; //OC151014 - UseGivenStartTrValues = 0; - ExtraMult = 1.; //OC20112017 - - pData = 0; //OC31012019 - pdData = 0; - } -}; - -//************************************************************************* - -class CGenMathFFT2D : public CGenMathFFT { - - long Nx, Ny; - long HalfNx, HalfNy; - //long long Nx, Ny; - //long long HalfNx, HalfNy; - char NeedsShiftBeforeX, NeedsShiftBeforeY, NeedsShiftAfterX, NeedsShiftAfterY; - //float *ArrayShiftX, *ArrayShiftY; - float *m_ArrayShiftX, *m_ArrayShiftY; //OC02022019 - double *m_dArrayShiftX, *m_dArrayShiftY; - -#ifdef _OFFLOAD_GPU - static long PlanNx, PlanNy, HowMany; - static long dPlanNx, dPlanNy, dHowMany; - static cufftHandle Plan2DFFT_cu; - static cufftHandle dPlan2DFFT_cu; -#endif - -public: - CGenMathFFT2D() - { - NeedsShiftBeforeX = NeedsShiftBeforeY = NeedsShiftAfterX = NeedsShiftAfterY = 0; -#ifdef _OFFLOAD_GPU - HowMany = PlanNx = PlanNy = dHowMany = dPlanNx = dPlanNy = 0; - Plan2DFFT_cu = dPlan2DFFT_cu = 0; -#endif - } - - //int Make2DFFT(CGenMathFFT2DInfo&); - //Modification by S.Yakubov for parallelizing SRW via OpenMP: -#ifdef _FFTW3 //28012019 - int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, void* pvGPU = 0); //OC05092023 - //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0, fftw_plan* pdPrecreatedPlan2DFFT=0, gpuUsageArg *pGpuUsage = 0); //OC02022019 - //int Make2DFFT(CGenMathFFT2DInfo&, fftwf_plan* pPrecreatedPlan2DFFT=0); -#else - int Make2DFFT(CGenMathFFT2DInfo&, fftwnd_plan* pPrecreatedPlan2DFFT=0); //OC27102018 -#endif - - int AuxDebug_TestFFT_Plans(); - - void SetupLimitsTr(CGenMathFFT2DInfo& FFT2DInfo) - {// Modify this if Make2DFFT is modified ! 
- Nx = FFT2DInfo.Nx; Ny = FFT2DInfo.Ny; - HalfNx = (Nx >> 1); HalfNy = (Ny >> 1); - - double xStartTr = -0.5/FFT2DInfo.xStep; - FFT2DInfo.xStepTr = -xStartTr/HalfNx; - - double yStartTr = -0.5/FFT2DInfo.yStep; - FFT2DInfo.yStepTr = -yStartTr/HalfNy; - - if(!FFT2DInfo.UseGivenStartTrValues) - { - FFT2DInfo.xStartTr = xStartTr; - FFT2DInfo.yStartTr = yStartTr; - } - } - - template void FillArrayShift(char x_or_y, double t0, double tStep, T* arShift) //OC02022019 - //void FillArrayShift(char x_or_y, double t0, double tStep) - { - T* tArrayShift = arShift; - //float* tArrayShift; - //long N; - long N = (x_or_y == 'x')? Nx : Ny; - //if(x_or_y == 'x') { tArrayShift = m_ArrayShiftX; N = Nx;} - //else { tArrayShift = m_ArrayShiftY; N = Ny;} - - T *tp = tArrayShift + N; - //float *tp = tArrayShift + N; - *tp = 1.; *(tp+1) = 0.; tp += 2; - T *tm = tp - 4; - //float *tm = tp - 4; - - double t0TwoPI = t0*TwoPI; - double q = tStep; - long HalfN = N >> 1; - for(int i=0; i void RotateDataAfter2DFFT(T* pAfterFFT, long HowMany) - //void RotateDataAfter2DFFT(fftwf_complex* pAfterFFT) - {// Assumes Nx, Ny even ! - //OC281117: Make it work for odd Nx, Ny as well! - //OC281117: Consider combining RotateDataAfter2DFFT, RepairSignAfter2DFFT, NormalizeDataAfter2DFFT - //long HalfNyNx = HalfNy*Nx; - long long HalfNyNx = ((long long)HalfNy)*((long long)Nx); - - for(long iHowMany=0; iHowManyre *= s; (t++)->im *= s; s = -s; - } - sy0 = -sy0; - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - void NormalizeDataAfter2DFFT(fftwf_complex* pAfterFFT, double Mult, long HowMany) - {// Assumes Nx, Ny even ! - //OC281117: To make it work for odd Nx, Ny as well in the future! 
- float fMult = (float)Mult; - long long NxNy = ((long long)Nx)*((long long)Ny); - for(long iHowMany=0; iHowManyre *= (FFTW_REAL)Mult; (t++)->im *= (FFTW_REAL)Mult; - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - void TreatShifts(fftwf_complex* pData, long HowMany) - { - fftwf_complex *t = pData; - char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; - char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; - - for(long iHowMany=0; iHowManyre*MultRe - t->im*MultIm; -// float NewIm = t->re*MultIm + t->im*MultRe; -// t->re = NewRe; -// (t++)->im = NewIm; -// #endif - } - } - } - } -#else - void TreatShifts(FFTW_COMPLEX* pData) - { - FFTW_COMPLEX *t = pData; - char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; - char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; - - float *tShiftY = m_ArrayShiftY; - float MultY_Re = 1., MultY_Im = 0., MultX_Re = 1., MultX_Im = 0.; - float MultRe, MultIm; - - for(long iy=0; iyre*MultRe - t->im*MultIm; - float NewIm = t->re*MultIm + t->im*MultRe; - t->re = NewRe; - (t++)->im = NewIm; - } - } - } -#endif -#ifdef _FFTW3 //OC02022019 - void TreatShifts(fftw_complex* pData, long HowMany) - { - fftw_complex *t = pData; - char NeedsShiftX = NeedsShiftBeforeX || NeedsShiftAfterX; - char NeedsShiftY = NeedsShiftBeforeY || NeedsShiftAfterY; - - for(long iHowMany=0; iHowMany0: forward; <0: backward - double xStep, xStart; - double xStepTr, xStartTr; - long Nx; - //long long Nx; - long HowMany; - //long long HowMany; - char UseGivenStartTrValue; - double MultExtra; - - char TreatSharpEdges; - double LeftSharpEdge, RightSharpEdge; - char ApplyAutoShiftAfter; - - CGenMathFFT1DInfo() - { - HowMany = 1; UseGivenStartTrValue = 0; - TreatSharpEdges = 0; - MultExtra = 1.; - ApplyAutoShiftAfter = 1; - - pInData = 0; //OC31012019 - pOutData = 0; - pdInData = 0; - pdOutData = 0; - } -}; - -//************************************************************************* - -struct CGenMathAuxDataForSharpEdgeCorr1D { - - float *ExpArrSt, 
*ExpArrFi; - double *dExpArrSt, *dExpArrFi; - - double dSt, dFi, d; - long iSt, iFi; - - char WasSetUp; - - CGenMathAuxDataForSharpEdgeCorr1D() - { - Initialize(); - } - - void Initialize() - { - ExpArrSt = ExpArrFi = 0; - dExpArrSt = dExpArrFi = 0; - - dSt = dFi = d = 0.; - iSt = iFi = 0; - WasSetUp = 0; - } - - void Dispose() - { - if(ExpArrSt != 0) delete[] ExpArrSt; - if(ExpArrFi != 0) delete[] ExpArrFi; - - if(dExpArrSt != 0) delete[] dExpArrSt; - if(dExpArrFi != 0) delete[] dExpArrFi; - - Initialize(); - } -}; - -//************************************************************************* - -class CGenMathFFT1D : public CGenMathFFT { - - long Nx; - long HalfNx; - //long long Nx; - //long long HalfNx; - char NeedsShiftBeforeX, NeedsShiftAfterX; - float *m_ArrayShiftX; - double *m_dArrayShiftX; //OC02022019 -#ifdef _OFFLOAD_GPU - static long PlanLen, HowMany; - static long dPlanLen, dHowMany; - static cufftHandle Plan1DFFT_cu; - static cufftHandle dPlan1DFFT_cu; -#endif - -public: - CGenMathFFT1D() - { - NeedsShiftBeforeX = NeedsShiftAfterX = 0; -#ifdef _OFFLOAD_GPU - PlanLen = dPlanLen = 0; - Plan1DFFT_cu = dPlan1DFFT_cu = 0; - HowMany = dHowMany = 0; -#endif - } - - int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 - int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 - -//#ifndef _OFFLOAD_GPU //OC05092023 -// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo); -// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo); -//#else -// int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); -// int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, TGPUUsageArg* pGPU=0); -// //int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); //HG -// //int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage=0); -//#endif - - void SetupLimitsTr(CGenMathFFT1DInfo& FFT1DInfo) - { // Modify this if Make1DFFT is modified ! 
- Nx = FFT1DInfo.Nx; - HalfNx = (Nx >> 1); - - double xStartTr = -0.5/FFT1DInfo.xStep; - FFT1DInfo.xStepTr = -xStartTr/HalfNx; - - if(!FFT1DInfo.UseGivenStartTrValue) - { - FFT1DInfo.xStartTr = xStartTr; - } - } - - template void FillArrayShift(double t0, double tStep, T* arShiftX) //OC02022019 - //void FillArrayShift(double t0, double tStep) - { - //float *tArrayShift = m_ArrayShiftX; - T *tArrayShift = arShiftX; //OC02022019 - long N = Nx; - - //float *tp = tArrayShift + N; - T *tp = tArrayShift + N; //OC02022019 - *tp = 1.; *(tp+1) = 0.; tp += 2; - //float *tm = tp - 4; - T *tm = tp - 4; - - double t0TwoPI = t0*TwoPI; - double q = tStep; - long HalfN = N >> 1; - - for(int i=0; ire*MultX_Re - tMany->im*MultX_Im; - float NewIm = tMany->re*MultX_Im + tMany->im*MultX_Re; - tMany->re = NewRe; tMany->im = NewIm; - tMany += Nx; - } - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - template void RepairSignAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 - //void RepairSignAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) - {// Assumes Nx even ! - to be improved - //OC27102018 - //SY: optimized, adopt for OpenMP -#ifdef _WITH_OMP - #pragma omp parallel for -#endif - for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; - // tMany += Nx; - // } - // } - // t++; s = -s; - //} - //OC27102018 - //SY: optimized, adopt for OpenMP -#ifdef _WITH_OMP - #pragma omp parallel for -#endif - for(long ix=1; ixre = -tMany->re; tMany->im = -tMany->im; - tMany += Nx; - } - } - } -#endif - -#ifdef _FFTW3 //OC29012019 - template void RotateDataAfter1DFFT(T* pAfterFFT, long HowMany) //OC02022019 - //void RotateDataAfter1DFFT(fftwf_complex* pAfterFFT, long HowMany) - {// Assumes Nx even ! 
-#ifndef _WITH_OMP //OC27102018 - //fftwf_complex *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx; - //fftwf_complex Buf; - T *t1 = pAfterFFT, *t2 = pAfterFFT + HalfNx, Buf; - for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; - tMany += Nx; - } - } -#else //OC27102018 - //SY: adopted for OpenMP - #pragma omp parallel for - for(long ix=0; ixre *= (FFTW_REAL)Mult; tMany->im *= (FFTW_REAL)Mult; - tMany += Nx; - } - } -#endif - } -#endif - - int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&, char dataType='f'); //OC02022019 - //int SetupAuxDataForSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); - void MakeSharpEdgeCorr(CGenMathFFT1DInfo&, CGenMathAuxDataForSharpEdgeCorr1D&); - - template void SetupSharpEdgeExpCorrArray(T* pCmpData, long AmOfPt, double x, double qStart, double qStep) //OC02022019 - //void SetupSharpEdgeExpCorrArray(float* pCmpData, long AmOfPt, double x, double qStart, double qStep) - { - const double TwoPi = 6.28318530717959; - double TwoPiX = TwoPi*x; - double q = qStart; - //float *tCmpData = pCmpData; - T *tCmpData = pCmpData; - for(long i=0; iPres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 //consider programming Angle on angular side by simple change of limits //however note potential problems for many photon energies! 
diff --git a/cpp/src/core/sroptapt.h b/cpp/src/core/sroptapt.h index 7d1032dc..e5f22dac 100644 --- a/cpp/src/core/sroptapt.h +++ b/cpp/src/core/sroptapt.h @@ -33,11 +33,13 @@ class srTAperture : public srTShapedOptElem { srTAperture () {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU) //HG30112023 { char &MethNo = ParPrecWfrPropag.MethNo; - if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG30112023 else if(MethNo == 1) return PropagateRadiationMeth_1(pRadAccessData); //else if(MethNo == 2) return PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); else if(MethNo == 2) return PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); @@ -47,11 +49,14 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf = 0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int 
PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG30112023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 if(result = PropagateRadMoments(pRadAccessData, 0)) return result; SetNewNonZeroWfrLimits(pRadAccessData); @@ -76,11 +81,14 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 SetNewNonZeroWfrLimits(pRadAccessData); return 0; diff --git a/cpp/src/core/sroptcnt.cpp b/cpp/src/core/sroptcnt.cpp index a8bb4278..9b6072b2 100644 --- a/cpp/src/core/sroptcnt.cpp +++ b/cpp/src/core/sroptcnt.cpp @@ -251,7 +251,8 @@ int 
srTCompositeOptElem::PropagateRadiationTest(srTSRWRadStructAccessData* pInRa //************************************************************************* -int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +//int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pvGPU) //HG30112023 //int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr) { //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -265,6 +266,9 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr int res = 0, elemCount = 0; bool propIntIsNeeded = (nInt != 0) && (arID != 0) && (arI != 0); //OC27082018 +#ifdef _OFFLOAD_GPU //HG30112023 + bool dataOnDevice = false; +#endif for(srTGenOptElemHndlList::iterator it = GenOptElemList.begin(); it != GenOptElemList.end(); ++it) { @@ -308,7 +312,16 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr if((::fabs(curPropResizeInst.pxd - 1.) > tolRes) || (::fabs(curPropResizeInst.pxm - 1.) > tolRes) || //(::fabs(curPropResizeInst.pzd - 1.) > tolRes) || (::fabs(curPropResizeInst.pzm - 1.) > tolRes)) (::fabs(curPropResizeInst.pzd - 1.) > tolRes) || (::fabs(curPropResizeInst.pzm - 1.) > tolRes) || (curPropResizeInst.ShiftTypeBeforeRes > 0)) //OC11072019 - if(res = RadResizeGen(wfr, curPropResizeInst)) return res; + { + //if(res = RadResizeGen(wfr, curPropResizeInst)) return res; + if(res = RadResizeGen(wfr, curPropResizeInst, pvGPU)) return res; //HG30112023 + +#ifdef _OFFLOAD_GPU //HG30112023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) { + dataOnDevice = true; + } +#endif + } //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: RadResizeGen",&start); @@ -325,14 +338,55 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: precParWfrPropag",&start); +#ifdef _OFFLOAD_GPU //HG30112023 + TGPUUsageArg* pGPU = (TGPUUsageArg*)pvGPU; + if (CAuxGPU::GPUEnabled(pGPU)) { + if (dataOnDevice && (((srTGenOptElem*)it->rep)->SupportedFeatures() & 1) == 0) + { +//#if DEBUG +// printf("Element does not support GPU, transferring to CPU.\r\n"); +//#endif + if (wfr.pBaseRadX != NULL) + wfr.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadX, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + if (wfr.pBaseRadZ != NULL) + wfr.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadZ, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + dataOnDevice = false; + } + else if (!dataOnDevice && (((srTGenOptElem*)it->rep)->SupportedFeatures() & 1) == 1) + { + dataOnDevice = true; +//#if DEBUG +// printf("Element supports GPU, transferring...\r\n"); +//#endif + } + } +#endif + srTRadResizeVect auxResizeVect; - if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect)) return res; + //if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect)) return res; + if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect, pvGPU)) return res; //HG30112023 //maybe to use "PropagateRadiationGuided" for srTCompositeOptElem? //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: PropagateRadiation",&start); - if(propIntIsNeeded) ExtractPropagatedIntensity(wfr, nInt, arID, arIM, arI, elemCount); + if(propIntIsNeeded) + { +#ifdef _OFFLOAD_GPU //HG09112022 If the data is on the GPU, transfer it to CPU and synchronize before extracting the intensity + TGPUUsageArg* pGPU = (TGPUUsageArg*)pvGPU; + if (CAuxGPU::GPUEnabled(pGPU)) { + if (dataOnDevice) + { + if (wfr.pBaseRadX != NULL) + wfr.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadX, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + if (wfr.pBaseRadZ != NULL) + wfr.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadZ, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + dataOnDevice = false; + } + } +#endif + ExtractPropagatedIntensity(wfr, nInt, arID, arIM, arI, elemCount); + } elemCount++; diff --git a/cpp/src/core/sroptcnt.h b/cpp/src/core/sroptcnt.h index 59095ccc..84f7e7d5 100644 --- a/cpp/src/core/sroptcnt.h +++ b/cpp/src/core/sroptcnt.h @@ -34,7 +34,8 @@ class srTCompositeOptElem : public srTGenOptElem { srTCompositeOptElem() {} int PropagateRadiationTest(srTSRWRadStructAccessData*, srTSRWRadStructAccessData*); - int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 + int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pvGPU=0); //HG01122023 + //int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr); int ExtractPropagatedIntensity(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, int elCnt, int indIntSartSearch=0); //27082018 @@ -47,7 +48,8 @@ class srTCompositeOptElem : public srTGenOptElem { GenOptElemList.push_back(OptElemHndl); } - int 
PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG01122023 { int AmOfElem = (int)GenOptElemList.size(); //OC110104 int ElemCount = 0; //OC110104 @@ -65,7 +67,8 @@ class srTCompositeOptElem : public srTGenOptElem { } //if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, MethNo, ResizeBeforeAndAfterVect)) return result; - if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect)) return result; + //if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect)) return result; + if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect, pvGPU)) return result; //HG01122023 } ParPrecWfrPropag.UseResAfter = GenUseResAfter; //OC110104 return 0; diff --git a/cpp/src/core/sroptcryst.h b/cpp/src/core/sroptcryst.h index fd25308e..af6f535f 100644 --- a/cpp/src/core/sroptcryst.h +++ b/cpp/src/core/sroptcryst.h @@ -943,7 +943,8 @@ class srTOptCryst : public srTGenOptElem { return 0; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& 
ResBeforeAndAfterVect, void* pvGPU) //virtual in srTGenOptElem //HG01122023 { m_eStartAux = pRadAccessData->eStart; m_eStepAux = pRadAccessData->eStep; m_ne = pRadAccessData->ne; //required for RadPointModifier @@ -967,7 +968,8 @@ class srTOptCryst : public srTGenOptElem { } //return PropagateRadiationMeth_0(pRadAccessData); - return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + //return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pvGPU); //HG01122023 } //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem @@ -977,7 +979,8 @@ class srTOptCryst : public srTGenOptElem { //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG01122023 {//It works for many photon energies too (as in the case of Drift) //The "in-place" processing involving FFT for many photon energies greatly improves efficiency of the code for Time-/Frequency-Dependent simulations for FEL and pulsed lasers. 
int result; diff --git a/cpp/src/core/sroptdrf.cpp b/cpp/src/core/sroptdrf.cpp index e4018c6f..27246d8b 100644 --- a/cpp/src/core/sroptdrf.cpp +++ b/cpp/src/core/sroptdrf.cpp @@ -352,7 +352,8 @@ int srTDriftSpace::PropagateRadiationMeth_1(srTSRWRadStructAccessData* pRadAcces //int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {// e in eV; Length in m !!! int result; @@ -365,7 +366,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat SetupPropBufVars_PropToWaist(pRadAccessData, &BufVars); //SetupPropBufVars_PropToWaist(pRadAccessData); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //pBufVars->PassNo = 1; //OC06092019 //OC01102019 (restored) @@ -373,7 +375,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //PropBufVars.PassNo = 1; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC29082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC29082019 //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC240114 (commented-out) @@ 
-402,7 +405,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //To remove this? srTDataPtrsForWfrEdgeCorr DataPtrsForWfrEdgeCorr; - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG30112023 + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; #if !defined(_FFTW3) && defined(_WITH_OMP) //OC29082019 //OC04062020 @@ -423,9 +427,11 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat #else //OCTEST01102019: commented-out the above (to see if this will fix problem of TD calcs) FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //FFT2DInfo.pData = pRadAccessData->pBaseRadX; @@ -436,7 +442,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //To remove this? 
if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU); //HG30112023 DataPtrsForWfrEdgeCorr.DisposeData(); } @@ -455,7 +462,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //PropBufVars.PassNo = 2; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC19032022 @@ -479,7 +487,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat } //************************************************************************* -int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData) //OC10112019 +//int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData) //OC10112019 +int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData, void *pvGPU) //HG30112023 {// e in eV; Length in m !!! 
int result = 0; @@ -488,7 +497,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru srTDriftPropBufVars BufVars; SetupPropBufVars_PropToWaistBeyondParax(pRadAccessData, &BufVars); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 pRadAccessData->TreatQuadPhaseTerm('r'); //OC17122019 //pRadAccessData->TreatQuadPhaseTermTerm('r'); @@ -509,7 +519,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru //pRadAccessData->xStart = (pRadAccessData->xStart)*InvLambdaM_d_Rx; //pRadAccessData->zStart = (pRadAccessData->zStart)*InvLambdaM_d_Rz; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 CGenMathFFT2DInfo FFT2DInfo; FFT2DInfo.xStep = pRadAccessData->xStep; @@ -547,9 +558,11 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru #else //OCTEST01102019: commented-out the above (to see if this will fix problem of TD calcs) FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //To remove this? 
@@ -597,7 +610,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru //int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {//Should be very similar to PropagateRadiationSimple_PropToWaist, consider merging int result = 0; @@ -607,7 +621,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //OC01102019 (restored) SetupPropBufVars_PropFromWaist(pRadAccessData, &BufVars); //SetupPropBufVars_PropFromWaist(pRadAccessData); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //OC30082019: commented-out: not needed here, since it is set in ChooseLocalPropMode(...); is it thread-safe? //LocalPropMode = 2; // prop. 
from waist @@ -616,7 +631,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC01102019 (restored) BufVars.PassNo = 1; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //OC06092019 //pBufVars->PassNo = 1; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; @@ -638,7 +654,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //OCTEST (commented-out "edge correction") //OC01102019 (uncommented) srTDataPtrsForWfrEdgeCorr DataPtrsForWfrEdgeCorr; - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG30112023 CGenMathFFT2D FFT2D; @@ -666,16 +683,19 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //} #else FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //OCTEST (commented-out "edge correction") //OC01102019 (uncommented) 
if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU); //HG30112023 DataPtrsForWfrEdgeCorr.DisposeData(); } @@ -689,7 +709,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC01102019 (restored) BufVars.PassNo = 2; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //OC06092019 //pBufVars->PassNo = 2; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; @@ -701,7 +722,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {// e in eV; Length in m !!! int result = 0; @@ -720,7 +742,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetupPropBufVars_AnalytTreatQuadPhaseTerm",&start); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 1",&start); @@ -731,7 +754,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 1; //Remove quadratic term from the Phase in coord. repres. //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -750,7 +774,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt pRadAccessData->WfrEdgeCorrShouldBeDone = 0; - if(result = SetRadRepres(pRadAccessData, 1)) return result; //To angular repres. + //if(result = SetRadRepres(pRadAccessData, 1)) return result; //To angular repres. + if(result = SetRadRepres(pRadAccessData, 1, 0, 0, pvGPU)) return result; //To angular repres. //HG30112023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 2",&start); @@ -761,7 +786,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 2; //Loop in angular repres. //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -773,7 +799,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt pRadAccessData->zStartTr += zShift; } - if(result = SetRadRepres(pRadAccessData, 0)) return result; //Back to coord. repres. + //if(result = SetRadRepres(pRadAccessData, 0)) return result; //Back to coord. repres. + if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //Back to coord. repres. //HG30112023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 3",&start); @@ -816,7 +843,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 3; //Add new quadratic term to the Phase in coord. repres. 
//if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: diff --git a/cpp/src/core/sroptdrf.h b/cpp/src/core/sroptdrf.h index 01b03722..a6a16d20 100644 --- a/cpp/src/core/sroptdrf.h +++ b/cpp/src/core/sroptdrf.h @@ -90,6 +90,7 @@ class srTDriftSpace : public srTGenOptElem { double Length; //OC06092019 (commented-out) //srTDriftPropBufVars PropBufVars; + int SupportedFeatures() override { return 1; } //HG01122023 Returns 1 if the element supports GPU propagation srTDriftSpace(double InLength =0., char InTreatPath =0) { @@ -109,7 +110,8 @@ class srTDriftSpace : public srTGenOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResizeBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG01122023 { //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //double start; @@ -159,7 +161,8 @@ class srTDriftSpace : public srTGenOptElem { //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, &BufVars); //OC06092019 //OC01102019 (restored) - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG01122023 else if(MethNo == 1) result = PropagateRadiationMeth_1(pRadAccessData); else if(MethNo == 2) result = PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect); @@ -175,12 +178,14 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG01122023 {//it works for many photon energies too! 
int result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 //OC01102019 (restored) - if(result = PropagateRadiationSimple(pRadAccessData)) return result; + //if(result = PropagateRadiationSimple(pRadAccessData)) return result; + if(result = PropagateRadiationSimple(pRadAccessData, pvGPU)) return result; //HG01122023 if(result = PropagateRadMoments(pRadAccessData, 0)) return result; if(result = PropagateWaveFrontRadius(pRadAccessData)) return result; if(result = Propagate4x4PropMatr(pRadAccessData)) return result; @@ -189,7 +194,8 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf) //OC06092019 //OC01102019 (restored) - int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem + //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem + int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //virtual in srTGenOptElem //HG01122023 {//because for the Drift, the following works for many photon energies too! 
//return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); //OC251214 @@ -198,7 +204,8 @@ class srTDriftSpace : public srTGenOptElem { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //if((pBufVars->LocalPropMode == 0) || (pBufVars->LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pBuf); //OC06092019 //OC01102019 (restored) - if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + //if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pvGPU); //HG01122023 else { pRadAccessData->SetNonZeroWavefrontLimitsToFullRange(); @@ -304,20 +311,26 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //char LocalPropMode = pBufVars->LocalPropMode; //OC06092019 //OC01102019 (commented-out / restored) - if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData); + //if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData); + if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData, pvGPU); //HG01122023 //OC01102019 (restored) - else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData); + //else if(LocalPropMode == 1) return 
PropagateRadiationSimple_PropToWaist(pRadAccessData); + else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData, pvGPU); //HG01122023 - else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData); //OC10112019 + //else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData); //OC10112019 + else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData, pvGPU); //OC10112019 //HG01122023 - else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData); //OC240114 (added) - else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData); + //else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData); //OC240114 (added) + else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData, pvGPU); //OC240114 (added) //HG01122023 + //else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData); + else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData, pvGPU); //HG01122023 //OC06092019 //else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData, pBufVars); //else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData, pBufVars); //OC240114 (added) @@ -329,7 +342,8 @@ class srTDriftSpace : public srTGenOptElem { else return 0; } - int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 { //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //double start; @@ -351,13 +365,15 @@ class srTDriftSpace : public srTGenOptElem { if(pRadAccessData->Pres != 1) { - if(result = SetRadRepres(pRadAccessData, 1)) return result; + //if(result = SetRadRepres(pRadAccessData, 1)) return result; + if(result = SetRadRepres(pRadAccessData, 1, 0, 0, pvGPU)) return result; //HG01122023 } //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:SetRadRepres 1",&start); - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG01122023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:TraverseRadZXE",&start); @@ -368,7 +384,8 @@ class srTDriftSpace : public srTGenOptElem { pRadAccessData->zStartTr += zShift; } - if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG01122023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:SetRadRepres 2",&start); @@ -390,11 +407,15 @@ class srTDriftSpace : public srTGenOptElem { } //OC01102019 (restored) - int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData); - int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData); //OC10112019 - - int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData); - int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData); + //int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 + //int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData); //OC10112019 + int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //OC10112019 //HG01122023 + + //int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 + //int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 //OC06092019 //int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars=0); //int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars=0); @@ -553,6 +574,16 @@ class srTDriftSpace : public srTGenOptElem { void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //OC29082019 //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) + { + RadPointModifierPortable(EXZ, EPtrs, 
pBuf); //HG01122023 + } + +#ifdef _OFFLOAD_GPU //HG01122023 + int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pGPU=0) override; + + GPU_PORTABLE +#endif + void RadPointModifierPortable(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //HG01122023 { srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //char LocalPropMode = pBufVars->LocalPropMode; @@ -568,6 +599,9 @@ class srTDriftSpace : public srTGenOptElem { //else if(LocalPropMode == 3) { RadPointModifier_AnalytTreatQuadPhaseTerm(EXZ, EPtrs); return;} } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_AngRepres(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {// e in eV; Length in m !!! // Operates on Angles side !!! @@ -599,6 +633,9 @@ class srTDriftSpace : public srTGenOptElem { *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropToWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC29082019 //void RadPointModifier_PropToWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) { @@ -661,6 +698,9 @@ class srTDriftSpace : public srTGenOptElem { } } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropToWaistBeyondParax(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC10112019 { double rx = EXZ.x, rz = EXZ.z; @@ -695,6 +735,9 @@ class srTDriftSpace : public srTGenOptElem { *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropFromWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC30082019 //void RadPointModifier_PropFromWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) { @@ -742,6 +785,9 @@ class srTDriftSpace : public srTGenOptElem { } } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_AnalytTreatQuadPhaseTerm(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, 
srTDriftPropBufVars* pBufVars) //OC30082019 //void RadPointModifier_AnalytTreatQuadPhaseTerm(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {//don't use RobsX, RobsZ directly here! diff --git a/cpp/src/core/sroptdrf_gpu.cu b/cpp/src/core/sroptdrf_gpu.cu new file mode 100644 index 00000000..7d98fa8a --- /dev/null +++ b/cpp/src/core/sroptdrf_gpu.cu @@ -0,0 +1,29 @@ +/************************************************************************//** + * File: sroptdrf_gpu.cu + * Description: Optical element: Drift space (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" + +#include +#include +#include +#include "sroptdrf.h" + +//Implementation of the RadPointModifier's GPU function for the srTDriftSpace class +int srTDriftSpace::RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, TGPUUsageArg *pGpu) +{ + return RadPointModifierParallelImpl(pRadAccessData, pBufVars, pBufVarsSz, this, pGpu); +} //HG03092022 +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptel2.cpp b/cpp/src/core/sroptel2.cpp index 0c426d5d..73d11027 100644 --- a/cpp/src/core/sroptel2.cpp +++ b/cpp/src/core/sroptel2.cpp @@ -37,7 +37,8 @@ double srTGenOptElem::CheckMemoryAvailable() //int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf) //OC06092019 //OC01102019 (restored) -int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) +//int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) +int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void *pvGPU) //HG30112023 
{//Moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 //This propagation method doesn't allow for true wavefront "resizing/resampling" //(which results in changing numbers of points) in "slices" vs photon energy. diff --git a/cpp/src/core/sroptelm.cpp b/cpp/src/core/sroptelm.cpp index 341d0bae..bde700d1 100644 --- a/cpp/src/core/sroptelm.cpp +++ b/cpp/src/core/sroptelm.cpp @@ -30,6 +30,10 @@ #include "sropthck.h" #include "sroptgrat.h" +#ifdef _OFFLOAD_GPU //HG01122023 +#include "auxgpu.h" +#endif + #ifdef _WITH_OMP //Pre-processor definition for compiling with OpenMP library #include "omp.h" #endif @@ -146,7 +150,8 @@ int srTGenOptElem::ExtraDataExpected(const char* sElemID) //OC01062020 //************************************************************************* -int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars) //OC29082019 +//int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars) //OC29082019 +int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, void* pvGPU) //OC29082019 //HG01122023 //int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData) { float *pEx0 = pRadAccessData->pBaseRadX; @@ -156,6 +161,15 @@ int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, voi long long PerX = pRadAccessData->ne << 1; long long PerZ = PerX*pRadAccessData->nx; +#ifdef _OFFLOAD_GPU //HG01122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + if (RadPointModifierParallel(pRadAccessData, pBufVars, pBufVarsSz, (TGPUUsageArg*)pvGPU) == -1) //Try to call the GPU version, if it fails, call the CPU version + return TraverseRadZXE(pRadAccessData, pBufVars, pBufVarsSz, NULL); + return 0; + } +#endif + #ifndef _WITH_OMP //OC28102018 srTEFieldPtrs EFieldPtrs; @@ -731,7 +745,8 @@ int srTGenOptElem::RemoveSliceConstE_FromGenRadStruct(srTSRWRadStructAccessData* 
//************************************************************************* -int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr) +//int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr) +int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr, void* pvGPU) //HG01122023 { int result; @@ -849,7 +864,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->zStart; FFT1DInfo.Nx = pRadAccessData->nz; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dxFi != 0.) { @@ -889,7 +905,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->zStart; FFT1DInfo.Nx = pRadAccessData->nz; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dzSt != 0.) { @@ -913,7 +930,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->xStart; FFT1DInfo.Nx = pRadAccessData->nx; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dzFi != 0.) 
{ @@ -936,7 +954,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->xStart; FFT1DInfo.Nx = pRadAccessData->nx; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } DataPtrsForWfrEdgeCorr.WasSetup = 1; } @@ -1015,8 +1034,18 @@ int srTGenOptElem::SetupWfrEdgeCorrData1D(srTRadSect1D* pRadSect1D, float* pData //************************************************************************* -void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs) +//void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs) +void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, void* pvGPU) //HG01122023 { + //HG23082022 Use GPU if requested +#ifdef _OFFLOAD_GPU + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + MakeWfrEdgeCorrection_GPU(pRadAccessData, pDataEx, pDataEz, DataPtrs, (TGPUUsageArg*)pvGPU); + return; + } +#endif + float *tEx = pDataEx, *tEz = pDataEz; double dxSt_dzSt = DataPtrs.dxSt*DataPtrs.dzSt; @@ -1204,7 +1233,8 @@ void srTGenOptElem::MakeWfrEdgeCorrection1D(srTRadSect1D* pRadSect1D, float* pDa //************************************************************************* //int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng) -int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, double* ar_zStartInSlicesE) +//int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, 
double* ar_zStartInSlicesE) +int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, double* ar_zStartInSlicesE, void* pvGPU) //HG01122023 {// 0- to coord.; 1- to ang. int result; @@ -1247,7 +1277,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(CoordOrAng == 1) { - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG01122023 } } @@ -1255,9 +1286,11 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char if(ar_zStartInSlicesE != 0) FFT2DInfo.yStart = *ar_zStartInSlicesE; FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 if(WfrEdgeCorrShouldBeTreated) { @@ -1265,7 +1298,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, 
DataPtrsForWfrEdgeCorr, pvGPU); //HG01122023 DataPtrsForWfrEdgeCorr.DisposeData(); } } @@ -1309,7 +1343,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(CoordOrAng == 1) { - if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG01122023 } } @@ -1318,9 +1353,11 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char if(ar_zStartInSlicesE != 0) FFT2DInfo.yStart = ar_zStartInSlicesE[ie]; FFT2DInfo.pData = AuxEx; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 FFT2DInfo.pData = AuxEz; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 if(WfrEdgeCorrShouldBeTreated) { @@ -1328,7 +1365,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr, pvGPU); //HG01122023 DataPtrsForWfrEdgeCorr.DisposeData(); } } @@ -2182,7 +2220,8 @@ void srTGenOptElem::FindMinMaxRatio(double* Arr1, double* Arr2, int n, double& M //************************************************************************* -int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct) +//int 
srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct) +int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct, void* pvGPU) //HG01122023 { //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //double start; @@ -2257,7 +2296,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat SRWRadStructAccessData.zWfrMin += zShift; SRWRadStructAccessData.zWfrMax += zShift; } - if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres)) return result; + //if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres)) return result; + if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres, 0, 0, pvGPU)) return result; //HG01122023 double pxmNew = RadResizeStruct.pxd, pxdNew = RadResizeStruct.pxm; double pzmNew = RadResizeStruct.pzd, pzdNew = RadResizeStruct.pzm; @@ -2537,7 +2577,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: copydata",&start); - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 0, pvGPU)) return result; //HG01122023 if(OldRadXCopy != 0) delete[] OldRadXCopy; if(OldRadZCopy != 0) delete[] OldRadZCopy; @@ -2602,7 +2643,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //Added by SY (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: TreatPolarizSepar-PrepareStructs",&start); - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 0, pvGPU)) return result; //HG01122023 //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: RadResizeCore 2",&start); @@ -2662,7 +2704,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat *(tBaseRadX++) = 0.; } SRWRadStructAccessData.pBaseRadX = OldRadXCopy; - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x')) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x')) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x', pvGPU)) return result; //HG01122023 if(OldRadXCopy != 0) delete[] OldRadXCopy; } //Added by SY (for profiling?) at parallelizing SRW via OpenMP: @@ -2698,7 +2741,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat *(tBaseRadZ++) = 0.; } SRWRadStructAccessData.pBaseRadZ = OldRadZCopy; - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z')) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z')) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z', pvGPU)) return result; //HG01122023 if(OldRadZCopy != 0) delete[] OldRadZCopy; } //Added by SY (for profiling?) 
at parallelizing SRW via OpenMP: @@ -2734,7 +2778,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //for(long j=0; j NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; - } + //SY: do we need this (always returns 0, updates some clock) + //if(result = srYield.Check()) return result; - int izcOld = int((zAbs - OldRadAccessData.zStart)*zStepInvOld + 1.E-06); + double zAbs = NewRadAccessData.zStart + iz*NewRadAccessData.zStep; - double zRel = zAbs - (OldRadAccessData.zStart + izcOld*OldRadAccessData.zStep); + char FieldShouldBeZeroedDueToZ = 0; + if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if((zAbs < NewRadAccessData.zWfrMin - DistAbsTol) || (zAbs > NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; + } - if(izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2.*OldRadAccessData.zStep;} - else if(izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep;} - else if(izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep;} - else izStOld = izcOld - 1; + int izcOld = int((zAbs - OldRadAccessData.zStart)*zStepInvOld + 1.E-06); - zRel *= zStepInvOld; + double zRel = zAbs - (OldRadAccessData.zStart + izcOld*OldRadAccessData.zStep); - int izcOld_mi_izStOld = izcOld - izStOld; - //long izPerZ_New = iz*PerZ_New; - long long izPerZ_New = iz*PerZ_New; + if(izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2.*OldRadAccessData.zStep;} + else if(izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep;} + else if(izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep;} + else izStOld = izcOld - 1; - float *pEX_StartForX_New = 0, *pEZ_StartForX_New = 0; - if(TreatPolCompX) pEX_StartForX_New = pEX0_New + izPerZ_New; - if(TreatPolCompZ) pEZ_StartForX_New = pEZ0_New + izPerZ_New; + zRel *= zStepInvOld; - for(int ix=ixStart; ix<=ixEnd; ix++) - { - //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; - long long ixPerX_New_p_Two_ie = ix*PerX_New 
+ Two_ie; - float *pEX_New = 0, *pEZ_New = 0; - if(TreatPolCompX) pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; - if(TreatPolCompZ) pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; + int izcOld_mi_izStOld = izcOld - izStOld; + //long izPerZ_New = iz*PerZ_New; + long long izPerZ_New = iz*PerZ_New; - double xAbs = NewRadAccessData.xStart + ix*NewRadAccessData.xStep; + float *pEX_StartForX_New = 0, *pEZ_StartForX_New = 0; + if(TreatPolCompX) pEX_StartForX_New = pEX0_New + izPerZ_New; + if(TreatPolCompZ) pEZ_StartForX_New = pEZ0_New + izPerZ_New; - char FieldShouldBeZeroedDueToX = 0; - if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + for(int ix=ixStart; ix<=ixEnd; ix++) { - if((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; - } - char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || FieldShouldBeZeroedDueToZ); + //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + long long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + float *pEX_New = 0, *pEZ_New = 0; + if(TreatPolCompX) pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; + if(TreatPolCompZ) pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; - int ixcOld = int((xAbs - OldRadAccessData.xStart)*xStepInvOld + 1.E-06); - double xRel = xAbs - (OldRadAccessData.xStart + ixcOld*OldRadAccessData.xStep); + double xAbs = NewRadAccessData.xStart + ix*NewRadAccessData.xStep; - if(ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2.*OldRadAccessData.xStep;} - else if(ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep;} - else if(ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep;} - else ixStOld = ixcOld - 1; + char FieldShouldBeZeroedDueToX = 0; + if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; + } + char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || 
FieldShouldBeZeroedDueToZ); - xRel *= xStepInvOld; + int ixcOld = int((xAbs - OldRadAccessData.xStart)*xStepInvOld + 1.E-06); + double xRel = xAbs - (OldRadAccessData.xStart + ixcOld*OldRadAccessData.xStep); - int ixcOld_mi_ixStOld = ixcOld - ixStOld; + if(ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2.*OldRadAccessData.xStep;} + else if(ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep;} + else if(ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep;} + else ixStOld = ixcOld - 1; - if((izStOld != izStOldPrev) || (ixStOld != ixStOldPrev)) - { - UseLowOrderInterp_PolCompX = 0; UseLowOrderInterp_PolCompZ = 0; + xRel *= xStepInvOld; - //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; - long long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + int ixcOld_mi_ixStOld = ixcOld - ixStOld; - if(TreatPolCompX) + if((izStOld != izStOldPrev) || (ixStOld != ixStOldPrev)) { - float* pExSt_Old = OldRadAccessData.pBaseRadX + TotOffsetOld; - GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + UseLowOrderInterp_PolCompX = 0; UseLowOrderInterp_PolCompZ = 0; - SetupCellDataI(AuxF, AuxFI); - UseLowOrderInterp_PolCompX = CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + long long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; - if(!UseLowOrderInterp_PolCompX) + if(TreatPolCompX) { - for(int i=0; i<2; i++) + float* pExSt_Old = OldRadAccessData.pBaseRadX + TotOffsetOld; + GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + + SetupCellDataI(AuxF, AuxFI); + UseLowOrderInterp_PolCompX = CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + + if(!UseLowOrderInterp_PolCompX) { - SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + for(int i=0; i<2; i++) + 
{ + SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + } + SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); } - SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); } - } - if(TreatPolCompZ) - { - float* pEzSt_Old = OldRadAccessData.pBaseRadZ + TotOffsetOld; - GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF+2); + if(TreatPolCompZ) + { + float* pEzSt_Old = OldRadAccessData.pBaseRadZ + TotOffsetOld; + GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF+2); - SetupCellDataI(AuxF+2, AuxFI+1); - UseLowOrderInterp_PolCompZ = CheckForLowOrderInterp(AuxF+2, AuxFI+1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02+2, InterpolAux02I+1); + SetupCellDataI(AuxF+2, AuxFI+1); + UseLowOrderInterp_PolCompZ = CheckForLowOrderInterp(AuxF+2, AuxFI+1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02+2, InterpolAux02I+1); - if(!UseLowOrderInterp_PolCompZ) - { - for(int i=0; i<2; i++) + if(!UseLowOrderInterp_PolCompZ) { - SetupInterpolAux02(AuxF+2+i, &InterpolAux01, InterpolAux02+2+i); + for(int i=0; i<2; i++) + { + SetupInterpolAux02(AuxF+2+i, &InterpolAux01, InterpolAux02+2+i); + } + SetupInterpolAux02(AuxFI+1, &InterpolAux01, InterpolAux02I+1); } - SetupInterpolAux02(AuxFI+1, &InterpolAux01, InterpolAux02I+1); } - } - ixStOldPrev = ixStOld; izStOldPrev = izStOld; - } - - if(TreatPolCompX) - { - if(UseLowOrderInterp_PolCompX) - { - InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); - InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + ixStOldPrev = ixStOld; izStOldPrev = izStOld; } - else + + if(TreatPolCompX) { - InterpolF(InterpolAux02, xRel, zRel, BufF, 0); - InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); - } + if(UseLowOrderInterp_PolCompX) + { + InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); + InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + } + else + { + InterpolF(InterpolAux02, xRel, zRel, BufF, 0); + InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); + 
} - (*BufFI) *= AuxFI->fNorm; - ImproveReAndIm(BufF, BufFI); + (*BufFI) *= AuxFI->fNorm; + ImproveReAndIm(BufF, BufFI); - if(FieldShouldBeZeroed) - { - *BufF = 0.; *(BufF+1) = 0.; - } + if(FieldShouldBeZeroed) + { + *BufF = 0.; *(BufF+1) = 0.; + } - *pEX_New = *BufF; - *(pEX_New+1) = *(BufF+1); - } - if(TreatPolCompZ) - { - if(UseLowOrderInterp_PolCompZ) - { - InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); - InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + *pEX_New = *BufF; + *(pEX_New+1) = *(BufF+1); } - else + if(TreatPolCompZ) { - InterpolF(InterpolAux02, xRel, zRel, BufF, 2); - InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); - } + if(UseLowOrderInterp_PolCompZ) + { + InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); + InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + } + else + { + InterpolF(InterpolAux02, xRel, zRel, BufF, 2); + InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); + } - (*(BufFI+1)) *= (AuxFI+1)->fNorm; - ImproveReAndIm(BufF+2, BufFI+1); + (*(BufFI+1)) *= (AuxFI+1)->fNorm; + ImproveReAndIm(BufF+2, BufFI+1); - if(FieldShouldBeZeroed) - { - *(BufF+2) = 0.; *(BufF+3) = 0.; - } + if(FieldShouldBeZeroed) + { + *(BufF+2) = 0.; *(BufF+3) = 0.; + } - *pEZ_New = *(BufF+2); - *(pEZ_New+1) = *(BufF+3); + *pEZ_New = *(BufF+2); + *(pEZ_New+1) = *(BufF+3); + } } } } @@ -3089,7 +3147,8 @@ int srTGenOptElem::RadResizeCore(srTSRWRadStructAccessData& OldRadAccessData, sr //sprintf(str,"%s %d",":RadResizeCore: cycles:",NewRadAccessData.ne); //srwlPrintTime(str,&start); - if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp); + //if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp); + if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp, -1, pvGPU); //HG01122023 //OC31102018: added by SY (for profiling?) 
at parallelizing SRW via OpenMP //srwlPrintTime(":RadResizeCore: TreatStronglyOscillatingTerm 2",&start); @@ -4503,7 +4562,8 @@ char srTGenOptElem::WaveFrontTermCanBeTreated(srTSRWRadStructAccessData& RadAcce //************************************************************************* -void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly) +//void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly) +void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly, void* pvGPU) //HG01122023 { //Later treat X and Z coordinates separately here!!! @@ -4634,6 +4694,14 @@ void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadA ieStart = ieOnly; ieBefEnd = ieOnly + 1; } +#ifdef _OFFLOAD_GPU //HG01122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + TreatStronglyOscillatingTerm_GPU(RadAccessData, TreatPolCompX, TreatPolCompZ, ConstRx, ConstRz, ieStart, ieBefEnd, (TGPUUsageArg*)pvGPU); + return; + } +#endif + #ifdef _WITH_OMP //OC31102018: added by SY at parallelizing SRW via OpenMP #pragma omp parallel for #endif diff --git a/cpp/src/core/sroptelm.h b/cpp/src/core/sroptelm.h index 4e5c9445..a4ea6919 100644 --- a/cpp/src/core/sroptelm.h +++ b/cpp/src/core/sroptelm.h @@ -17,6 +17,7 @@ #include //required by some (buggy?) version of GCC #include //required? 
+ #include "gmtrans.h" #include "gmvect.h" @@ -43,6 +44,11 @@ #endif #endif +#ifdef _OFFLOAD_GPU +#include "auxgpu.h" +#include "sroptelm_gpu.h" +#endif + //************************************************************************* extern srTIntVect gVectWarnNos; @@ -119,7 +125,10 @@ class srTGenOptElem : public CGenObject { #endif } - virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&) { return 0;} + virtual int SupportedFeatures() { return 0; } //HG01122023 0=CPU only, 1=GPU supported + + //virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&) { return 0;} + virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&, void* pvGPU=0) { return 0;} //HG01122023 virtual int PropagateRadMoments(srTSRWRadStructAccessData*, srTMomentsRatios*) { return 0;} virtual int PropagateWaveFrontRadius(srTSRWRadStructAccessData*) { return 0;} @@ -128,16 +137,21 @@ class srTGenOptElem : public CGenObject { //virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*, void* pBuf=0) { return 0;} //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*) { return 0;} + //virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*) { return 0;} + virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*, void* pvGPU=0) { return 0;} //HG01122023 virtual int PropagateRadiationSimple1D(srTRadSect1D*) { return 0;} //virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData, void* pBuf=0) { return 0;} //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData) { return 0;} + //virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData) { return 0;} + virtual 
int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData, void* pvGPU=0) { return 0;} //HG01122023 virtual int RangeShouldBeAdjustedAtPropag() { return 1;} virtual int ResolutionShouldBeAdjustedAtPropag() { return 1;} +#ifdef _OFFLOAD_GPU //HG01122023 + virtual int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pvGPU=0) { return -1; } +#endif virtual void RadPointModifier(srTEXZ&, srTEFieldPtrs&, void* pBufVars=0) {} //OC29082019 //virtual void RadPointModifier(srTEXZ&, srTEFieldPtrs&) {} virtual void RadPointModifier1D(srTEXZ&, srTEFieldPtrs&, void* pBufVars=0) {}//OC06092019 @@ -182,7 +196,8 @@ class srTGenOptElem : public CGenObject { //virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0); //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 + //virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 + virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 //HG01122023 void FindWidestWfrMeshParam(vector& vRadSlices, srTSRWRadStructAccessData* pRad, bool keepConstNumPoints); int ReInterpolateWfrDataOnNewTransvMesh(vector& vRadSlices, srTSRWRadStructAccessData* pAuxRadSingleE, srTSRWRadStructAccessData* pRadRes); @@ -236,7 +251,8 @@ class srTGenOptElem : public CGenObject { int FillOutRadFromInRad(srTSRWRadStructAccessData*, srTSRWRadStructAccessData*); - int TraverseRadZXE(srTSRWRadStructAccessData*, void* pBufVars=0); //OC29082019 + int TraverseRadZXE(srTSRWRadStructAccessData*, void* 
pBufVars=0, long pBufVarsSz=0, void* pvGPU=0); //OC29082019 //HG01122023 + //int TraverseRadZXE(srTSRWRadStructAccessData*, void* pBufVars=0); //OC29082019 //int TraverseRadZXE(srTSRWRadStructAccessData*); int TraverseRad1D(srTRadSect1D*, void* pBufVars=0); //OC29082019 //int TraverseRad1D(srTRadSect1D*); @@ -258,41 +274,73 @@ class srTGenOptElem : public CGenObject { int RemoveSliceConstE_FromGenRadStruct(srTSRWRadStructAccessData*, long); //int SetRadRepres(srTSRWRadStructAccessData*, char); - int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0); + //int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0); + int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0, void* pvGPU=0); //HG01122023 int SetRadRepres1D(srTRadSect1D*, char); - int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); + int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023 + //int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); //inline void SetupExpCorrArray(float*, long, double, double, double); inline void SetupExpCorrArray(float*, long long, double, double, double); - void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); + void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023 + //void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); +#ifdef _OFFLOAD_GPU //HG01122023 + void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU); +#endif int SetupWfrEdgeCorrData1D(srTRadSect1D*, float*, 
float*, srTDataPtrsForWfrEdgeCorr1D&); void MakeWfrEdgeCorrection1D(srTRadSect1D*, float*, float*, srTDataPtrsForWfrEdgeCorr1D&); int ComputeRadMoments(srTSRWRadStructAccessData*); - int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&); + int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&, void* pvGPU=0); //HG01122023 + //int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&); int RadResizeGenE(srTSRWRadStructAccessData&, srTRadResize&); - int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); + int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0, void* =0); //HG01122023 + //int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); +#ifdef _OFFLOAD_GPU //HG01122023 + int RadResizeCore_GPU(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, char =0, TGPUUsageArg* =0); +#endif int RadResizeCoreE(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); int RadResizeCore_OnlyLargerRange(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, srTRadResize& RadResizeStruct, char PolComp); int RadResizeCore_OnlyLargerRangeE(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, srTRadResize& RadResizeStruct, char PolComp); //inline void GetCellDataForInterpol(float*, long long , long long, srTInterpolAuxF*); +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif inline static void GetCellDataForInterpol(float*, long long, long long, srTInterpolAuxF*); //OC02022020 //inline void SetupCellDataI(srTInterpolAuxF*, srTInterpolAuxF*); +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif inline static void SetupCellDataI(srTInterpolAuxF*, srTInterpolAuxF*); //OC02022020 //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&); //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=true); //OC06012017 
(uncommented after some fixes in bool srTSRWRadStructAccessData::CheckIfQuadTermTreatIsBenefit(char, char)) //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=false); //OC05012017 (changed to checkBenefit=false to resolve problem of resizing in near field at strong under-sampling) char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=false); //OC29032017 (changed again to checkBenefit=false to resolve problem of resizing of wiggler radiation at strong under-sampling, the ELETTRA SCW case) - void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1); + //void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1); + void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1, void* pvGPU=0); //HG01122023 +#ifdef _OFFLOAD_GPU //HG01122023 + void TreatStronglyOscillatingTerm_GPU(srTSRWRadStructAccessData& RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart, int ieBefEnd, TGPUUsageArg* pGPU); +#endif //void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, float*, float, float, float, float, char, char =0, int =-1); void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, double*, double, double, double, double, char, char =0, int =-1); //OC260114 //void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, double*, double, double, double, double, char, char =0, int =-1, double =1, double =1); //OC220214 void TreatStronglyOscillatingTermIrregMeshTrf(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, double CrdTrf[2][3], char PolComp =0, int ieOnly =-1); //OC27122020 +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE inline static void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); //OC02022020 + GPU_PORTABLE inline static void SetupInterpolAux02_LowOrder(srTInterpolAuxF*, srTInterpolAux01*, 
srTInterpolAux02*); //OC02022020 + GPU_PORTABLE inline static void InterpolF(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolFI(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolF_LowOrder(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolFI_LowOrder(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline double InterpLin(double r, double f1, double f2) { return f1 + r*(f2 - f1);} + GPU_PORTABLE inline static void ImproveReAndIm(float*, float*); //OC02022020 + GPU_PORTABLE inline static int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); //OC02022020 +#else //inline void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); inline static void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); //OC02022020 //inline void SetupInterpolAux02_LowOrder(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); @@ -310,6 +358,7 @@ class srTGenOptElem : public CGenObject { inline static void ImproveReAndIm(float*, float*); //OC02022020 //inline int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); inline static int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); //OC02022020 +#endif int RadResizeGen1D(srTRadSect1D&, srTRadResize1D&); int RadResizeCore1D(srTRadSect1D&, srTRadSect1D&, srTRadResize1D&); @@ -346,6 +395,9 @@ class srTGenOptElem : public CGenObject { //inline void MultSquareMatrByVect(float**, float*, int, float*); inline void MultSquareMatrByVect(double**, double*, int, double*); //OC130311 +#ifdef _OFFLOAD_GPU //HG04122023 + GPU_PORTABLE +#endif inline void CosAndSin(double, float&, float&); 
inline void FindLowestAndUppestPoints(TVector3d&, TVector3d*, int, int&, int&); inline void ReflectVect(TVector3d& N, TVector3d& V); diff --git a/cpp/src/core/sroptelm_gpu.cu b/cpp/src/core/sroptelm_gpu.cu new file mode 100644 index 00000000..f9a65861 --- /dev/null +++ b/cpp/src/core/sroptelm_gpu.cu @@ -0,0 +1,587 @@ +/************************************************************************//** + * File: sroptelm_gpu.cu + * Description: Optical element (general CUDA functions) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" +#include +#include +#include +#include "sroptelm.h" +#include "sroptelm_gpu.h" + + +__global__ void TreatStronglyOscillatingTerm_Kernel(srTSRWRadStructAccessData RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + int ie = (blockIdx.z * blockDim.z + threadIdx.z) + ieStart; //ne range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz && ie < RadAccessData.ne + ieStart) + { + double ePh = RadAccessData.eStart + RadAccessData.eStep * (ie - ieStart); + if (RadAccessData.PresT == 1) + { + ePh = RadAccessData.avgPhotEn; //?? 
OC041108 + } + + double ConstRxE = ConstRx * ePh; + double ConstRzE = ConstRz * ePh; + if (RadAccessData.Pres == 1) + { + //double Lambda_m = 1.239854e-06/ePh; + double Lambda_m = 1.239842e-06 / ePh; + if (RadAccessData.PhotEnergyUnit == 1) Lambda_m *= 0.001; // if keV + + double Lambda_me2 = Lambda_m * Lambda_m; + ConstRxE *= Lambda_me2; + ConstRzE *= Lambda_me2; + } + + double z = (RadAccessData.zStart - RadAccessData.zc) + (iz * RadAccessData.zStep); + double PhaseAddZ = 0; + if (RadAccessData.WfrQuadTermCanBeTreatedAtResizeZ) PhaseAddZ = ConstRzE * z * z; + + double x = (RadAccessData.xStart - RadAccessData.xc) + (ix * RadAccessData.xStep); + double Phase = PhaseAddZ; + if (RadAccessData.WfrQuadTermCanBeTreatedAtResizeX) Phase += ConstRxE * x * x; + + float SinPh, CosPh; + sincosf(Phase, &SinPh, &CosPh); + + long long PerX = RadAccessData.ne << 1; + long long PerZ = PerX * RadAccessData.nx; + long long offset = ie * 2 + iz * PerZ + ix * PerX; + + if (TreatPolCompX) + { + float* pExRe = RadAccessData.pBaseRadX + offset; + float* pExIm = pExRe + 1; + double ExReNew = (*pExRe) * CosPh - (*pExIm) * SinPh; + double ExImNew = (*pExRe) * SinPh + (*pExIm) * CosPh; + *pExRe = (float)ExReNew; *pExIm = (float)ExImNew; + } + if (TreatPolCompZ) + { + float* pEzRe = RadAccessData.pBaseRadZ + offset; + float* pEzIm = pEzRe + 1; + double EzReNew = (*pEzRe) * CosPh - (*pEzIm) * SinPh; + double EzImNew = (*pEzRe) * SinPh + (*pEzIm) * CosPh; + *pEzRe = (float)EzReNew; *pEzIm = (float)EzImNew; + } + } +} + +void srTGenOptElem::TreatStronglyOscillatingTerm_GPU(srTSRWRadStructAccessData& RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart, int ieBefEnd, TGPUUsageArg* pGPU) +{ + if (RadAccessData.pBaseRadX != NULL) + { + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadX); + 
} + if (RadAccessData.pBaseRadZ != NULL) + { + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadZ); + } + + const int bs = 256; + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz, ieBefEnd - ieStart); + dim3 threads(bs, 1); + TreatStronglyOscillatingTerm_Kernel<< > > (RadAccessData, TreatPolCompX, TreatPolCompZ, ConstRx, ConstRz, ieStart); + + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); + +#ifndef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +__global__ void MakeWfrEdgeCorrection_Kernel(srTSRWRadStructAccessData RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr DataPtrs, float dxSt, float dxFi, float dzSt, float dzFi) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz) + { + //float dxSt = (float)DataPtrs.dxSt; + //float dxFi = (float)DataPtrs.dxFi; + //float dzSt = 
(float)DataPtrs.dzSt; + //float dzFi = (float)DataPtrs.dzFi; + float dxSt_dzSt = dxSt * dzSt; + float dxSt_dzFi = dxSt * dzFi; + float dxFi_dzSt = dxFi * dzSt; + float dxFi_dzFi = dxFi * dzFi; + + long TwoNz = RadAccessData.nz << 1; + long PerX = 2; + long PerZ = PerX * RadAccessData.nx; + + float fSSExRe = DataPtrs.fxStzSt[0]; + float fSSExIm = DataPtrs.fxStzSt[1]; + float fSSEzRe = DataPtrs.fxStzSt[2]; + float fSSEzIm = DataPtrs.fxStzSt[3]; + + float fFSExRe = DataPtrs.fxFizSt[0]; + float fFSExIm = DataPtrs.fxFizSt[1]; + float fFSEzRe = DataPtrs.fxFizSt[2]; + float fFSEzIm = DataPtrs.fxFizSt[3]; + + float fSFExRe = DataPtrs.fxStzFi[0]; + float fSFExIm = DataPtrs.fxStzFi[1]; + float fSFEzRe = DataPtrs.fxStzFi[2]; + float fSFEzIm = DataPtrs.fxStzFi[3]; + + float fFFExRe = DataPtrs.fxFizFi[0]; + float fFFExIm = DataPtrs.fxFizFi[1]; + float fFFEzRe = DataPtrs.fxFizFi[2]; + float fFFEzIm = DataPtrs.fxFizFi[3]; + + float bRe, bIm, cRe, cIm; + + long long Two_iz = iz << 1; + long long Two_iz_p_1 = Two_iz + 1; + long long Two_ix = ix << 1; + long long Two_ix_p_1 = Two_ix + 1; + + float* tEx = pDataEx + iz * PerZ + ix * PerX, * tEz = pDataEz + iz * PerZ + ix * PerX; + float ExRe = *tEx, ExIm = *(tEx + 1); + float EzRe = *tEz, EzIm = *(tEz + 1); + + if (dxSt != 0.f) + { + float ExpXStRe = DataPtrs.ExpArrXSt[Two_ix], ExpXStIm = DataPtrs.ExpArrXSt[Two_ix_p_1]; + + bRe = DataPtrs.FFTArrXStEx[Two_iz]; bIm = DataPtrs.FFTArrXStEx[Two_iz_p_1]; + ExRe += (float)(dxSt * (ExpXStRe * bRe - ExpXStIm * bIm)); + ExIm += (float)(dxSt * (ExpXStRe * bIm + ExpXStIm * bRe)); + + bRe = DataPtrs.FFTArrXStEz[Two_iz]; bIm = DataPtrs.FFTArrXStEz[Two_iz_p_1]; + EzRe += (float)(dxSt * (ExpXStRe * bRe - ExpXStIm * bIm)); + EzIm += (float)(dxSt * (ExpXStRe * bIm + ExpXStIm * bRe)); + + if (dzSt != 0.f) + { + bRe = DataPtrs.ExpArrZSt[Two_iz], bIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + cRe = ExpXStRe * bRe - ExpXStIm * bIm; cIm = ExpXStRe * bIm + ExpXStIm * bRe; + + ExRe += (float)(dxSt_dzSt * (fSSExRe * 
cRe - fSSExIm * cIm)); + ExIm += (float)(dxSt_dzSt * (fSSExRe * cIm + fSSExIm * cRe)); + EzRe += (float)(dxSt_dzSt * (fSSEzRe * cRe - fSSEzIm * cIm)); + EzIm += (float)(dxSt_dzSt * (fSSEzRe * cIm + fSSEzIm * cRe)); + } + if (dzFi != 0.f) + { + bRe = DataPtrs.ExpArrZFi[Two_iz], bIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + cRe = ExpXStRe * bRe - ExpXStIm * bIm; cIm = ExpXStRe * bIm + ExpXStIm * bRe; + + ExRe -= (float)(dxSt_dzFi * (fSFExRe * cRe - fSFExIm * cIm)); + ExIm -= (float)(dxSt_dzFi * (fSFExRe * cIm + fSFExIm * cRe)); + EzRe -= (float)(dxSt_dzFi * (fSFEzRe * cRe - fSFEzIm * cIm)); + EzIm -= (float)(dxSt_dzFi * (fSFEzRe * cIm + fSFEzIm * cRe)); + } + } + if (dxFi != 0.f) + { + float ExpXFiRe = DataPtrs.ExpArrXFi[Two_ix], ExpXFiIm = DataPtrs.ExpArrXFi[Two_ix_p_1]; + + bRe = DataPtrs.FFTArrXFiEx[Two_iz]; bIm = DataPtrs.FFTArrXFiEx[Two_iz_p_1]; + ExRe -= (float)(dxFi * (ExpXFiRe * bRe - ExpXFiIm * bIm)); + ExIm -= (float)(dxFi * (ExpXFiRe * bIm + ExpXFiIm * bRe)); + + bRe = DataPtrs.FFTArrXFiEz[Two_iz]; bIm = DataPtrs.FFTArrXFiEz[Two_iz_p_1]; + EzRe -= (float)(dxFi * (ExpXFiRe * bRe - ExpXFiIm * bIm)); + EzIm -= (float)(dxFi * (ExpXFiRe * bIm + ExpXFiIm * bRe)); + + if (dzSt != 0.f) + { + bRe = DataPtrs.ExpArrZSt[Two_iz], bIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + cRe = ExpXFiRe * bRe - ExpXFiIm * bIm; cIm = ExpXFiRe * bIm + ExpXFiIm * bRe; + + ExRe -= (float)(dxFi_dzSt * (fFSExRe * cRe - fFSExIm * cIm)); + ExIm -= (float)(dxFi_dzSt * (fFSExRe * cIm + fFSExIm * cRe)); + EzRe -= (float)(dxFi_dzSt * (fFSEzRe * cRe - fFSEzIm * cIm)); + EzIm -= (float)(dxFi_dzSt * (fFSEzRe * cIm + fFSEzIm * cRe)); + } + if (dzFi != 0.f) + { + bRe = DataPtrs.ExpArrZFi[Two_iz], bIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + cRe = ExpXFiRe * bRe - ExpXFiIm * bIm; cIm = ExpXFiRe * bIm + ExpXFiIm * bRe; + + ExRe += (float)(dxFi_dzFi * (fFFExRe * cRe - fFFExIm * cIm)); + ExIm += (float)(dxFi_dzFi * (fFFExRe * cIm + fFFExIm * cRe)); + EzRe += (float)(dxFi_dzFi * (fFFEzRe * cRe - fFFEzIm * cIm)); + EzIm 
+= (float)(dxFi_dzFi * (fFFEzRe * cIm + fFFEzIm * cRe)); + } + } + if (dzSt != 0.f) + { + float ExpZStRe = DataPtrs.ExpArrZSt[Two_iz], ExpZStIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + + bRe = DataPtrs.FFTArrZStEx[Two_ix]; bIm = DataPtrs.FFTArrZStEx[Two_ix_p_1]; + ExRe += (float)(dzSt * (ExpZStRe * bRe - ExpZStIm * bIm)); + ExIm += (float)(dzSt * (ExpZStRe * bIm + ExpZStIm * bRe)); + + bRe = DataPtrs.FFTArrZStEz[Two_ix]; bIm = DataPtrs.FFTArrZStEz[Two_ix_p_1]; + EzRe += (float)(DataPtrs.dzSt * (ExpZStRe * bRe - ExpZStIm * bIm)); + EzIm += (float)(DataPtrs.dzSt * (ExpZStRe * bIm + ExpZStIm * bRe)); + } + if (dzFi != 0.f) + { + float ExpZFiRe = DataPtrs.ExpArrZFi[Two_iz], ExpZFiIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + + bRe = DataPtrs.FFTArrZFiEx[Two_ix]; bIm = DataPtrs.FFTArrZFiEx[Two_ix_p_1]; + ExRe -= (float)(dzFi * (ExpZFiRe * bRe - ExpZFiIm * bIm)); + ExIm -= (float)(dzFi * (ExpZFiRe * bIm + ExpZFiIm * bRe)); + + bRe = DataPtrs.FFTArrZFiEz[Two_ix]; bIm = DataPtrs.FFTArrZFiEz[Two_ix_p_1]; + EzRe -= (float)(dzFi * (ExpZFiRe * bRe - ExpZFiIm * bIm)); + EzIm -= (float)(dzFi * (ExpZFiRe * bIm + ExpZFiIm * bRe)); + } + + *tEx = ExRe; *(tEx + 1) = ExIm; + *tEz = EzRe; *(tEz + 1) = EzIm; + } +} + +void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU) +{ + pDataEx = (float*)CAuxGPU::ToDevice(pGPU, pDataEx, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + pDataEz = (float*)CAuxGPU::ToDevice(pGPU, pDataEz, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXStEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXStEx, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXStEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXStEz, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXFiEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXFiEx, 2*RadAccessData->nz*sizeof(float)); + 
DataPtrs.FFTArrXFiEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXFiEz, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrZStEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZStEx, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZStEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZStEz, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZFiEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZFiEx, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZFiEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZFiEz, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrXSt = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrXSt, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrXFi = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrXFi, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrZSt = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrZSt, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.ExpArrZFi = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrZFi, 2*RadAccessData->nz*sizeof(float)); + + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pDataEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pDataEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXStEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXStEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXFiEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXFiEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZStEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZStEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZFiEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZFiEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrXSt); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrXFi); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrZSt); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrZFi); + + const int bs = 256; + dim3 blocks(RadAccessData->nx / bs + ((RadAccessData->nx & (bs - 1)) != 0), 
RadAccessData->nz); + dim3 threads(bs, 1); + MakeWfrEdgeCorrection_Kernel << > > (*RadAccessData, pDataEx, pDataEz, DataPtrs, (float)DataPtrs.dxSt, (float)DataPtrs.dxFi, (float)DataPtrs.dzSt, (float)DataPtrs.dzFi); + + DataPtrs.FFTArrXStEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXStEx, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXStEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXStEz, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXFiEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXFiEx, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXFiEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXFiEz, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrZStEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZStEx, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZStEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZStEz, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZFiEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZFiEx, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZFiEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZFiEz, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrXSt = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrXSt, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrXFi = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrXFi, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrZSt = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrZSt, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.ExpArrZFi = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrZFi, 2*RadAccessData->nz*sizeof(float), true); + + CAuxGPU::MarkUpdated(pGPU, pDataEx, true, false); + CAuxGPU::MarkUpdated(pGPU, pDataEz, true, false); + +#ifdef _DEBUG + CAuxGPU::ToHostAndFree(pGPU, pDataEx, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + CAuxGPU::ToHostAndFree(pGPU, pDataEz, 
2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +template __global__ void RadResizeCore_Kernel(srTSRWRadStructAccessData OldRadAccessData, srTSRWRadStructAccessData NewRadAccessData) +{ + int ixStart = int(NewRadAccessData.AuxLong1); + int ixEnd = int(NewRadAccessData.AuxLong2); + int izStart = int(NewRadAccessData.AuxLong3); + int izEnd = int(NewRadAccessData.AuxLong4); + + int ix = (blockIdx.x * blockDim.x + threadIdx.x) + ixStart; //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y) + izStart; //nz range + int ie = (blockIdx.z * blockDim.z + threadIdx.z); //ne range + + if (ix > ixEnd) return; + if (iz > izEnd) return; + + const double DistAbsTol = 1.E-10; + double xStepInvOld = 1./OldRadAccessData.xStep; + double zStepInvOld = 1./OldRadAccessData.zStep; + int nx_mi_1Old = OldRadAccessData.nx - 1; + int nz_mi_1Old = OldRadAccessData.nz - 1; + int nx_mi_2Old = nx_mi_1Old - 1; + int nz_mi_2Old = nz_mi_1Old - 1; + + //OC31102018: moved by SY at parallelizing SRW via OpenMP + //srTInterpolAux01 InterpolAux01; + //srTInterpolAux02 InterpolAux02[4], InterpolAux02I[2]; + //srTInterpolAuxF AuxF[4], AuxFI[2]; + //int ixStOld, izStOld, ixStOldPrev = -1000, izStOldPrev = -1000; + + //long PerX_New = NewRadAccessData.ne << 1; + //long PerZ_New = PerX_New*NewRadAccessData.nx; + long long PerX_New = NewRadAccessData.ne << 1; + long long PerZ_New = PerX_New*NewRadAccessData.nx; + + //long PerX_Old = PerX_New; + //long PerZ_Old = PerX_Old*OldRadAccessData.nx; + long long PerX_Old = PerX_New; + long long PerZ_Old = PerX_Old*OldRadAccessData.nx; + + float *pEX0_New = 0, *pEZ0_New = 0; + pEX0_New = NewRadAccessData.pBaseRadX; + pEZ0_New = NewRadAccessData.pBaseRadZ; + + float* pEX0_Old = 0, * pEZ0_Old = 0; + pEX0_Old = OldRadAccessData.pBaseRadX; + pEZ0_Old = OldRadAccessData.pBaseRadZ; + + + int ixStOld, izStOld, ixStOldPrev = 
-1000, izStOldPrev = -1000; + //SY: do we need this (always returns 0, updates some clock) + //if(result = srYield.Check()) return result; + + double zAbs = NewRadAccessData.zStart + iz * NewRadAccessData.zStep; + + char FieldShouldBeZeroedDueToZ = 0; + if (NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if ((zAbs < NewRadAccessData.zWfrMin - DistAbsTol) || (zAbs > NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; + } + + int izcOld = int((zAbs - OldRadAccessData.zStart) * zStepInvOld + 1.E-06); + + double zRel = zAbs - (OldRadAccessData.zStart + izcOld * OldRadAccessData.zStep); + + if (izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2. * OldRadAccessData.zStep; } + else if (izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep; } + else if (izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep; } + else izStOld = izcOld - 1; + + zRel *= zStepInvOld; + + int izcOld_mi_izStOld = izcOld - izStOld; + //long izPerZ_New = iz*PerZ_New; + long long izPerZ_New = iz * PerZ_New; + + double xAbs = NewRadAccessData.xStart + ix * NewRadAccessData.xStep; + + char FieldShouldBeZeroedDueToX = 0; + if (NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if ((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; + } + char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || FieldShouldBeZeroedDueToZ); + + int ixcOld = int((xAbs - OldRadAccessData.xStart) * xStepInvOld + 1.E-06); + double xRel = xAbs - (OldRadAccessData.xStart + ixcOld * OldRadAccessData.xStep); + + if (ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2. 
* OldRadAccessData.xStep; } + else if (ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep; } + else if (ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep; } + else ixStOld = ixcOld - 1; + + xRel *= xStepInvOld; + + int ixcOld_mi_ixStOld = ixcOld - ixStOld; + + //or (int ie = 0; ie < NewRadAccessData.ne; ie++) + { + //OC31102018: modified by SY at OpenMP parallelization + //ixStOldPrev = -1000; izStOldPrev = -1000; + + //OC31102018: moved by SY at OpenMP parallelization + srTInterpolAux01 InterpolAux01; + srTInterpolAux02 InterpolAux02[4], InterpolAux02I[2]; + srTInterpolAuxF AuxF[4], AuxFI[2]; + ixStOldPrev = -1000; izStOldPrev = -1000; + float BufF[4], BufFI[2]; + char UseLowOrderInterp_PolCompX = 0, UseLowOrderInterp_PolCompZ = 0; + + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; + + float* pEX_StartForX_New = 0, * pEZ_StartForX_New = 0; + pEX_StartForX_New = pEX0_New + izPerZ_New; + pEZ_StartForX_New = pEZ0_New + izPerZ_New; + + //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + long long ixPerX_New_p_Two_ie = ix * PerX_New + Two_ie; + float* pEX_New = 0, * pEZ_New = 0; + pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; + pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; + + //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + long long TotOffsetOld = izStOld * PerZ_Old + ixStOld * PerX_Old + Two_ie; + + if (TreatPolCompX) + { + float* pExSt_Old = pEX0_Old + TotOffsetOld; + srTGenOptElem::GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + + srTGenOptElem::SetupCellDataI(AuxF, AuxFI); + UseLowOrderInterp_PolCompX = srTGenOptElem::CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + + if (!UseLowOrderInterp_PolCompX) + { + for (int i = 0; i < 2; i++) + { + srTGenOptElem::SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + } + srTGenOptElem::SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); 
+ } + + if (UseLowOrderInterp_PolCompX) + { + srTGenOptElem::InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); + srTGenOptElem::InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + } + else + { + srTGenOptElem::InterpolF(InterpolAux02, xRel, zRel, BufF, 0); + srTGenOptElem::InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); + } + + (*BufFI) *= AuxFI->fNorm; + srTGenOptElem::ImproveReAndIm(BufF, BufFI); + + if (FieldShouldBeZeroed) + { + *BufF = 0.; *(BufF + 1) = 0.; + } + + *pEX_New = *BufF; + *(pEX_New + 1) = *(BufF + 1); + } + if (TreatPolCompZ) + { + float* pEzSt_Old = pEZ0_Old + TotOffsetOld; + srTGenOptElem::GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF + 2); + + srTGenOptElem::SetupCellDataI(AuxF + 2, AuxFI + 1); + UseLowOrderInterp_PolCompZ = srTGenOptElem::CheckForLowOrderInterp(AuxF + 2, AuxFI + 1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02 + 2, InterpolAux02I + 1); + + if (!UseLowOrderInterp_PolCompZ) + { + for (int i = 0; i < 2; i++) + { + srTGenOptElem::SetupInterpolAux02(AuxF + 2 + i, &InterpolAux01, InterpolAux02 + 2 + i); + } + srTGenOptElem::SetupInterpolAux02(AuxFI + 1, &InterpolAux01, InterpolAux02I + 1); + } + + if (UseLowOrderInterp_PolCompZ) + { + srTGenOptElem::InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); + srTGenOptElem::InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + } + else + { + srTGenOptElem::InterpolF(InterpolAux02, xRel, zRel, BufF, 2); + srTGenOptElem::InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); + } + + (*(BufFI + 1)) *= (AuxFI + 1)->fNorm; + srTGenOptElem::ImproveReAndIm(BufF + 2, BufFI + 1); + + if (FieldShouldBeZeroed) + { + *(BufF + 2) = 0.; *(BufF + 3) = 0.; + } + + *pEZ_New = *(BufF + 2); + *(pEZ_New + 1) = *(BufF + 3); + } + } +} + +int srTGenOptElem::RadResizeCore_GPU(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, char PolComp, TGPUUsageArg* pGPU) +{ + char TreatPolCompX = ((PolComp == 0) || (PolComp == 
'x')); + char TreatPolCompZ = ((PolComp == 0) || (PolComp == 'z')); + + int nx = NewRadAccessData.AuxLong2 - NewRadAccessData.AuxLong1 + 1; + int nz = NewRadAccessData.AuxLong4 - NewRadAccessData.AuxLong3 + 1; + int ne = NewRadAccessData.ne; + OldRadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, OldRadAccessData.pBaseRadX, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float)); + OldRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, OldRadAccessData.pBaseRadZ, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float)); + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), true); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), true); + + CAuxGPU::EnsureDeviceMemoryReady(pGPU, OldRadAccessData.pBaseRadX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, OldRadAccessData.pBaseRadZ); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, NewRadAccessData.pBaseRadX); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, NewRadAccessData.pBaseRadZ); + + const int bs = 32; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz, ne); + dim3 threads(bs, 1); + + if (TreatPolCompX && TreatPolCompZ) RadResizeCore_Kernel << > > (OldRadAccessData, NewRadAccessData); + else if (TreatPolCompX) RadResizeCore_Kernel << > > (OldRadAccessData, NewRadAccessData); + else if (TreatPolCompZ) RadResizeCore_Kernel << > > (OldRadAccessData, NewRadAccessData); + + OldRadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, OldRadAccessData.pBaseRadX, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float), true); + OldRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, OldRadAccessData.pBaseRadZ, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float), true); + 
//NewRadAccessData.pBaseRadX = CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float)); + //NewRadAccessData.pBaseRadZ = CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float)); + CAuxGPU::MarkUpdated(pGPU, NewRadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, NewRadAccessData.pBaseRadZ, true, false); +#ifndef _DEBUG + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, NewRadAccessData.pBaseRadX); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, NewRadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), false); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), false); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); + +#endif + + return 0; +} + +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptelm_gpu.h b/cpp/src/core/sroptelm_gpu.h new file mode 100644 index 00000000..629e0c42 --- /dev/null +++ b/cpp/src/core/sroptelm_gpu.h @@ -0,0 +1,123 @@ +/************************************************************************//** + * File: sroptelm_gpu.h + * Description: Optical element (general CUDA header) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#ifndef __SROPTELMGPU_H +#define __SROPTELMGPU_H + +#include "cuda_runtime.h" +#include +#include +#include + +#ifdef __CUDACC__ 
+template <typename T> __global__ void RadPointModifierParallel_Kernel(srTSRWRadStructAccessData RadAccessData, void* pBufVars, T* tgt_obj) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz) + { + srTEFieldPtrs EPtrs; + srTEXZ EXZ; + EXZ.z = RadAccessData.zStart + iz * RadAccessData.zStep; + EXZ.x = RadAccessData.xStart + ix * RadAccessData.xStep; + + for (int ie = 0; ie < RadAccessData.ne; ie++) { + EXZ.e = RadAccessData.eStart + ie * RadAccessData.eStep; + EXZ.aux_offset = RadAccessData.ne * RadAccessData.nx * 2 * iz + RadAccessData.ne * 2 * ix + ie * 2; + if (RadAccessData.pBaseRadX != 0) + { + EPtrs.pExRe = RadAccessData.pBaseRadX + EXZ.aux_offset; + EPtrs.pExIm = EPtrs.pExRe + 1; + } + else + { + EPtrs.pExRe = 0; + EPtrs.pExIm = 0; + } + if (RadAccessData.pBaseRadZ != 0) + { + EPtrs.pEzRe = RadAccessData.pBaseRadZ + EXZ.aux_offset; + EPtrs.pEzIm = EPtrs.pEzRe + 1; + } + else + { + EPtrs.pEzRe = 0; + EPtrs.pEzIm = 0; + } + + tgt_obj->RadPointModifierPortable(EXZ, EPtrs, pBufVars); + } + } +} + +template <typename T> int RadPointModifierParallelImpl(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, T* tgt_obj, TGPUUsageArg* pGPU) +{ + const int bs = 256; + dim3 blocks(pRadAccessData->nx / bs + ((pRadAccessData->nx & (bs - 1)) != 0), pRadAccessData->nz); + dim3 threads(bs, 1); + + if (pRadAccessData->pBaseRadX != NULL) + { + pRadAccessData->pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, pRadAccessData->pBaseRadX, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pRadAccessData->pBaseRadX); + } + if (pRadAccessData->pBaseRadZ != NULL) + { + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, pRadAccessData->pBaseRadZ, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pRadAccessData->pBaseRadZ); + } 
+ + T* local_copy = (T*)CAuxGPU::ToDevice(pGPU, tgt_obj, sizeof(T)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, local_copy); + //cudaMalloc(&local_copy, sizeof(T)); + //cudaMemcpy(local_copy, tgt_obj, sizeof(T), cudaMemcpyHostToDevice); + + void* pBufVars_dev = NULL; + if (pBufVarsSz > 0){ + pBufVars_dev = CAuxGPU::ToDevice(pGPU, pBufVars, pBufVarsSz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBufVars_dev); + } + RadPointModifierParallel_Kernel<T> <<<blocks, threads>>> (*pRadAccessData, pBufVars_dev, local_copy); + //cudaDeviceSynchronize(); + //cudaFreeAsync(local_copy, 0); + if (pBufVarsSz > 0) CAuxGPU::ToHostAndFree(pGPU, pBufVars_dev, pBufVarsSz, true); + CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(T), true); + + CAuxGPU::MarkUpdated(pGPU, pRadAccessData->pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, pRadAccessData->pBaseRadZ, true, false); + +#ifndef _DEBUG + if (pRadAccessData->pBaseRadX != NULL) + pRadAccessData->pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, pRadAccessData->pBaseRadX); + if (pRadAccessData->pBaseRadZ != NULL) + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, pRadAccessData->pBaseRadZ); +#endif + +#ifdef _DEBUG + if (pRadAccessData->pBaseRadX != NULL) + pRadAccessData->pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, pRadAccessData->pBaseRadX, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + if (pRadAccessData->pBaseRadZ != NULL) + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, pRadAccessData->pBaseRadZ, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif + + return 0; +} +#endif + +#endif //__SROPTELMGPU_H +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptgtr.cpp b/cpp/src/core/sroptgtr.cpp index 7f348172..96681032 100644 --- a/cpp/src/core/sroptgtr.cpp +++ b/cpp/src/core/sroptgtr.cpp @@ -1172,7 +1172,7 @@ int 
srTGenTransmission::DetermineFocalDistByPropag1D(srTRadSect1D& Sect1D, doubl } //************************************************************************* - +/* HG01122023 Moved to header file to reduce code duplication for GPU support void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBufVars) //OC29082019 //void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {// e in eV; Length in m !!! @@ -1338,7 +1338,7 @@ void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, voi float NewEzIm = (float)(T*((*(EPtrs.pEzRe))*SinPh + (*(EPtrs.pEzIm))*CosPh)); *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } -} +} */ //************************************************************************* diff --git a/cpp/src/core/sroptgtr.h b/cpp/src/core/sroptgtr.h index 0cde61e5..202190c3 100644 --- a/cpp/src/core/sroptgtr.h +++ b/cpp/src/core/sroptgtr.h @@ -50,6 +50,8 @@ class srTGenTransmission : public srTFocusingElem { } } + int SupportedFeatures() override { return 1; } //HG01122023 =1 means that it supports GPU propagation + void EnsureTransmissionForField(); double DetermineAppropriatePhotEnergyForFocDistTest(double Rx, double Rz); int EstimateFocalDistancesAndCheckSampling(); @@ -79,7 +81,8 @@ class srTGenTransmission : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterArr) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr, void* pvGPU) //HG01122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms 
analytically @@ -90,7 +93,8 @@ class srTGenTransmission : public srTFocusingElem { int result = 0; - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG01122023 else result = PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterArr); //if(ParPrecWfrPropag.AnalTreatment == 1) @@ -104,25 +108,30 @@ class srTGenTransmission : public srTFocusingElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE, void* pvGPU) //HG01122023 { int result; if(result = PropagateRadMoments(pRadAccessData, 0)) return result; if(result = PropagateWaveFrontRadius(pRadAccessData)) return result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 //OC01102019 (restored) - if(result = PropagateRadiationSimple(pRadAccessData)) return result; + //if(result = PropagateRadiationSimple(pRadAccessData)) return result; + if(result = PropagateRadiationSimple(pRadAccessData, pvGPU)) return result; //HG01122023 if(result = Propagate4x4PropMatr(pRadAccessData)) return result; return 0; } //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int 
PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - return TraverseRadZXE(pRadAccessData); + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG01122023 + //return TraverseRadZXE(pRadAccessData); + return TraverseRadZXE(pRadAccessData, 0, 0, pvGPU); //HG01122023 } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) { @@ -131,8 +140,186 @@ class srTGenTransmission : public srTFocusingElem { return TraverseRad1D(pSect1D); } - void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC29082019 + void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //OC29082019 //HG01122023 + { + RadPointModifierPortable(EXZ, EPtrs, pBuf); + } + //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC29082019 //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs); + + +#ifdef _OFFLOAD_GPU //HG01122023 Brought from sroptgtr.cpp, to reduce code duplication for GPU port + int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pGPU=0) override; + + GPU_PORTABLE +#endif + void RadPointModifierPortable(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBufVars) //OC29082019 + //void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) + {// e in eV; Length in m !!! + // Operates on Coord. side !!! 
+ //double xRel = EXZ.x - TransvCenPoint.x, zRel = EXZ.z - TransvCenPoint.y; + double xRel = EXZ.x, zRel = EXZ.z; //OC080311 + + long Ne = 1, Nemi2 = -1; + long iDimX = 0, iDimZ = 1; + if(GenTransNumData.AmOfDims == 3) + { + //Ne = (GenTransNumData.DimSizes)[0]; + Ne = (long)((GenTransNumData.DimSizes)[0]); //OC28042019 + Nemi2 = Ne - 2; + iDimX = 1; iDimZ = 2; + } + + //long Nx = (GenTransNumData.DimSizes)[0], Nz = (GenTransNumData.DimSizes)[1]; + //long Nx = (GenTransNumData.DimSizes)[iDimX], Nz = (GenTransNumData.DimSizes)[iDimZ]; //OC241112 + long Nx = (long)((GenTransNumData.DimSizes)[iDimX]), Nz = (long)((GenTransNumData.DimSizes)[iDimZ]); //OC28042019 + long Nxmi2 = Nx - 2, Nzmi2 = Nz - 2; + + //double xStart = (GenTransNumData.DimStartValues)[0], zStart = (GenTransNumData.DimStartValues)[1]; + //double xStep = (GenTransNumData.DimSteps)[0], zStep = (GenTransNumData.DimSteps)[1]; + double xStart = (GenTransNumData.DimStartValues)[iDimX], zStart = (GenTransNumData.DimStartValues)[iDimZ]; + double xStep = (GenTransNumData.DimSteps)[iDimX], zStep = (GenTransNumData.DimSteps)[iDimZ]; + + double xEnd = xStart + (Nx - 1)*xStep, zEnd = zStart + (Nz - 1)*zStep; + + double AbsTolX = xStep*0.001, AbsTolZ = zStep*0.001; // To steer + if(OuterTransmIs == 1) + { + if((xRel < xStart - AbsTolX) || (xRel > xEnd + AbsTolX) || (zRel < zStart - AbsTolZ) || (zRel > zEnd + AbsTolZ)) + { + if(EPtrs.pExRe != 0) { *(EPtrs.pExRe) = 0.; *(EPtrs.pExIm) = 0.;} + if(EPtrs.pEzRe != 0) { *(EPtrs.pEzRe) = 0.; *(EPtrs.pEzIm) = 0.;} + return; + } + } + + double xr = 0., zr = 0.; + double T = 1., Ph = 0.; + //char NotExactRightEdgeX = 1, NotExactRightEdgeZ = 1; + + long ix = long((xRel - xStart)/xStep); + if(::fabs(xRel - ((ix + 1)*xStep + xStart)) < 1.E-05*xStep) ix++; + + //if(ix < 0) { ix = 0; xr = 0.;} + //else if(ix > Nxmi2) { ix = Nx - 1; xr = 0.; NotExactRightEdgeX = 0;} + //else xr = (xRel - (ix*xStep + xStart))/xStep; + + if(ix < 0) ix = 0; //OC241112 + //else if(ix > Nxmi2) ix = 
Nxmi2; + //xr = (xRel - (ix*xStep + xStart))/xStep; + else if(ix > Nxmi2) { ix = Nxmi2; xr = 1.;} + else xr = (xRel - (ix*xStep + xStart))/xStep; + + long iz = long((zRel - zStart)/zStep); + if(::fabs(zRel - ((iz + 1)*zStep + zStart)) < 1.E-05*zStep) iz++; + + //if(iz < 0) { iz = 0; zr = 0.;} + //else if(iz > Nzmi2) { iz = Nz - 1; zr = 0.; NotExactRightEdgeZ = 0;} + //else zr = (zRel - (iz*zStep + zStart))/zStep; + + if(iz < 0) iz = 0; + //else if(iz > Nzmi2) iz = Nzmi2; + //zr = (zRel - (iz*zStep + zStart))/zStep; + else if(iz > Nzmi2) { iz = Nzmi2; zr = 1.;} + else zr = (zRel - (iz*zStep + zStart))/zStep; + + double xrzr = xr*zr; + if((GenTransNumData.AmOfDims == 2) || ((GenTransNumData.AmOfDims == 3) && (Ne == 1))) + { + //long zPer = Nx << 1; + long long zPer = Nx << 1; + + //DOUBLE *p00 = (DOUBLE*)(GenTransNumData.pData) + (iz*zPer + (ix << 1)); + //DOUBLE *p10 = p00 + 2, *p01 = p00 + zPer; + //DOUBLE *p11 = p01 + 2; + //DOUBLE *p00p1 = p00+1, *p10p1 = p10+1, *p01p1 = p01+1, *p11p1 = p11+1; + double *p00 = (double*)(GenTransNumData.pData) + (iz*zPer + (ix << 1)); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + double *p10 = p00 + 2, *p01 = p00 + zPer; + double *p11 = p01 + 2; + double *p00p1 = p00+1, *p10p1 = p10+1, *p01p1 = p01+1, *p11p1 = p11+1; + + //double Axz = 0., Ax = 0., Az = 0., Bxz = 0., Bx = 0., Bz = 0.; + //if(NotExactRightEdgeX && NotExactRightEdgeZ) { Axz = *p00 - *p01 - *p10 + *p11; Bxz = *p00p1 - *p01p1 - *p10p1 + *p11p1;} + //if(NotExactRightEdgeX) { Ax = (*p10 - *p00); Bx = (*p10p1 - *p00p1);} + //if(NotExactRightEdgeZ) { Az = (*p01 - *p00); Bz = (*p01p1 - *p00p1);} + + double Axz = *p00 - *p01 - *p10 + *p11, Bxz = *p00p1 - *p01p1 - *p10p1 + *p11p1; + double Ax = (*p10 - *p00), Bx = (*p10p1 - *p00p1); + double Az = (*p01 - *p00), Bz = (*p01p1 - *p00p1); + + T = Axz*xrzr + Ax*xr + Az*zr + *p00; + Ph = Bxz*xrzr + Bx*xr + Bz*zr + *p00p1; + + //OCTEST 04032019 + //T = *p00 + Ax*xr + Az*zr; + //Ph = *p00p1 + Bx*xr + Bz*zr; + + //OCTEST 
05032019 + //T = CGenMathInterp::InterpOnRegMesh2d(EXZ.x, EXZ.z, xStart, xStep, Nx, zStart, zStep, Nz, (double*)(GenTransNumData.pData), 3, 2); + //Ph = CGenMathInterp::InterpOnRegMesh2d(EXZ.x, EXZ.z, xStart, xStep, Nx, zStart, zStep, Nz, (double*)(GenTransNumData.pData) + 1, 3, 2); + //END OCTEST + } + else if(GenTransNumData.AmOfDims == 3) + {//bi-linear 3D interpolation + double eStart = (GenTransNumData.DimStartValues)[0]; + double eStep = (GenTransNumData.DimSteps)[0]; + + long ie = long((EXZ.e - eStart)/eStep + 1.e-10); + if(ie < 0) ie = 0; + else if(ie > Nemi2) ie = Nemi2; + + double er = (EXZ.e - (ie*eStep + eStart))/eStep; + //double erxr = er*xr, erzr = er*zr; + //double erxrzr = erxr*zr; + + //long xPer = Ne << 1; + //long zPer = Nx*xPer; + long long xPer = Ne << 1; + long long zPer = Nx*xPer; + //DOUBLE *p000 = (DOUBLE*)(GenTransNumData.pData) + (iz*zPer + ix*xPer + (ie << 1)); + //DOUBLE *p100 = p000 + 2, *p010 = p000 + xPer, *p001 = p000 + zPer; + //DOUBLE *p110 = p100 + xPer, *p101 = p100 + zPer, *p011 = p010 + zPer; + //DOUBLE *p111 = p110 + zPer; + double *p000 = (double*)(GenTransNumData.pData) + (iz*zPer + ix*xPer + (ie << 1)); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + double *p100 = p000 + 2, *p010 = p000 + xPer, *p001 = p000 + zPer; + double *p110 = p100 + xPer, *p101 = p100 + zPer, *p011 = p010 + zPer; + double *p111 = p110 + zPer; + + double one_mi_er = 1.- er, one_mi_xr = 1.- xr, one_mi_zr = 1.- zr; + double one_mi_er_one_mi_xr = one_mi_er*one_mi_xr, er_one_mi_xr = er*one_mi_xr; + double one_mi_er_xr = one_mi_er*xr, er_xr = er*xr; + T = ((*p000)*one_mi_er_one_mi_xr + (*p100)*er_one_mi_xr + (*p010)*one_mi_er_xr + (*p110)*er_xr)*one_mi_zr + + ((*p001)*one_mi_er_one_mi_xr + (*p101)*er_one_mi_xr + (*p011)*one_mi_er_xr + (*p111)*er_xr)*zr; + Ph = ((*(p000+1))*one_mi_er_one_mi_xr + (*(p100+1))*er_one_mi_xr + (*(p010+1))*one_mi_er_xr + (*(p110+1))*er_xr)*one_mi_zr + + ((*(p001+1))*one_mi_er_one_mi_xr + (*(p101+1))*er_one_mi_xr + 
(*(p011+1))*one_mi_er_xr + (*(p111+1))*er_xr)*zr; + + // inArFunc[] = {f(x0,y0,z0),f(x1,y0,z0),f(x0,y1,z0),f(x0,y0,z1),f(x1,y1,z0),f(x1,y0,z1),f(x0,y1,z1),f(x1,y1,z1)} //function values at the corners of the cube + //return inArFunc[0]*one_mi_xt*one_mi_yt*one_mi_zt + // + inArFunc[1]*xt*one_mi_yt*one_mi_zt + // + inArFunc[2]*one_mi_xt*yt*one_mi_zt + // + inArFunc[3]*one_mi_xt*one_mi_yt*zt + // + inArFunc[4]*xt*yt*one_mi_zt + // + inArFunc[5]*xt*one_mi_yt*zt + // + inArFunc[6]*one_mi_xt*yt*zt + // + inArFunc[7]*xt*yt*zt; + } + + if(OptPathOrPhase == 1) Ph *= EXZ.e*5.0676816042E+06; // TwoPi_d_Lambda_m + float CosPh, SinPh; CosAndSin(Ph, CosPh, SinPh); + if(EPtrs.pExRe != 0) + { + float NewExRe = (float)(T*((*(EPtrs.pExRe))*CosPh - (*(EPtrs.pExIm))*SinPh)); + float NewExIm = (float)(T*((*(EPtrs.pExRe))*SinPh + (*(EPtrs.pExIm))*CosPh)); + *(EPtrs.pExRe) = NewExRe; *(EPtrs.pExIm) = NewExIm; + } + if(EPtrs.pEzRe != 0) + { + float NewEzRe = (float)(T*((*(EPtrs.pEzRe))*CosPh - (*(EPtrs.pEzIm))*SinPh)); + float NewEzIm = (float)(T*((*(EPtrs.pEzRe))*SinPh + (*(EPtrs.pEzIm))*CosPh)); + *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; + } + } + void RadPointModifier1D(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC06092019 //void RadPointModifier1D(srTEXZ& EXZ, srTEFieldPtrs& EPtrs); diff --git a/cpp/src/core/sroptgtr_gpu.cu b/cpp/src/core/sroptgtr_gpu.cu new file mode 100644 index 00000000..9250740e --- /dev/null +++ b/cpp/src/core/sroptgtr_gpu.cu @@ -0,0 +1,32 @@ +/************************************************************************//** + * File: sroptgtr_gpu.cu + * Description: Optical element: Transmission (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "sroptgtr.h" +#include "cuda_runtime.h" 
+#include "device_launch_parameters.h" +#include "math_constants.h" + +#include +#include +#include + +int srTGenTransmission::RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, TGPUUsageArg* pGPU) +{ + GenTransNumData.pData = (char*)CAuxGPU::ToDevice(pGPU, GenTransNumData.pData, GenTransNumData.DimSizes[0] * (int)GenTransNumData.DimSizes[1] * (int)GenTransNumData.DimSizes[2] * sizeof(double) * 2); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, GenTransNumData.pData); + int retCode = RadPointModifierParallelImpl(pRadAccessData, pBufVars, pBufVarsSz, this, pGPU); + GenTransNumData.pData = (char*)CAuxGPU::ToHostAndFree(pGPU, GenTransNumData.pData, GenTransNumData.DimSizes[0] * (int)GenTransNumData.DimSizes[1] * (int)GenTransNumData.DimSizes[2] * sizeof(double) * 2, true); + return retCode; +} //HG03092022 +#endif \ No newline at end of file diff --git a/cpp/src/core/srradmnp.cpp b/cpp/src/core/srradmnp.cpp index 7522f78b..ff3597b8 100644 --- a/cpp/src/core/srradmnp.cpp +++ b/cpp/src/core/srradmnp.cpp @@ -676,8 +676,9 @@ int srTRadGenManip::ExtractSingleElecIntensity1DvsZ(srTRadExtract& RadExtract) //************************************************************************* -int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) -//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? 
+//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //HG30112023 +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, void* pvGPU) //HG02122023 { int PolCom = RadExtract.PolarizCompon; int Int_or_ReE = RadExtract.Int_or_Phase; @@ -691,7 +692,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) float *pI = 0, *pI1 = 0, *pI2 = 0, *pI3 = 0; //OC17042020 double *pId = 0, *pI1d = 0, *pI2d = 0, *pI3d = 0; long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz; - //long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; //Himanshu? + //long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; //HG30112023 //float *pI = 0; //DOUBLE *pId = 0; //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) @@ -759,185 +760,180 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long izPerZ = 0; long ix, ie; - //Himanshu? 
- //GPU_COND(pGpuUsage, - //{ - // ExtractSingleElecIntensity2DvsXZ_GPU(RadExtract, arAuxInt, ie0, ie1, InvStepRelArg, pGpuUsage); - //}) - //else - //{ - //long long iwfrPerWfr = 0; - //for(long long iwfr=0; iwfr 0) //OC08052021 + } + + if(iter == 0) //OC08052021 + { + //OC140813 + if(pI != 0) *(pI++) = (float)resInt; + if(pId != 0) *(pId++) = resInt; //OC18042020 + //if(pId != 0) *(pId++) = (double)resInt; + if(allStokesReq) //OC18042020 { - if(pI != 0) + if(RadExtract.pExtractedData != 0) { - float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); - *(pI++) = newI; + *(pI1++) = (float)resInt1; *(pI2++) = (float)resInt2; *(pI3++) = (float)resInt3; } - if(pId != 0) + else + { + *(pI1d++) = resInt1; *(pI2d++) = resInt2; *(pI3d++) = resInt3; + } + } + } + else if(iter > 0) //OC08052021 + { + if(pI != 0) + { + float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); + *(pI++) = newI; + } + if(pId != 0) + { + double newI = ((*pId)*iter + resInt)*inv_iter_p_1; + *(pId++) = newI; + } + if(allStokesReq) + { + if(RadExtract.pExtractedData != 0) { - double newI = ((*pId)*iter + resInt)*inv_iter_p_1; - *(pId++) = newI; + float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); + float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); + float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); + *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; } - if(allStokesReq) + else { - if(RadExtract.pExtractedData != 0) - { - float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); - float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); - float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); - *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; - } - else - { - double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; - double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; - double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; - *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; - } + double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; + 
double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; + double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; + *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; } } - else //OC08052021 + } + else //OC08052021 + { + if(pI != 0) *(pI++) += (float)resInt; + if(pId != 0) *(pId++) += resInt; + if(allStokesReq) { - if(pI != 0) *(pI++) += (float)resInt; - if(pId != 0) *(pId++) += resInt; - if(allStokesReq) + if(RadExtract.pExtractedData != 0) { - if(RadExtract.pExtractedData != 0) - { - *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; - } - else - { - *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; - } + *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; + } + else + { + *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; } } - - //ixPerX += PerX; - pEx_St += PerX; - pEz_St += PerX; - pEx_Fi += PerX; - pEz_Fi += PerX; } - izPerZ += PerZ; + + pEx_St += PerX; + pEz_St += PerX; + pEx_Fi += PerX; + pEz_Fi += PerX; } - //iwfrPerWfr += PerWfr; - //} - //} + izPerZ += PerZ; + } + } if(arAuxInt != 0) delete[] arAuxInt; //OC150813 return 0; } @@ -1586,8 +1582,8 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsZ(srTRadExtract& RadExtrac //************************************************************************* -int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) -//int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //Himanshu? +//int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract, void* pvGPU) //HG30112023 {//OC13122019 //This assumes "normal" data alignment in the complex "matrix" E(x,y)*E*(x',y') int res = 0; @@ -2124,13 +2120,14 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtra if(DontNeedInterp) { - //Himanshu? 
- //GPU_COND(pGpuUsage, - // { - // ExtractSingleElecMutualIntensityVsXZ_GPU(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, PolCom, EhOK, EvOK, pGpuUsage); - // }) - //else - //{ +#ifdef _OFFLOAD_GPU //HG30112023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + ExtractSingleElecMutualIntensityVsXZ_GPU(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, PolCom, EhOK, EvOK, (TGPUUsageArg*)pvGPU); + } + else +#endif + { for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) //for(long long it=0; it<=(itEnd-itStart); it++) //OC03032021 (to enable partial update of MI/CSD) //for(long long it=0; it 1); //OC18042020 double resInt, resInt1, resInt2, resInt3; @@ -74,7 +72,7 @@ __global__ void ExtractSingleElecIntensity2DvsXZ_Kernel(srTRadExtract RadExtract long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 long ie; - long offset = iwfr * PerWfr + iz * PerZ + ix * PerX; + long offset = iz * PerZ + ix * PerX; long offsetDiv2 = offset >> 1; float* pEx_StartForX = pEx0 + offset; @@ -211,30 +209,30 @@ static inline void ExtractSingleElecIntensity2DvsXZ_GPUSub(dim3 &blocks, dim3 &t } } -int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtract, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, gpuUsageArg *pGpuUsage) +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtract, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, TGPUUsageArg* pGPU) { srTSRWRadStructAccessData& RadAccessData = *((srTSRWRadStructAccessData*)(hRadAccessData.ptr())); const int bs = 256; - dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz, RadAccessData.nwfr); + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz); dim3 threads(bs, 1); if (RadAccessData.pBaseRadX != NULL) { - RadAccessData.pBaseRadX = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadX, 
2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadX); + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadX); } if (RadAccessData.pBaseRadZ != NULL) { - RadAccessData.pBaseRadZ = (float*)AuxGpu::ToDevice(pGpuUsage, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*RadAccessData.nwfr*sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, RadAccessData.pBaseRadZ); + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadZ); } - srTRadGenManip *local_copy = (srTRadGenManip*)AuxGpu::ToDevice(pGpuUsage, this, sizeof(srTRadGenManip)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, local_copy); + srTRadGenManip *local_copy = (srTRadGenManip*)CAuxGPU::ToDevice(pGPU, this, sizeof(srTRadGenManip)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, local_copy); - arAuxInt = (double*)AuxGpu::ToDevice(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, arAuxInt); + arAuxInt = (double*)CAuxGPU::ToDevice(pGPU, arAuxInt, RadAccessData.ne*sizeof(double)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, arAuxInt); bool allStokesReq = (RadExtract.PolarizCompon == -5); bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (RadAccessData.ne > 1); @@ -253,23 +251,23 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtra else ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); - AuxGpu::ToHostAndFree(pGpuUsage, local_copy, sizeof(srTRadGenManip), true); - 
AuxGpu::ToHostAndFree(pGpuUsage, arAuxInt, RadAccessData.ne*sizeof(double), true); - AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadX, true, false); - AuxGpu::MarkUpdated(pGpuUsage, RadAccessData.pBaseRadZ, true, false); + CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(srTRadGenManip), true); + CAuxGPU::ToHostAndFree(pGPU, arAuxInt, RadAccessData.ne*sizeof(double), true); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); #ifndef _DEBUG if (RadAccessData.pBaseRadX != NULL) - RadAccessData.pBaseRadX = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadX); + RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX); if (RadAccessData.pBaseRadZ != NULL) - RadAccessData.pBaseRadZ = (float*)AuxGpu::GetHostPtr(pGpuUsage, RadAccessData.pBaseRadZ); + RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ); #endif #ifdef _DEBUG if (RadAccessData.pBaseRadX != NULL) - RadAccessData.pBaseRadX = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadX, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); if (RadAccessData.pBaseRadZ != NULL) - RadAccessData.pBaseRadZ = (float*)AuxGpu::ToHostAndFree(pGpuUsage, RadAccessData.pBaseRadZ, 2 * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * RadAccessData.nwfr * sizeof(float)); + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); cudaStreamSynchronize(0); auto err = cudaGetLastError(); printf("%s\r\n", cudaGetErrorString(err)); @@ -437,19 +435,21 @@ __global__ void ExtractSingleElecMutualIntensityVsXZ_Kernel(const float* __restr } template -int 
ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* pMI0, long nx, long nz, long ne, long itStart, long itEnd, long PerX, long iter, bool EhOK, bool EvOK, TGPUUsageArg* pGPU) { + long long nxnz = ((long long)nx) * ((long long)nz); + const int itPerBlk = 1; dim3 threads = dim3(48, 16, 1); dim3 grid = dim3((nxnz + 1) / threads.x + (threads.x > 1), (nxnz / 2) / (threads.y * itPerBlk) + (threads.y > 1), 1); - pEx = (float*)AuxGpu::ToDevice(pGpuUsage, pEx, nxnz * 2 * sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEx); + pEx = (float*)CAuxGPU::ToDevice(pGPU, pEx, nxnz*2*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEx); - pEz = (float*)AuxGpu::ToDevice(pGpuUsage, pEz, nxnz * 2 * sizeof(float)); - AuxGpu::EnsureDeviceMemoryReady(pGpuUsage, pEz); + pEz = (float*)CAuxGPU::ToDevice(pGPU, pEz, nxnz*2*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEz); - pMI0 = (float*)AuxGpu::ToDevice(pGpuUsage, pMI0, (itEnd - itStart) * nxnz * 2 * sizeof(float)); + pMI0 = (float*)CAuxGPU::ToDevice(pGPU, pMI0, (itEnd - itStart)*nxnz*2*sizeof(float)); if (EhOK) { @@ -462,14 +462,14 @@ int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* p else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); } - pEx = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEx, nxnz * 2 * sizeof(float), true); - pEz = (float*)AuxGpu::ToHostAndFree(pGpuUsage, pEz, nxnz * 2 * sizeof(float), true); + pEx = (float*)CAuxGPU::ToHostAndFree(pGPU, pEx, nxnz * 2 * sizeof(float), true); + pEz = (float*)CAuxGPU::ToHostAndFree(pGPU, pEz, nxnz * 2 * sizeof(float), true); - AuxGpu::MarkUpdated(pGpuUsage, pMI0, true, false); + CAuxGPU::MarkUpdated(pGPU, pMI0, true, false); #ifdef _DEBUG if (pMI0 != NULL) - pMI0 = 
(float*)AuxGpu::ToHostAndFree(pGpuUsage, pMI0, (itEnd - itStart) * RadAccessData.ne * RadAccessData.nx * RadAccessData.nz * 2 * sizeof(float)); + pMI0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pMI0, (itEnd - itStart)*ne*nx*nz*2*sizeof(float)); cudaStreamSynchronize(0); auto err = cudaGetLastError(); @@ -478,40 +478,40 @@ int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* p return 0; } -int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ_GPU(float* pEx, float* pEz, float* pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter, int PolCom, bool EhOK, bool EvOK, gpuUsageArg* pGpuUsage) +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ_GPU(float* pEx, float* pEz, float* pMI0, long nx, long nz, long ne, long itStart, long itEnd, long PerX, long iter, int PolCom, bool EhOK, bool EvOK, TGPUUsageArg* pGPU) { if (iter > 0) { switch (PolCom) { - case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, 
PerX, iter, EhOK, EvOK, pGpuUsage); - case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 1>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 1>(pEx, pEz, pMI0, nx, nz, 
ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); } } else if (iter == 0) { switch (PolCom) { - case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); - default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 0>(pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter, EhOK, EvOK, pGpuUsage); + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 2: return 
ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); } } } diff --git a/cpp/src/core/srradstr.cpp b/cpp/src/core/srradstr.cpp index d2571a41..f7c7a87d 100644 --- a/cpp/src/core/srradstr.cpp +++ b/cpp/src/core/srradstr.cpp @@ -2700,7 +2700,8 @@ void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin() //************************************************************************* -void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) +//void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) +void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz, void* pvGPU) //HG02122023 {// sx < 0 means mirroring should be done vs x // sz < 0 means mirroring should be done vs z //long PerX = ne << 1; @@ -2711,6 +2712,14 @@ void 
srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) float *pEX0 = pBaseRadX; float *pEZ0 = pBaseRadZ; +#ifdef _OFFLOAD_GPU //HG02122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + MirrorFieldData_GPU(sx, sz, (TGPUUsageArg*)pvGPU); + return; + } +#endif + if((sx > 0) && (sz > 0)) return; //no mirroring is necessary else if((sx < 0) && (sz > 0)) //mirroring with respect to x { diff --git a/cpp/src/core/srradstr.h b/cpp/src/core/srradstr.h index def0ffb0..dee50e5f 100644 --- a/cpp/src/core/srradstr.h +++ b/cpp/src/core/srradstr.h @@ -33,7 +33,7 @@ #endif #ifdef _OFFLOAD_GPU //OC28072023 -#include "auxgpu.h" //HG +#include "auxgpu.h" //HG04122023 #endif #include "srobject.h" @@ -520,14 +520,15 @@ class srTSRWRadStructAccessData : public CGenObject { // return; //} - if(pvGPU != 0) - { + //if(pvGPU != 0) //HG02122023 Null check is already done by CAuxGPU::GPUEnabled + //{ TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; if(CAuxGPU::GPUEnabled(pGPU)) { MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + return; //HG02122023 } - } + //} #endif float *tEx = pBaseRadX; diff --git a/cpp/src/core/srradstr_gpu.cu b/cpp/src/core/srradstr_gpu.cu index 5890cbf8..658d39e0 100644 --- a/cpp/src/core/srradstr_gpu.cu +++ b/cpp/src/core/srradstr_gpu.cu @@ -21,7 +21,7 @@ #include #include "srradstr.h" -__global__ void MultiplyElFieldByPhaseLin_Kernel(double xMult, double zMult, float* pBaseRadX, float* pBaseRadZ, int nz, int nx, int ne, float zStart, float zStep, float xStart, float xStep) { +__global__ void MultiplyElFieldByPhaseLin_Kernel(double xMult, double zMult, float* pBaseRadX, float* pBaseRadZ, int nx, int nz, int ne, float xStart, float zStart, float xStep, float zStep) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range @@ -82,7 +82,7 @@ void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, doub const int bs = 256; dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); dim3 
threads(bs, 1); - MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, (float)zStart, (float)zStep, (float)xStart, (float)xStep); + MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nx, nz, ne, (float)xStart, (float)zStart, (float)xStep, (float)zStep); //MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, zStart, zStep, xStart, xStep); if (pBaseRadX != NULL) @@ -105,7 +105,7 @@ void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, doub #endif } -template __global__ void MirrorFieldData_Kernel(long ne, long nx, long nz, float* pEX0, float* pEZ0) { +template __global__ void MirrorFieldData_Kernel(long nx, long nz, long ne, float* pEX0, float* pEZ0) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range @@ -301,11 +301,11 @@ void srTSRWRadStructAccessData::MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg if ((sx > 0) && (sz > 0)) return; else if ((sx < 0) && (sz > 0)) - MirrorFieldData_Kernel<0> <<>>(ne, nx, nz, pEX0, pEZ0); + MirrorFieldData_Kernel<0> <<>>(nx, nz, ne, pEX0, pEZ0); else if ((sx > 0) && (sz < 0)) - MirrorFieldData_Kernel<1> <<>> (ne, nx, nz, pEX0, pEZ0); + MirrorFieldData_Kernel<1> <<>> (nx, nz, ne, pEX0, pEZ0); else - MirrorFieldData_Kernel<2> <<>> (ne, nx, nz, pEX0, pEZ0); + MirrorFieldData_Kernel<2> <<>> (nx, nz, ne, pEX0, pEZ0); if (pEX0 != NULL) CAuxGPU::MarkUpdated(pGPU, pEX0, true, false); //OC03082023 diff --git a/cpp/src/core/srstraux.h b/cpp/src/core/srstraux.h index ad7dec02..49aff638 100644 --- a/cpp/src/core/srstraux.h +++ b/cpp/src/core/srstraux.h @@ -203,6 +203,9 @@ struct srTStokesC { struct srTEFieldPtrs { float *pExRe, *pExIm, *pEzRe, *pEzIm; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTEFieldPtrs(float* In_pExRe =0, float* In_pExIm =0, float* In_pEzRe =0, float* In_pEzIm =0) { pExRe = In_pExRe; pExIm = In_pExIm; pEzRe = In_pEzRe; pEzIm = 
In_pEzIm; @@ -1588,6 +1591,9 @@ struct srTInterpolAux01 { double cAx2z0, cAx2z1, cAx2z2, cAx2z3, cAx3z0, cAx3z1, cAx3z2, cAx3z3; double cLAx1z0, cLAx0z1, cLAx1z1; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTInterpolAux01() { cAx0z1 = 0.1666666667; @@ -1654,10 +1660,18 @@ struct srTInterpolAuxF { float f03, f13, f23, f33; float fAvg, fNorm; + +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void SetUpAvg() { fAvg = (float)(0.0625*(f00 + f10 + f20 + f30 + f01 + f11 + f21 + f31 + f02 + f12 + f22 + f32 + f03 + f13 + f23 + f33)); } + +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void NormalizeByAvg() { const float CritNorm = 1.; @@ -1724,11 +1738,17 @@ struct srTDataPtrsForWfrEdgeCorr { double dxSt, dxFi, dzSt, dzFi, dx, dz; char WasSetup; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTDataPtrsForWfrEdgeCorr() { InitializeAll(); } +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void InitializeAll() { ExpArrXSt = ExpArrXFi = 0; @@ -1747,6 +1767,9 @@ struct srTDataPtrsForWfrEdgeCorr { } WasSetup = 0; } +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void DisposeData() { if(ExpArrXSt != 0) delete[] ExpArrXSt; diff --git a/cpp/src/ext/genmath/gmfft.cpp b/cpp/src/ext/genmath/gmfft.cpp index dbea0340..3c242adc 100644 --- a/cpp/src/ext/genmath/gmfft.cpp +++ b/cpp/src/ext/genmath/gmfft.cpp @@ -132,6 +132,23 @@ long CGenMathFFT::LenGoodNum1000s = 11; long CGenMathFFT::GoodNum10000s[] = { 0,479,636,743,830,900,960,1017,1064,1109,1150 }; long CGenMathFFT::LenGoodNum10000s = 11; +#ifdef _OFFLOAD_GPU +long CGenMathFFT1D::PlanLen; +long CGenMathFFT1D::dPlanLen; +long CGenMathFFT1D::HowMany; +long CGenMathFFT1D::dHowMany; +cufftHandle CGenMathFFT1D::Plan1DFFT_cu; +cufftHandle CGenMathFFT1D::dPlan1DFFT_cu; +#endif + +#ifdef _OFFLOAD_GPU +long CGenMathFFT2D::PlanNx; +long CGenMathFFT2D::PlanNy; +long CGenMathFFT2D::dPlanNx; +long CGenMathFFT2D::dPlanNy; +cufftHandle CGenMathFFT2D::Plan2DFFT_cu; +cufftHandle 
CGenMathFFT2D::dPlan2DFFT_cu; +#endif //************************************************************************* void CGenMathFFT::NextCorrectNumberForFFT(long& n) @@ -206,22 +223,38 @@ void CGenMathFFT::NextCorrectNumberForFFT(long& n) } //************************************************************************* - -int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG18072022 +int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 { - //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; - long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); - float* AuxDataCont = new float[TotAmOfPo]; - if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; - FFT1DInfo.pOutData = AuxDataCont; +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, + { + //HG03082022 GPU can do an inplace fft without being given a temporary buffer + FFT1DInfo.pOutData = FFT1DInfo.pInData; + int result; + if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 + //if (result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; + }//) + else +#endif + { + //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; + long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); + float* AuxDataCont = new float[TotAmOfPo]; + if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; + FFT1DInfo.pOutData = AuxDataCont; - int result; - if(result = Make1DFFT(FFT1DInfo)) return result; + int result; + if(result = Make1DFFT(FFT1DInfo)) return result; - float *tOut = FFT1DInfo.pInData, *t = AuxDataCont; - for(int ix=0; ix 0)? 
-1 : 1; @@ -345,164 +408,471 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //if(NeedsShiftBeforeY) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep); if(NeedsShiftBeforeX) {//OC02022019 - if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); + if(m_ArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); } if(NeedsShiftBeforeY) {//OC02022019 - if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); + if(m_ArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); } - if(NeedsShiftBeforeX || NeedsShiftBeforeY) + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 { - if(DataToFFT != 0) TreatShifts(DataToFFT); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + { + //GPU_COND(pvGPU, { //OC06092023 + //GPU_COND(pGpuUsage, { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = 
(float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, NeedsShiftBeforeX, NeedsShiftBeforeY, m_ArrayShiftX, m_ArrayShiftY); + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, 
NeedsShiftBeforeX, NeedsShiftBeforeY, m_dArrayShiftX, m_dArrayShiftY); + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }//) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif + } } - if(FFT2DInfo.Dir > 0) + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; + double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 + if (FFT2DInfo.Dir > 0) { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 { - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT, DataToFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) + { + if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL) + { + 
cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; - fftwf_execute(Plan2DFFT); - } - else if(dDataToFFT != 0) + auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); +// if (res != CUFFT_SUCCESS) +// printf("CUFFT Error: %d\r\n", res); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) + { + if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL) + { + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); + } + }//) + else +#endif { - if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT, dDataToFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - 
if(dPlan2DFFT == 0) return ERROR_IN_FFT; + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#if _FFTW3 //OC28012019 - fftw_execute(dPlan2DFFT); - } + for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + + fftw_execute(dPlan2DFFT); + } + } #else - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif + } - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 { - RepairSignAfter2DFFT(DataToFFT); - RotateDataAfter2DFFT(DataToFFT); - } + if (DataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + 
//RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, (float)Mult); //OC06092023 + RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, (float)Mult); //OC06092023 //HG04122023 + } + else if (dDataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); //HG04122023 + } + alreadyNormalized = true; + }//) + else +#endif + { + if (DataToFFT != 0) + { + RepairSignAfter2DFFT(DataToFFT); + RotateDataAfter2DFFT(DataToFFT); + } #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) - { - RepairSignAfter2DFFT(dDataToFFT); - RotateDataAfter2DFFT(dDataToFFT); - } + else if (dDataToFFT != 0) + { + RepairSignAfter2DFFT(dDataToFFT); + RotateDataAfter2DFFT(dDataToFFT); + } #endif + } } else { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) { + if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if (Plan2DFFT_cu != NULL){ + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + //HowMany = FFT2DInfo.howMany; //HG04122023 (Commented out) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; 
+ cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); //HG04122023 + RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); + cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) { + if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if (dPlan2DFFT_cu != NULL){ + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + //dHowMany = FFT2DInfo.howMany; //HG04122023 (Commented out) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny); + RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny); + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, 
(cufftDoubleComplex*)dDataToFFT, CUFFT_INVERSE); + } + }//) + else +#endif { - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT, DataToFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#ifdef _FFTW3 //OC28012019 + for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(DataToFFT); + RepairSignAfter2DFFT(DataToFFT); + fftwf_execute(Plan2DFFT); + } + else if (dDataToFFT != 0) + { + if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if (dPlan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(dDataToFFT); + RepairSignAfter2DFFT(dDataToFFT); + fftw_execute(dPlan2DFFT); + } + } +#else + if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; + if (Plan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(DataToFFT); RepairSignAfter2DFFT(DataToFFT); - fftwf_execute(Plan2DFFT); - } - else if(dDataToFFT != 0) - { - if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT, dDataToFFT, FFTW_BACKWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if(dPlan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(dDataToFFT); - RepairSignAfter2DFFT(dDataToFFT); - fftw_execute(dPlan2DFFT); - } -#else - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - else Plan2DFFT = 
*pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(DataToFFT); - RepairSignAfter2DFFT(DataToFFT); - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif + } } - //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; - double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep*FFT2DInfo.ExtraMult; //OC20112017 - - if(DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); + if (!alreadyNormalized){ +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + //if (DataToFFT != 0) + // NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + //else if (dDataToFFT != 0) + // NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + if (DataToFFT != 0) //HG04122023 + NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, Mult); + else if (dDataToFFT != 0) + NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); + }//) + else +#endif + { + if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); + else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); #endif + } + } //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); - if(NeedsShiftAfterX) + + if (NeedsShiftAfterX) {//OC02022019 - if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); + if (m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult 
* x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); } - if(NeedsShiftAfterY) + if (NeedsShiftAfterY) {//OC02022019 - if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); + if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); + else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); } - if(NeedsShiftAfterX || NeedsShiftAfterY) + if (NeedsShiftAfterX || NeedsShiftAfterY) { - if(DataToFFT != 0) TreatShifts(DataToFFT); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if (DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + //TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); //HG04122023 + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + 
m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if (dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + //TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); //HG04122023 + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }//) + else +#endif + { + if (DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if (dDataToFFT != 0) TreatShifts(dDataToFFT); 
//OC02022019 #endif + } } //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") //fftwnd_destroy_plan(Plan2DFFT); //OC27102018 //SY: adopted for OpenMP - -#ifdef _FFTW3 //OC28012019 - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 { - if(pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); - } - else if(dDataToFFT != 0) //OC03022019 + if (FFT2DInfo.pData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 + //CAuxGPU::MarkUpdated(pGpuUsage, DataToFFT, true, false); + } + else if (FFT2DInfo.pdData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 + //CAuxGPU::MarkUpdated(pGpuUsage, dDataToFFT, true, false); + } + }//) + else +#endif { - if(pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); - } +#if _FFTW3 //OC28012019 + if (DataToFFT != 0) + { + if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); + } + else if (dDataToFFT != 0) //OC03022019 + { + if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); + } #else - if(pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); + if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); #endif + } //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} - if(m_ArrayShiftX != 0) { delete[] m_ArrayShiftX; m_ArrayShiftX = 0;} - if(m_ArrayShiftY != 0) { delete[] m_ArrayShiftY; m_ArrayShiftY = 0;} - if(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX; m_dArrayShiftX = 0;} //OC02022019 - if(m_dArrayShiftY != 0) { delete[] m_dArrayShiftY; m_dArrayShiftY = 0;} - + if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} + if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} + if (m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} 
//OC02022019 + if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} + return 0; } //************************************************************************* //Forward FFT: Int f(x)*exp(-i*2*Pi*qx*x)dx //Backward FFT: Int f(qx)*exp(i*2*Pi*qx*x)dqx -int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG20012022 +int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC05092023 {// Assumes Nx, Ny even ! //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //double start; @@ -529,260 +899,460 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) m_ArrayShiftX = 0; m_dArrayShiftX = 0; - if(NeedsShiftBeforeX || NeedsShiftAfterX) + if (NeedsShiftBeforeX || NeedsShiftAfterX) { - if(FFT1DInfo.pInData != 0) + if (FFT1DInfo.pInData != 0) { m_ArrayShiftX = new float[Nx << 1]; - if(m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) 
+ m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 +#endif } - else if(FFT1DInfo.pdInData != 0) + else if (FFT1DInfo.pdInData != 0) { m_dArrayShiftX = new double[Nx << 1]; - if(m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 + m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //HG20012022 +#endif } } #ifdef _FFTW3 //OC28012019 fftwf_plan Plan1DFFT; - fftwf_complex *DataToFFT=0, *OutDataFFT=0; //, *pOutDataFFT=0; + fftwf_complex* DataToFFT = 0, * OutDataFFT = 0; //, *pOutDataFFT=0; fftw_plan dPlan1DFFT; - fftw_complex *dDataToFFT=0, *dOutDataFFT=0; //, *pdOutDataFFT=0; + fftw_complex* dDataToFFT = 0, * dOutDataFFT = 0; //, *pdOutDataFFT=0; +#endif - if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) +//HG20012022 +//#ifdef _DEBUG +// if (pGpuUsage != NULL) +// printf ("GPU: Make1DFFT\n"); +//#endif +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG20012022 { - DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); - OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); - //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call - } - else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 + OutDataFFT = 
(fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + //DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); + //OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); //OC06092023 + dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + //dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); + //dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + } + }//) + else +#endif { - dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); - dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); - //pdOutDataFFT = dOutDataFFT; - } +#ifdef _FFTW3 //OC28012019 + if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); + OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); + //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + } + else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); + dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); + //pdOutDataFFT = dOutDataFFT; + } #else - fftw_plan Plan1DFFT; - FFTW_COMPLEX *DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); - FFTW_COMPLEX *OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); - FFTW_COMPLEX *pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call -/** - 
Pointed-out by Sergey Yakubov (E-XFEL). - From FFTW 2.1.5 docs: - void fftw(fftw_plan plan, int howmany, - fftw_complex *in, int istride, int idist, - fftw_complex *out, int ostride, int odist); - ... - out, ostride and odist describe the output array(s). The format is the same as for the input array. - In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. - If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, - that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. - In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). -**/ -#endif - - char t0SignMult = (FFT1DInfo.Dir > 0)? -1 : 1; - if(NeedsShiftBeforeX) + fftw_plan Plan1DFFT; + FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); + FFTW_COMPLEX* OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); + FFTW_COMPLEX* pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + /** + Pointed-out by Sergey Yakubov (E-XFEL). + From FFTW 2.1.5 docs: + void fftw(fftw_plan plan, int howmany, + fftw_complex *in, int istride, int idist, + fftw_complex *out, int ostride, int odist); + ... + out, ostride and odist describe the output array(s). The format is the same as for the input array. + In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. + If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, + that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. + In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). 
+ **/ +#endif + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); + else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + char t0SignMult = (FFT1DInfo.Dir > 0) ? -1 : 1; + if (NeedsShiftBeforeX) { - //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); - if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); - if(DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); + if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + }//) + else +#endif + { + //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); + if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); + else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); + + if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); + else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); #endif + } } 
//Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : before fft",&start); int flags = FFTW_ESTIMATE; //OC30012019 + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT1DInfo.xStep; + double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; - if(FFT1DInfo.Dir > 0) + if (FFT1DInfo.Dir > 0) //HG17112021 { - //int flags = FFTW_ESTIMATE; +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + int arN[] = { (int)Nx }; //OC14052020 + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) { + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + dPlanLen = Nx; + cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); + } + }//) + else +#endif + { + //int flags = FFTW_ESTIMATE; #ifdef _FFTW3 //OC28012019 #ifdef _WITH_OMP //Still needs to be tested! 
- if(DataToFFT != 0) - { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if(dDataToFFT != 0) //OC02022019 - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } + if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) //OC02022019 + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } #endif //ifndef _WITH_OMP - - int arN[] = {(int)Nx}; //OC14052020 - //int arN[] = {Nx}; - if(DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 - if(Plan1DFFT == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if(dDataToFFT != 0) //OC02022019 - { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - if(dPlan1DFFT == 0) return ERROR_IN_FFT; - fftw_execute(dPlan1DFFT); - } + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 + if (Plan1DFFT 
== 0) return ERROR_IN_FFT; + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + fftw_execute(dPlan1DFFT); + } #else //ifndef _FFTW3 - if(DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); - if(Plan1DFFT == 0) return ERROR_IN_FFT; + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); #ifndef _WITH_OMP //OC27102018 //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) - #pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for(int i=0; i0",&start); - if(OutDataFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 { - RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - } - -#ifdef _FFTW3 //OC27022019 - else if(dOutDataFFT != 0) + if (OutDataFFT != 0) + { + 
RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 + //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + } + else if (dOutDataFFT != 0) + { + RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + } + alreadyNormalized = true; + }//) + else +#endif { - RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - } + if (OutDataFFT != 0) + { + RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + } +#ifdef _FFTW3 //OC27022019 + else if (dOutDataFFT != 0) + { + RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + } #endif + } } else { //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) -#ifdef _FFTW3 //OC28012019 -#ifdef _WITH_OMP - - //Still needs to be tested! 
- if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if(dDataToFFT != 0) - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } - -#endif - - int arN[] = {(int)Nx}; //OC14052020 - //int arN[] = {Nx}; - if(DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 - if(Plan1DFFT == 0) return ERROR_IN_FFT; + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + if (PlanLen != Nx) { + PlanLen = Nx; + HowMany = FFT1DInfo.HowMany; + if (Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); + } + else if (dDataToFFT != 0) //OC02022019 + { + if (dPlanLen != Nx) + { + dPlanLen = Nx; + dHowMany = FFT1DInfo.HowMany; + if (dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } 
+ cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if(dDataToFFT != 0) //OC02022019 + RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_INVERSE); + } + }//) + else +#endif { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - if(dPlan1DFFT == 0) return ERROR_IN_FFT; +#ifdef _FFTW3 //OC28012019 +#ifdef _WITH_OMP - RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + //Still needs to be tested! + if (DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if (dDataToFFT != 0) + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } - fftw_execute(dPlan1DFFT); - } +#endif + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if (DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 + if (Plan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + fftwf_execute(Plan1DFFT); + } + else if (dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, 
FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + if (dPlan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + fftw_execute(dPlan1DFFT); + } #else //ifndef _FFTW3 - if(DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); - if(Plan1DFFT == 0) return ERROR_IN_FFT; + if (DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); + if (Plan1DFFT == 0) return ERROR_IN_FFT; - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : repair dir<0",&start); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : repair dir<0",&start); #ifndef _WITH_OMP //OC27102018 //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) - #pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for(int i=0; i #include #include +#include "gmfft.h" #define 
GMFFT_BLOCK_SIZE 256 @@ -148,7 +149,7 @@ template __global__ void TreatShift_Kernel(T* pData, long HowMany, } } -void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -162,7 +163,7 @@ void RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) //#endif } -void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -176,7 +177,7 @@ void RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) //#endif } -void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult) +void CGenMathFFT1D::RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, float Mult) { //#ifdef _DEBUG @@ -197,7 +198,7 @@ void RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, //#endif } -void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult) +void CGenMathFFT1D::NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -212,7 +213,7 @@ void NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double //#endif } -void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) +void CGenMathFFT1D::FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -226,7 +227,7 @@ void FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) //#endif } -void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX) +void CGenMathFFT1D::TreatShift_GPU(float* pData, long 
HowMany, long Nx, float* tShiftX) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -240,7 +241,7 @@ void TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX) //#endif } -void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -254,7 +255,7 @@ void RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) //#endif } -void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +void CGenMathFFT1D::RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -268,7 +269,7 @@ void RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) //#endif } -void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +void CGenMathFFT1D::RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -283,7 +284,7 @@ void RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, //#endif } -void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +void CGenMathFFT1D::NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -297,7 +298,7 @@ void NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, doubl //#endif } -void FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) +void CGenMathFFT1D::FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0), 1); @@ -311,7 +312,7 @@ void 
FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) //#endif } -void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) +void CGenMathFFT1D::TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); @@ -326,7 +327,7 @@ void TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) } -template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, long Nx, long Ny, long Nx2Ny2, long howMany) +template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, long Nx, long Ny) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //Nx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range @@ -337,15 +338,12 @@ template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, if (ix < Nx && iy < Ny) { - for (long i=0; i __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany) +template __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range @@ -353,32 +351,29 @@ template __global__ void RotateDataAfter2DFFT_Kernel(T* pAfterFFT, if (ix < HalfNx && iy < HalfNy) { int idx = (ix + iy * Nx) * 2; - for (long i=0; i __global__ void RepairSignAndRotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, long Nx2Ny2, long howMany, T2 Mult) +template __global__ void RepairSignAndRotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, T2 Mult) { int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range @@ -396,52 +391,47 @@ template __global__ void RepairSignAndRotateDataAfter2 float s4 = sx0 * sy1 * Mult; int idx = (ix + iy * Nx); - for (long i=0; i 
__global__ void NormalizeDataAfter2DFFT_Kernel(T* pAfterFFT, long Nx2Ny2, long howMany, long n, T Mult) +template __global__ void NormalizeDataAfter2DFFT_Kernel(T* pAfterFFT, long Nx2Ny2, long n, T Mult) { int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range if (ix < Nx2Ny2) { - for (long i=0; i __global__ void TreatShift2D_Kernel(T* pData, long HowMany, long Nx2, long Ny, T* tShiftX, T* tShiftY) +template __global__ void TreatShift2D_Kernel(T* pData, long Nx2, long Ny, T* tShiftX, T* tShiftY) { int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range @@ -484,221 +474,100 @@ template __global__ void TreatS MultIm = MultY_Im; } - for (long k=0; k << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny); } -void RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany) +void CGenMathFFT2D::RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny); } -void RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, float Mult) +void CGenMathFFT2D::RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, float Mult) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((float2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((float2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Mult); } -void NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +void 
CGenMathFFT2D::NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, double Mult) { dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); dim3 threads(GMFFT_BLOCK_SIZE, 1); - NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, (float)Mult); //OC06092023 + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, 1, (float)Mult); //OC06092023 //NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); } -void TreatShifts2D_GPU(float* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY) +void CGenMathFFT2D::TreatShifts2D_GPU(float* pData, long Nx, long Ny, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY) { dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); } -void RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +void CGenMathFFT2D::RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny) { dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny, Nx * Ny * 2, howMany); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, 
Ny); } -void RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany) +void CGenMathFFT2D::RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny * 2, howMany); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny); } -void RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +void CGenMathFFT2D::RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, double Mult) { dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); dim3 threads(GMFFT_BLOCK_SIZE, 1); - RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((double2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Nx * Ny, howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((double2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Mult); } -void NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, long howMany, double Mult) +void CGenMathFFT2D::NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, double Mult) { dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); dim3 threads(GMFFT_BLOCK_SIZE, 1); - NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2,1, Mult); } -void TreatShifts2D_GPU(double* pData, long Nx, long Ny, long howMany, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY) +void CGenMathFFT2D::TreatShifts2D_GPU(double* pData, long Nx, long Ny, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY) { dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); dim3 threads(GMFFT_BLOCK_SIZE, 1); - if (NeedsShiftX && NeedsShiftY) 
TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); - else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, howMany, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); -} - -//OC06092023: looks like place if wrong here for this function, why all these functions are programmed without classes? -template __global__ void StokesAvgUpdateInterp_Kernel(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, T mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, T yStartMeshRes, T yStepMeshRes, T yStartWfr, T yStepWfr, T xStartMeshRes, T xStepMeshRes, T xStartWfr, T xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) -{ - int ix = (blockIdx.x * blockDim.x + threadIdx.x); //xNpMeshRes range - int iy = (blockIdx.y * blockDim.y + threadIdx.y); //yNpMeshRes range - int ie = (blockIdx.z * blockDim.z + threadIdx.z); //eNpMeshRes range - - if (ix >= xNpMeshRes) - return; - if (iy >= yNpMeshRes) - return; - if (ie >= eNpMeshRes) - return; - - long ir = iSt * yNpMeshRes * xNpMeshRes * eNpMeshRes + iy * xNpMeshRes * eNpMeshRes + ix * eNpMeshRes + ie; - - auto yMeshRes = yStartMeshRes + iy * yStepMeshRes; - auto xMeshRes = xStartMeshRes + ix * xStepMeshRes; - T fInterp = 0; - int loc_ix_ofst = iOfstSt + ie; - auto nx_ix_per = xNpWfr * eNpWfr; - - switch (nOrder) - { - case 1: - { - int ix0 = (int)trunc((xMeshRes - xStartWfr) / xStepWfr + 1e-09); - if ((ix0 < 0) | (ix0 >= xNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); - return; - } - int ix1 = ix0 + 1; - auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; - int iy0 = (int)trunc((yMeshRes - yStartWfr) / yStepWfr + 1e-09); - if ((iy0 < 0) | (iy0 >= yNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); - return; - } - - - int iy1 = iy0 + 1; - auto ty = (yMeshRes - (yStartWfr + 
yStepWfr * iy0)) / yStepWfr; - auto iy0_nx_ix_per = iy0 * nx_ix_per; - auto iy1_nx_ix_per = iy1 * nx_ix_per; - auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; - auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; - auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto a10 = f10 - a00; - auto a01 = f01 - a00; - auto a11 = a00 - f01 - f10 + f11; - fInterp = a00 + tx * (a10 + ty * a11) + ty * a01; - } - break; - case 2: - { - int ix0 = int(round((xMeshRes - xStartWfr) / xStepWfr)); - if ((ix0 < 0) || (ix0 >= xNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (float)(nIters + 1); - ir += 1; - return; - } - int ixm1 = ix0 - 1; - int ix1 = ix0 + 1; - auto tx = (xMeshRes - (xStartWfr + xStepWfr * ix0)) / xStepWfr; - int iy0 = int(round((yMeshRes - yStartWfr) / yStepWfr)); - if ((iy0 < 0) || (iy0 >= yNpWfr - 1)) - { - pStokesArS[ir] = pStokesArS[ir] * nIters / (nIters + 1); - ir += 1; - return; - } - int iym1 = iy0 - 1; - int iy1 = iy0 + 1; - auto ty = (yMeshRes - (yStartWfr + yStepWfr * iy0)) / yStepWfr; - auto iym1_nx_ix_per = iym1 * nx_ix_per; - auto iy0_nx_ix_per = iy0 * nx_ix_per; - auto iy1_nx_ix_per = iy1 * nx_ix_per; - auto ixm1_ix_per_p_ix_ofst = ixm1 * eNpWfr + loc_ix_ofst; - auto ix0_ix_per_p_ix_ofst = ix0 * eNpWfr + loc_ix_ofst; - auto ix1_ix_per_p_ix_ofst = ix1 * eNpWfr + loc_ix_ofst; - auto fm10 = pMoreStokesArS[iy0_nx_ix_per + ixm1_ix_per_p_ix_ofst]; - auto a00 = pMoreStokesArS[iy0_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f10 = pMoreStokesArS[iy0_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto f0m1 = pMoreStokesArS[iym1_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f01 = pMoreStokesArS[iy1_nx_ix_per + ix0_ix_per_p_ix_ofst]; - auto f11 = pMoreStokesArS[iy1_nx_ix_per + ix1_ix_per_p_ix_ofst]; - auto a10 = 0.5 * (f10 - fm10); - auto 
a01 = 0.5 * (f01 - f0m1); - auto a11 = a00 - f01 - f10 + f11; - auto a20 = 0.5 * (f10 + fm10) - a00; - auto a02 = 0.5 * (f01 + f0m1) - a00; - fInterp = a00 + tx * (a10 + tx * a20 + ty * a11) + ty * (a01 + ty * a02); - } - break; - } - - if (sum) pStokesArS[ir] += mult * fInterp; - else pStokesArS[ir] = (pStokesArS[ir] * nIters + mult * fInterp) / (nIters + 1); - return; -} - -//OC06092023: looks like place if wrong here for this function, why all these functions are programmed without classes? -void StokesAvgUpdateInterp(float* pStokesArS, float* pMoreStokesArS, int nIters, int nOrder, int nStokesComp, double mult, int iSt, long xNpMeshRes, long yNpMeshRes, long eNpMeshRes, double yStartMeshRes, double yStepMeshRes, double yStartWfr, double yStepWfr, double xStartMeshRes, double xStepMeshRes, double xStartWfr, double xStepWfr, int iOfstSt, long xNpWfr, long yNpWfr, long eNpWfr, bool sum) -{ - const int bs = 8; - dim3 threads(xNpMeshRes / bs + ((xNpMeshRes & (bs - 1)) != 0), yNpMeshRes / bs + ((yNpMeshRes & (bs - 1)) != 0), eNpMeshRes); - dim3 blocks(bs, bs, 1); - //OC06092023 (check order of variables, loop over e) - StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, (float)mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, (float)yStartMeshRes, (float)yStepMeshRes, (float)yStartWfr, (float)yStepWfr, (float)xStartMeshRes, (float)xStepMeshRes, (float)xStartWfr, (float)xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); - //StokesAvgUpdateInterp_Kernel << > > (pStokesArS, pMoreStokesArS, nIters, nOrder, nStokesComp, mult, iSt, xNpMeshRes, yNpMeshRes, eNpMeshRes, yStartMeshRes, yStepMeshRes, yStartWfr, yStepWfr, xStartMeshRes, xStepMeshRes, xStartWfr, xStepWfr, iOfstSt, xNpWfr, yNpWfr, eNpWfr, sum); + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) 
TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); } #endif \ No newline at end of file diff --git a/cpp/src/ext/genmath/gmmeth.h b/cpp/src/ext/genmath/gmmeth.h index 6388ae26..619a7d01 100644 --- a/cpp/src/ext/genmath/gmmeth.h +++ b/cpp/src/ext/genmath/gmmeth.h @@ -18,6 +18,10 @@ #include "gmobj.h" #endif +#ifdef _OFFLOAD_GPU //HG04122023 +#include "auxgpu.h" +#endif + #include "gmvect.h" #include #include @@ -163,7 +167,11 @@ class CGenMathMeth //static double Integ1D_FuncDefByArray(double* FuncArr, long Np, double Step); //static double Integ1D_FuncDefByArray(float* FuncArr, long Np, double Step); //template static double Integ1D_FuncDefByArray(T* FuncArr, long Np, double Step) +#ifdef _OFFLOAD_GPU //HG04122023 + template GPU_PORTABLE static double Integ1D_FuncDefByArray(T* FuncArr, long long Np, double Step) +#else template static double Integ1D_FuncDefByArray(T* FuncArr, long long Np, double Step) +#endif { if((FuncArr == 0) || (Np < 2) || (Step == 0)) return 0; //if(Np == 2) return (double)(0.5*(FuncArr[0] + FuncArr[1])); diff --git a/cpp/src/ext/utils/utidev.cpp b/cpp/src/ext/utils/utidev.cpp deleted file mode 100644 index 3a2057f1..00000000 --- a/cpp/src/ext/utils/utidev.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/************************************************************************//** - * File: utidev.cpp - * Description: Auxiliary utilities to support GPU management - * - * @author H.Goel - * @version 0.1 - ***************************************************************************/ - -#include -#include -#include - -#ifdef _OFFLOAD_GPU -#include -#endif - -#include "utidev.h" - -static bool isGPUAvailable = false; -static bool isGPUEnabled = false; -static bool GPUAvailabilityTested = false; -static bool deviceOffloadInitialized = false; - -static void CheckGPUAvailability() -{ -#ifdef _OFFLOAD_GPU - if (!GPUAvailabilityTested) - { - isGPUAvailable = false; - GPUAvailabilityTested = true; - int deviceCount = 0; - if 
(cudaGetDeviceCount(&deviceCount) != cudaSuccess) - return; - - if (deviceCount < 1) - return; - - isGPUAvailable = true; - } -#else - isGPUAvailable = false; - isGPUEnabled = false; - GPUAvailabilityTested = true; -#endif -} - -bool UtiDev::GPUAvailable() -{ - CheckGPUAvailability(); - return isGPUAvailable; -} - -bool UtiDev::GPUEnabled(gpuUsageArg_t *arg) -{ -#ifdef _OFFLOAD_GPU - if (arg == NULL) - return false; - if (*arg > 0) { - //if (cudaSetDevice(*arg - 1) != cudaSuccess) return false; - return GPUAvailable(); - } -#endif - return false; -} - -void UtiDev::SetGPUStatus(bool enabled) -{ - isGPUEnabled = enabled && GPUAvailable(); -} - -int UtiDev::GetDevice(gpuUsageArg_t* arg) -{ -#ifdef _OFFLOAD_GPU - if (arg == NULL) - return cudaCpuDeviceId; - - int curDevice = 0; - cudaGetDevice(&curDevice); - return curDevice; -#else - return 0; -#endif -} - -void UtiDev::Init() { - deviceOffloadInitialized = true; -#ifdef _OFFLOAD_GPU - cudaDeviceSynchronize(); -#endif -} - -void UtiDev::Fini() { -#ifdef _OFFLOAD_GPU - cudaDeviceSynchronize(); -#endif - //deviceOffloadInitialized = false; -} \ No newline at end of file diff --git a/cpp/src/ext/utils/utidev.h b/cpp/src/ext/utils/utidev.h deleted file mode 100644 index 2df059fd..00000000 --- a/cpp/src/ext/utils/utidev.h +++ /dev/null @@ -1,71 +0,0 @@ -/************************************************************************//** - * File: utidev.h - * Description: GPU offloading detection and control - * Project: Synchrotron Radiation Workshop (and possibly others) - * First release: 2022 - * - * @author H. 
Goel - * @version 0.1 - ***************************************************************************/ - -#ifndef __UTIGPU_H -#define __UTIGPU_H - -#include -#include - -#ifdef _OFFLOAD_GPU -#include -#endif - -typedef int gpuUsageArg_t; - -#define ALLOC_ARRAY(type, size) (type *)UtiDev::malloc(sizeof(type)*(size)) -#define FREE_ARRAY(x) UtiDev::free(x); x=NULL -#define ALLOC_STRUCT(type) (type *)UtiDev::malloc(sizeof(type)) -#define FREE_STRUCT(x) UtiDev::free(x); x=NULL - -#ifdef _OFFLOAD_GPU -#define GPU_ENABLED(arg) UtiDev::GPUEnabled(arg) -#define GPU_COND(arg, code) if (GPU_ENABLED(arg)) { code } -#define GPU_PORTABLE __device__ __host__ -#else -#define GPU_COND(arg, code) if(0) { } -#define GPU_ENABLED(arg) 0 -#define GPU_PORTABLE -#endif - - //************************************************************************* -class UtiDev -{ -public: - static void Init(); - static void Fini(); - static bool GPUAvailable(); //CheckGPUAvailable etc - static bool GPUEnabled(gpuUsageArg_t *arg); - static void SetGPUStatus(bool enabled); - static int GetDevice(gpuUsageArg_t* arg); - - static inline void* malloc(size_t sz) { -#ifdef _OFFLOAD_GPU - void *ptr; - auto err = cudaMallocManaged(&ptr, sz); - if (err != cudaSuccess) - printf("Allocation Failure\r\n"); - return ptr; -#else - return std::malloc(sz); -#endif - } - - static inline void free(void* ptr) { -#ifdef _OFFLOAD_GPU - cudaFree(ptr); -#else - std::free(ptr); -#endif - } -}; - -//************************************************************************* -#endif \ No newline at end of file diff --git a/cpp/src/lib/auxgpu.cpp b/cpp/src/lib/auxgpu.cpp index 02972cd3..d65db5e0 100644 --- a/cpp/src/lib/auxgpu.cpp +++ b/cpp/src/lib/auxgpu.cpp @@ -330,6 +330,8 @@ void CAuxGPU::Init() { void CAuxGPU::Fini() { #ifdef _OFFLOAD_GPU + SetGPUStatus(false); //HG30112023 Disable GPU + // Copy back all updated data bool updated = false; bool freed = false; diff --git a/cpp/src/lib/srwlib.cpp b/cpp/src/lib/srwlib.cpp index 
5bc9a324..fac92539 100644 --- a/cpp/src/lib/srwlib.cpp +++ b/cpp/src/lib/srwlib.cpp @@ -754,7 +754,7 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr //------------------------------------------------------------------------- -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj, void* pGPU) //OC26072023 +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj, void* pvGPU) //OC26072023 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double *pMeth) //OC16122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, int *pMeth) //OC13122019 @@ -800,7 +800,8 @@ EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, cha //pFldTrj = pTrjData; } - radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat); //OC23022020 + radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat, pvGPU); //HG03122023 + //radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat); //OC23022020 //radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth); //OC13122019 //radGenManip.ExtractRadiation((int)polar, (int)intType, (int)depType, wfr.Pres, e, x, y, pInt); @@ -998,7 +999,7 @@ EXP int CALL 
srwlSetRepresElecField(SRWLWfr* pWfr, char repr) //------------------------------------------------------------------------- -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pGPU) //OC26072023 (from HG) +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pvGPU) //OC26072023 (from HG) //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt) { @@ -1019,7 +1020,8 @@ EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** //srwlPrintTime("srwlPropagElecField: CheckRadStructForPropagation",&start); //if(locErNo = optCont.PropagateRadiationGuided(wfr)) return locErNo; - if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI)) return locErNo; //OC15082018 + //if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI)) return locErNo; //OC15082018 + if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI, pvGPU)) return locErNo; //OC15082018 //HG03122023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("srwlPropagElecField: PropagateRadiationGuided",&start); @@ -1052,7 +1054,7 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* //------------------------------------------------------------------------- -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU) //OC26072023 +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pvGPU) //OC26072023 //EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) { if((pcData == 0) || (arMesh == 0) || ((typeData != 'f') && (typeData != 'd')) || (nMesh < 3) || (dir == 0)) return SRWL_INCORRECT_PARAM_FOR_FFT; //OC31012019 @@ -1098,7 +1100,8 @@ EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, FFT1DInfo.UseGivenStartTrValue = 0; CGenMathFFT1D FFT1D; - if(locErNo = FFT1D.Make1DFFT(FFT1DInfo)) return locErNo; + //if(locErNo = FFT1D.Make1DFFT(FFT1DInfo)) return locErNo; + if(locErNo = FFT1D.Make1DFFT(FFT1DInfo, pvGPU)) return locErNo; //HG03122023 arMesh[0] = FFT1DInfo.xStartTr; arMesh[1] = FFT1DInfo.xStepTr; @@ -1128,7 +1131,8 @@ EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, FFT2DInfo.UseGivenStartTrValues = 0; CGenMathFFT2D FFT2D; - if(locErNo = FFT2D.Make2DFFT(FFT2DInfo)) return locErNo; + //if(locErNo = FFT2D.Make2DFFT(FFT2DInfo)) return locErNo; + if(locErNo = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return locErNo; //HG03122023 arMesh[0] = FFT2DInfo.xStartTr; arMesh[1] = FFT2DInfo.xStepTr; @@ -1546,49 +1550,57 @@ EXP int CALL srwlPropagRadMultiE(SRWLStokes* pStokes, SRWLWfr* pWfr0, SRWLOptC* //------------------------------------------------------------------------- #ifdef _OFFLOAD_GPU //OC30102023 +EXP int CALL srwlUtiGPUProc(int op, void* pvGPU) //HG04122023 +{ + if(op == 0) CAuxGPU::Fini(); + if(op == 1) CAuxGPU::Init(); + return 0; +} +/* HG30112023 EXP bool 
CALL srwlUtiGPUAvailable() //OC27072023 -//EXP bool CALL srwlAuxGpuAvailable() //HG +//EXP bool CALL srwlCAuxGPUAvailable() //HG { return CAuxGPU::GPUAvailable(); //OC05092023 - //return AuxGpu::GPUAvailable(); + //return CAuxGPU::GPUAvailable(); } //------------------------------------------------------------------------- EXP bool CALL srwlUtiGPUEnabled() //OC27072023 -//EXP bool CALL srwlAuxGpuEnabled() //HG +//EXP bool CALL srwlCAuxGPUEnabled() //HG { return CAuxGPU::GPUEnabled(nullptr); //OC05092023 - //return AuxGpu::GPUEnabled(nullptr); + //return CAuxGPU::GPUEnabled(nullptr); } //------------------------------------------------------------------------- EXP void CALL srwlUtiGPUSetStatus(bool enable) //OC27072023 -//EXP void CALL srwlAuxGpuSetStatus(bool enable) //HG +//EXP void CALL srwlCAuxGPUSetStatus(bool enable) //HG { CAuxGPU::SetGPUStatus(enable); //OC05092023 - //AuxGpu::SetGPUStatus(enable); + //CAuxGPU::SetGPUStatus(enable); } //------------------------------------------------------------------------- EXP void CALL srwlUtiGPUInit() //OC27072023 -//EXP void CALL srwlAuxGpuInit() //HG +//EXP void CALL srwlCAuxGPUInit() //HG { CAuxGPU::Init(); //OC05092023 (why void?) - //AuxGpu::Init(); + //CAuxGPU::Init(); } //------------------------------------------------------------------------- EXP void CALL srwlUtiGPUFini() //OC27072023 -//EXP void CALL srwlAuxGpuFini() //HG +//EXP void CALL srwlCAuxGPUFini() //HG { CAuxGPU::Fini(); //OC05092023 (why void?) 
- //AuxGpu::Fini(); + //CAuxGPU::Fini(); } +*/ #endif //------------------------------------------------------------------------- diff --git a/cpp/src/lib/srwlib.h b/cpp/src/lib/srwlib.h index ff81e0ff..9b73c400 100644 --- a/cpp/src/lib/srwlib.h +++ b/cpp/src/lib/srwlib.h @@ -729,10 +729,11 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr * arMeth[18]: used for mutual intensity calculaiton / update: index of first general conjugated position to start updating the mutual intensity * arMeth[19]: used for mutual intensity calculaiton / update: index of last general conjugated position to finish updating the mutual intensity * @param [in] pFldTrj auxiliary pointer to magnetic field or trajectory of central electron + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0, void* pGPU=0); //OC26072023 (from HG) +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0, void* pvGPU=0); //OC26072023 (from HG) //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y); @@ -800,10 +801,11 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr); * "Propagates" Electric Field Wavefront through Optical Elements and free spaces * @param [in, out] pWfr pointer to pre-calculated 
Wavefront structure * @param [in] pOpt pointer to container of optical elements the propagation should be done through + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pGPU=0); //OC26072023 (from HG) +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pvGPU=0); //OC26072023 (from HG) //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt); @@ -848,10 +850,11 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* * arMesh[5]: (optional) number of points of the second argument * @param [in] nMesh length of arMesh array (3 or 6 elements) * @param [in] dir direction for the FFT (>0 means forward, <0 means backward) + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pGPU=0); //OC26072023 (from HG) +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pvGPU=0); //OC26072023 (from HG) //EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); /** @@ -967,42 +970,55 @@ EXP int CALL srwlUtiUndFromMagFldTab(SRWLMagFldC* pUndCnt, SRWLMagFldC* pMagCnt, */ EXP int CALL srwlUtiUndFindMagFldInterpInds(int* arResInds, int* pnResInds, double* arGaps, double* arPhases, int nVals, double arPrecPar[5]); +#ifdef _OFFLOAD_GPU //HG30112023 +/** + * Implements GPU related operations. 
+ * @param [in] op operation to be performed: + * 0= Deinitialize GPU + * 1= Initialize GPU + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) + * @return integer error (>0) or warning (<0) code + * @see ... + */ +EXP int CALL srwlUtiGPUProc(int op, void* pvGPU=0); + /** * Checks if GPU offloading is available * @return true if available * @see ... */ -EXP bool CALL srwlUtiGPUAvailable(); //OC26072023 -//EXP bool CALL srwlCAuxGPUAvailable(); //HG +//EXP bool CALL srwlUtiGPUAvailable(); //OC26072023 +//EXP bool CALL srwlCAuxGPUAvailable(); //HG /** * Checks if GPU offloading is enabled * @return true if enabled * @see ... */ -EXP bool CALL srwlUtiGPUEnabled(); //OC26072023 -//EXP bool CALL srwlCAuxGPUEnabled(); //HG +//EXP bool CALL srwlUtiGPUEnabled(); //OC26072023 +//EXP bool CALL srwlCAuxGPUEnabled(); //HG /** * Enable/Disable GPU offloading * @see ... */ -EXP void CALL srwlUtiGPUSetStatus(bool enable); -//EXP void CALL srwlCAuxGPUSetStatus(bool enable); //HG +//EXP void CALL srwlUtiGPUSetStatus(bool enable); +//EXP void CALL srwlCAuxGPUSetStatus(bool enable); //HG /** * Initialize device offloading * @see ... */ -EXP void CALL srwlUtiGPUInit(); //OC26072023 -//EXP void CALL srwlCAuxGPUInit(); //HG +//EXP void CALL srwlUtiGPUInit(); //OC26072023 +//EXP void CALL srwlCAuxGPUInit(); //HG /** * Finalize device offloading * @see ... */ -EXP void CALL srwlUtiGPUFini(); //OC26072023 -//EXP void CALL srwlCAuxGPUFini(); //HG +//EXP void CALL srwlUtiGPUFini(); //OC26072023 +//EXP void CALL srwlCAuxGPUFini(); //HG +#endif /** * These functions were added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP diff --git a/cpp/vc/SRW.sln b/cpp/vc/SRW.sln index d62533af..57eb7848 100644 --- a/cpp/vc/SRW.sln +++ b/cpp/vc/SRW.sln @@ -1,14 +1,14 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 -VisualStudioVersion = 17.0.31912.275 +VisualStudioVersion = 17.4.33110.190 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLIB", "SRWLIB.vcxproj", "{A7E707A6-D325-42AE-A0D0-3C97C38D36A6}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientPython", "SRWLClientPython.vcxproj", "{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}" ProjectSection(ProjectDependencies) = postProject {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} = {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} EndProjectSection EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLIB", "SRWLIB.vcxproj", "{A7E707A6-D325-42AE-A0D0-3C97C38D36A6}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientIgor", "SRWLClientIgor.vcxproj", "{0D473386-2B3E-4586-8516-DD4DCF6D4E1E}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientC", "SRWLClientC.vcxproj", "{AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}" @@ -23,32 +23,32 @@ Global Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.ActiveCfg = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.Build.0 = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.ActiveCfg = Debug_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.Build.0 = Debug_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.ActiveCfg = Debug_Py3_9|x64 + 
{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.Build.0 = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.ActiveCfg = Release_Py3_9|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.Build.0 = Release_Py3_9|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.ActiveCfg = Release_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.Build.0 = Release_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.ActiveCfg = Release_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.Build.0 = Release_Py3_9|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.ActiveCfg = Debug_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.Build.0 = Debug_cuda|x64 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Win32.ActiveCfg = Debug|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Win32.Build.0 = Debug|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.ActiveCfg = Debug|x64 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.Build.0 = Debug|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.ActiveCfg = Debug_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.Build.0 = Debug_cuda|x64 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Mixed Platforms.ActiveCfg = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Mixed Platforms.Build.0 = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Win32.ActiveCfg = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Win32.Build.0 = Release|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.ActiveCfg = Release|x64 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.Build.0 = Release|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.ActiveCfg = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.Build.0 = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.ActiveCfg = Debug_Py3_3|Win32 - 
{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.Build.0 = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.ActiveCfg = Debug_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.Build.0 = Debug_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.ActiveCfg = Release_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.Build.0 = Release_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.ActiveCfg = Release_Py2x|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.Build.0 = Release_Py2x|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.ActiveCfg = Release_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.Build.0 = Release_Py3_11|x64 - {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.ActiveCfg = Release_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.Build.0 = Release_cuda|x64 + {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 + {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.Build.0 = Debug|x64 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Win32.ActiveCfg = Debug|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Win32.Build.0 = Debug|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|x64.ActiveCfg = Debug|x64 @@ -59,8 +59,8 @@ Global {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|Win32.Build.0 = Release|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|x64.ActiveCfg = Release|x64 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|x64.Build.0 = Release|x64 - {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 + 
{AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.Build.0 = Debug|x64 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Win32.ActiveCfg = Debug|Win32 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Win32.Build.0 = Debug|Win32 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|x64.ActiveCfg = Debug|x64 diff --git a/cpp/vc/SRWLClientPython.vcxproj b/cpp/vc/SRWLClientPython.vcxproj index 88a2cc49..882e3575 100644 --- a/cpp/vc/SRWLClientPython.vcxproj +++ b/cpp/vc/SRWLClientPython.vcxproj @@ -1427,7 +1427,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\clients\python\srwpy\" diff --git a/cpp/vc/SRWLIB.vcxproj b/cpp/vc/SRWLIB.vcxproj index d0a4e611..e2215a8e 100644 --- a/cpp/vc/SRWLIB.vcxproj +++ b/cpp/vc/SRWLIB.vcxproj @@ -33,6 +33,14 @@ Debug_fftw2 x64 + + Release_cuda + Win32 + + + Release_cuda + x64 + Release_omph Win32 @@ -70,7 +78,6 @@ {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} SRWLIB 10.0 - $(CUDA_PATH) @@ -85,6 +92,12 @@ false MultiByte + + StaticLibrary + v143 + false + MultiByte + StaticLibrary v143 @@ -134,6 +147,13 @@ MultiByte false + + StaticLibrary + v143 + false + MultiByte + false + StaticLibrary v143 @@ -175,7 +195,7 @@ - + @@ -185,6 +205,10 @@ + + + + @@ -217,6 +241,10 @@ + + + + @@ -295,6 +323,11 @@ $(Platform)\$(Configuration)\ srw_win32 + + $(SolutionDir) + $(Platform)\$(Configuration)\ + srw_win32 + $(SolutionDir) $(Platform)\$(Configuration)\ @@ -317,6 +350,12 @@ true srw_x64 + + $(SolutionDir) + $(Platform)\$(Configuration)\ + true + srw_x64 + $(SolutionDir) $(Platform)\$(Configuration)\ @@ -579,7 +618,7 @@ Disabled - ..\src\core;..\src\lib;..\src\ext\genmath;..\src\ext\auxparse;%(AdditionalIncludeDirectories) + ..\src\core;..\src\lib;..\src\ext\genmath;..\src\ext\auxparse;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) 
_DEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;_FFTW3;_OFFLOAD_GPU;%(PreprocessorDefinitions) EnableFastChecks MultiThreadedDebug @@ -602,13 +641,19 @@ 0x0809 - ..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;%(AdditionalDependencies) + ..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;cudart_static.lib;cufft.lib;cudadevrt.lib;%(AdditionalDependencies) srw_x64.lib + $(CUDA_PATH)\lib\x64 + + 64 + compute_60,sm_60 + _OFFLOAD_GPU;_USE_CUDA; + @@ -729,6 +774,46 @@ copy $(TargetPath) "$(SolutionDir)..\..\env\work\srw_python\lib\" + + + NDEBUG;%(PreprocessorDefinitions) + true + true + Win32 + .\Release/SRWLIB.tlb + + + OnlyExplicitInline + Neither + ..\src\lib;..\src\core;..\src\ext\genmath;..\src\ext\auxparse;%(AdditionalIncludeDirectories) + NDEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + MultiThreaded + true + true + + + + + $(IntDir) + $(IntDir)vc90.pdb + Level2 + true + Default + Default + + + NDEBUG;%(PreprocessorDefinitions) + 0x0809 + + + ..\..\ext_lib\fftw_f.lib;%(AdditionalDependencies) + srw_win32.lib + + + copy $(TargetPath) "$(SolutionDir)..\..\env\work\srw_python\lib\" + + NDEBUG;%(PreprocessorDefinitions) @@ -901,6 +986,60 @@ + + + NDEBUG;%(PreprocessorDefinitions) + true + true + X64 + .\Release/SRWLIB.tlb + + + OnlyExplicitInline + Speed + ..\src\lib;..\src\core;..\src\ext\genmath;..\src\ext\auxparse;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + NDEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;_FFTW3;_OFFLOAD_GPU;%(PreprocessorDefinitions) + true + MultiThreaded + false + true + + + + + $(IntDir) + $(IntDir)vc90.pdb + Level3 + true + + + Default + true + MaxSpeed + true + Precise + + + NDEBUG;%(PreprocessorDefinitions) + 0x0809 + + + 
..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;cudart_static.lib;cufft.lib;cudadevrt.lib;%(AdditionalDependencies) + srw_x64.lib + + + $(CUDA_PATH)\lib\x64 + + + + + + + 64 + compute_60,sm_60 + _OFFLOAD_GPU;_USE_CUDA + + NDEBUG;%(PreprocessorDefinitions) @@ -1045,6 +1184,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1097,36 +1261,13 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + + + @@ -1196,13 +1337,21 @@ - + + + + + + + + + - + \ No newline at end of file diff --git a/cpp/vc/SRWLIB.vcxproj.filters b/cpp/vc/SRWLIB.vcxproj.filters index c77ca1a2..6c69e3cc 100644 --- a/cpp/vc/SRWLIB.vcxproj.filters +++ b/cpp/vc/SRWLIB.vcxproj.filters @@ -393,8 +393,8 @@ f2c - - core + + lib @@ -614,8 +614,34 @@ lib - + + lib + + core + + core + + + + + core + + + core + + + core + + + core + + + core + + + core + \ No newline at end of file From 06c2cea71d7a7a41b7cf240c33d5e1cf7362b544 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Mon, 4 Dec 2023 04:42:16 -0500 Subject: [PATCH 3/9] Fix virtual function parameters. 
--- cpp/src/core/sroptang.h | 5 +++-- cpp/src/core/sroptapt.h | 4 ++-- cpp/src/core/sroptcryst.h | 2 +- cpp/src/core/sroptdrf.h | 4 ++-- cpp/src/core/sroptfoc.h | 12 ++++++++---- cpp/src/core/sroptgrat.h | 3 ++- cpp/src/core/sroptgtr.h | 4 ++-- cpp/src/core/sropthck.h | 6 ++++-- cpp/src/core/sroptpsh.h | 12 ++++++++---- cpp/src/core/sroptsmr.h | 6 ++++-- cpp/src/core/sroptwgr.h | 6 ++++-- cpp/src/core/sroptzp.h | 15 ++++++++++----- cpp/src/core/sroptzps.h | 12 ++++++++---- 13 files changed, 58 insertions(+), 33 deletions(-) diff --git a/cpp/src/core/sroptang.h b/cpp/src/core/sroptang.h index d0370a26..2294731c 100644 --- a/cpp/src/core/sroptang.h +++ b/cpp/src/core/sroptang.h @@ -31,7 +31,7 @@ class srTOptAngle : public srTGenOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU) //virtual //HG30112023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual //HG30112023 { //return PropagateRadiationMeth_0(pRadAccessData); int res = 0; @@ -135,7 +135,8 @@ class srTOptShift : public srTGenOptElem { ShiftY = InShiftY; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual //HG04122023 { //return PropagateRadiationMeth_0(pRadAccessData); int res = 0; 
diff --git a/cpp/src/core/sroptapt.h b/cpp/src/core/sroptapt.h index e5f22dac..98f6598c 100644 --- a/cpp/src/core/sroptapt.h +++ b/cpp/src/core/sroptapt.h @@ -34,7 +34,7 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU) //HG30112023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG30112023 { char &MethNo = ParPrecWfrPropag.MethNo; @@ -82,7 +82,7 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG30112023 { int result; //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; diff --git a/cpp/src/core/sroptcryst.h b/cpp/src/core/sroptcryst.h index af6f535f..ee0ef80b 100644 --- a/cpp/src/core/sroptcryst.h +++ b/cpp/src/core/sroptcryst.h @@ -944,7 +944,7 @@ class srTOptCryst : public srTGenOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& 
ResBeforeAndAfterVect, void* pvGPU) //virtual in srTGenOptElem //HG01122023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual in srTGenOptElem //HG01122023 { m_eStartAux = pRadAccessData->eStart; m_eStepAux = pRadAccessData->eStep; m_ne = pRadAccessData->ne; //required for RadPointModifier diff --git a/cpp/src/core/sroptdrf.h b/cpp/src/core/sroptdrf.h index a6a16d20..c3a7509a 100644 --- a/cpp/src/core/sroptdrf.h +++ b/cpp/src/core/sroptdrf.h @@ -179,7 +179,7 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG01122023 + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU=0) //HG01122023 {//it works for many photon energies too! 
int result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 @@ -312,7 +312,7 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG01122023 { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //char LocalPropMode = pBufVars->LocalPropMode; //OC06092019 diff --git a/cpp/src/core/sroptfoc.h b/cpp/src/core/sroptfoc.h index d2a05579..f950a775 100644 --- a/cpp/src/core/sroptfoc.h +++ b/cpp/src/core/sroptfoc.h @@ -153,7 +153,8 @@ class srTThinLens : public srTFocusingElem { srTThinLens() {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -197,11 +198,14 @@ class srTThinLens : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int 
PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 return 0; } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) diff --git a/cpp/src/core/sroptgrat.h b/cpp/src/core/sroptgrat.h index 487f805f..2c761663 100644 --- a/cpp/src/core/sroptgrat.h +++ b/cpp/src/core/sroptgrat.h @@ -96,7 +96,8 @@ class srTGrating : public srTShapedOptElem { m_PropWfrInPlace = true; //OC151008 //previous electric field is NOT necessary for the propagation } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //char &MethNo = ParPrecWfrPropag.MethNo; SetupPropBufVars_Gen(pRadAccessData); diff --git a/cpp/src/core/sroptgtr.h b/cpp/src/core/sroptgtr.h index 202190c3..34052e34 100644 --- a/cpp/src/core/sroptgtr.h +++ b/cpp/src/core/sroptgtr.h @@ -82,7 +82,7 @@ class srTGenTransmission : public srTFocusingElem { //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterArr) //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, 
srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr, void* pvGPU) //HG01122023 + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr, void* pvGPU=0) //HG01122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -125,7 +125,7 @@ class srTGenTransmission : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG01122023 { int result; //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; diff --git a/cpp/src/core/sropthck.h b/cpp/src/core/sropthck.h index 45f09323..cc7fe350 100644 --- a/cpp/src/core/sropthck.h +++ b/cpp/src/core/sropthck.h @@ -167,7 +167,8 @@ class srTMirror : public srTFocusingElem { //return true; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual in srTGenOptElem //HG04122023 { m_ParPrecWfrPropag = ParPrecWfrPropag; //store for use in a composite prapagator (through drif space, etc.) 
@@ -206,7 +207,8 @@ class srTMirror : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { if(m_propMeth == 1) return PropagateRadiationSimple_ThinElem(pRadAccessData); else if(m_propMeth == 2) return PropagateRadiationSimple_LocRayTracing(pRadAccessData); diff --git a/cpp/src/core/sroptpsh.h b/cpp/src/core/sroptpsh.h index ab0ac787..181df8df 100644 --- a/cpp/src/core/sroptpsh.h +++ b/cpp/src/core/sroptpsh.h @@ -75,7 +75,8 @@ class srTPhaseShift : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { char &MethNo = ParPrecWfrPropag.MethNo; @@ -86,7 +87,8 @@ class srTPhaseShift : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; srTWaveAccessData PhShWaveAccessData; @@ -94,8 +96,10 @@ class srTPhaseShift 
: public srTFocusingElem { //tPhaseShiftData = (DOUBLE*)(PhShWaveAccessData.pWaveData); tPhaseShiftData = (double*)(PhShWaveAccessData.pWaveData); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 //srTSend Send; //if(result = Send.FinishWorkingWithWave(&PhShWaveAccessData)) return result; diff --git a/cpp/src/core/sroptsmr.h b/cpp/src/core/sroptsmr.h index 9d36eb82..f0d1e6b1 100644 --- a/cpp/src/core/sroptsmr.h +++ b/cpp/src/core/sroptsmr.h @@ -67,10 +67,12 @@ class srTSpherMirror : public srTFocusingElem { void SetupSpherMirrorApprox(); //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, MethNo, ResBeforeAndAfterVect); - if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); + //if(UseSpherMirrorApprox) 
return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); + if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect, pvGPU); //HG04122023 else { char &MethNo = ParPrecWfrPropag.MethNo; diff --git a/cpp/src/core/sroptwgr.h b/cpp/src/core/sroptwgr.h index c9be6164..580b97d3 100644 --- a/cpp/src/core/sroptwgr.h +++ b/cpp/src/core/sroptwgr.h @@ -134,7 +134,8 @@ class srTWaveguideRect : public srTShapedOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResizeBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //Checks current sampling "resolution" in hor. and vert. directions //Makes necessary sampling for propag. through the waveguide (fit the waveguide with approx. 
the same resolution, include all harmonics until the cut-off) @@ -151,7 +152,8 @@ class srTWaveguideRect : public srTShapedOptElem { if(result = PropagateRadiationSimple_AngRepres(&AuxWfrData)) return result; srTRectAperture RectAp(Dx, Dz, TransvCenPoint.x, TransvCenPoint.y); - if(result = RectAp.TraverseRadZXE(&AuxWfrData)) return result; + //if(result = RectAp.TraverseRadZXE(&AuxWfrData)) return result; + if(result = RectAp.TraverseRadZXE(&AuxWfrData, 0, 0, pvGPU)) return result; //HG04122023 if(result = CopyElecFieldDataForOut(AuxWfrData, *pRadAccessData)) return result; AuxWfrData.DeleteElecFieldArrays(); //deletes Ex, Ez only diff --git a/cpp/src/core/sroptzp.h b/cpp/src/core/sroptzp.h index 813974de..68ae1ee7 100644 --- a/cpp/src/core/sroptzp.h +++ b/cpp/src/core/sroptzp.h @@ -100,7 +100,8 @@ class srTZonePlate : public srTFocusingElem { srTZonePlate() {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -111,7 +112,8 @@ class srTZonePlate : public srTFocusingElem { int result = 0; - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG04122023 //else return PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); else result = PropagateRadiationMeth_2(pRadAccessData, 
ParPrecWfrPropag, ResBeforeAndAfterVect); @@ -125,11 +127,14 @@ class srTZonePlate : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 return 0; } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) diff --git a/cpp/src/core/sroptzps.h b/cpp/src/core/sroptzps.h index 792d02c6..e5409814 100644 --- a/cpp/src/core/sroptzps.h +++ b/cpp/src/core/sroptzps.h @@ -80,7 +80,8 @@ class srTZonePlateSpec : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { char &MethNo = ParPrecWfrPropag.MethNo; //if(MethNo == 2) return 
PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); @@ -90,11 +91,14 @@ class srTZonePlateSpec : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - return TraverseRadZXE(pRadAccessData); + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //return TraverseRadZXE(pRadAccessData); + return TraverseRadZXE(pRadAccessData, 0, 0, pvGPU); //HG04122023 } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) { From 85b7c5cc6c4564dcec99509ef8d246f5a5c9723a Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 12:07:04 -0500 Subject: [PATCH 4/9] Add GPU related code for srwlpy.cpp --- cpp/src/clients/python/srwlpy.cpp | 75 ++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/cpp/src/clients/python/srwlpy.cpp b/cpp/src/clients/python/srwlpy.cpp index f320cfc5..217fbe95 100644 --- a/cpp/src/clients/python/srwlpy.cpp +++ b/cpp/src/clients/python/srwlpy.cpp @@ -26,6 +26,10 @@ #include #include //OCTEST_161214 +#ifdef _OFFLOAD_GPU //HG30112023 +#include "auxgpu.h" +#endif + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //#include @@ -3319,6 +3323,22 @@ void ParseSructSmpObj3D(double**& arObjShapeDefs, int& nObj3D, PyObject* oListSh } } +#ifdef _OFFLOAD_GPU //HG30112023 +/************************************************************************//** + * Convert Python device specification to C++ structure. + ***************************************************************************/ +void ParseDeviceParam(PyObject* oDev, gpuUsageArg *pGpuUsage) //HG10202021 Convert Python device specification to C++ structure +{ + if (oDev != 0) { + if (PyLong_Check(oDev)) { + pGpuUsage->deviceIndex = _PyLong_AsInt(oDev); + return; + } + } + pGpuUsage->deviceIndex = 0; +} +#endif + /************************************************************************//** * Updates Py List by numbers ***************************************************************************/ @@ -4617,18 +4637,24 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) { //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0; //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0; - PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0; //OC23022020 + //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0; //OC23022020 + PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0, *oDev=0; //HG03012024 vector vBuf; SRWLWfr wfr; SRWLMagFldC *pMagCnt=0; //OC23022020 SRWLPrtTrj *pPrtTrj=0; +#ifdef _OFFLOAD_GPU //HG30112023 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { //if(!PyArg_ParseTuple(args, "OOOOOOOO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY)) throw strEr_BadArg_CalcIntFromElecField; //if(!PyArg_ParseTuple(args, "OOOOOOOO|O:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, 
&oDepType, &oE, &oX, &oY, &oMeth)) throw strEr_BadArg_CalcIntFromElecField; //OC13122019 //if(!PyArg_ParseTuple(args, "OOOOOOOO|O:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC23022020 - if(!PyArg_ParseTuple(args, "OOOOOOOO|OO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC03032021 (just formally corrected, according to number of arguments) + //if(!PyArg_ParseTuple(args, "OOOOOOOO|OO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC03032021 (just formally corrected, according to number of arguments) + if(!PyArg_ParseTuple(args, "OOOOOOOO|OOO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj, &oDev)) throw strEr_BadArg_CalcIntFromElecField; //HG03012024 if((oInt == 0) || (oWfr == 0) || (oPol == 0) || (oIntType == 0) || (oDepType == 0) || (oE == 0) || (oX == 0) || (oY == 0)) throw strEr_BadArg_CalcIntFromElecField; //char *arInt = (char*)GetPyArrayBuf(oInt, vBuf, PyBUF_WRITABLE, 0); @@ -4691,7 +4717,13 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) //ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y)); //ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth)); //OC13122019 + +#ifdef _OFFLOAD_GPU //HG30112023 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth, pFldTrj, (void*)&gpu)); +#else ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth, pFldTrj)); //OC23022020 +#endif } catch(const char* erText) { @@ -4700,6 +4732,9 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) oInt = 0; } +#ifdef _OFFLOAD_GPU //HG30112023 + 
srwlUtiGPUProc(0); //to free GPU +#endif if(pMagCnt != 0) DeallocMagCntArrays(pMagCnt); ReleasePyBuffers(vBuf); EraseElementFromMap(&wfr, gmWfrPyPtr); @@ -4932,7 +4967,8 @@ static PyObject* srwlpy_SetRepresElecField(PyObject *self, PyObject *args) static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) { //PyObject *oWfr=0, *oOptCnt=0; - PyObject *oWfr=0, *oOptCnt=0, *oInt=0; //OC14082018 + //PyObject *oWfr=0, *oOptCnt=0, *oInt=0; //OC14082018 + PyObject *oWfr=0, *oOptCnt=0, *oInt=0, *oDev=0; //Hg03012024 vector vBuf; SRWLWfr wfr; @@ -4945,10 +4981,15 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) //float **arInts=0; char **arInts=0; +#ifdef _OFFLOAD_GPU //HG03012024 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { //if(!PyArg_ParseTuple(args, "OO:PropagElecField", &oWfr, &oOptCnt)) throw strEr_BadArg_PropagElecField; - if(!PyArg_ParseTuple(args, "OO|O:PropagElecField", &oWfr, &oOptCnt, &oInt)) throw strEr_BadArg_PropagElecField; //OC14082018 + //if(!PyArg_ParseTuple(args, "OO|O:PropagElecField", &oWfr, &oOptCnt, &oInt)) throw strEr_BadArg_PropagElecField; //OC14082018 + if(!PyArg_ParseTuple(args, "OO|OO:PropagElecField", &oWfr, &oOptCnt, &oInt, &oDev)) throw strEr_BadArg_PropagElecField; //HG03012024 if((oWfr == 0) || (oOptCnt == 0)) throw strEr_BadArg_PropagElecField; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -4981,7 +5022,12 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) } //ProcRes(srwlPropagElecField(&wfr, &optCnt)); +#ifdef _OFFLOAD_GPU //HG03012024 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlPropagElecField(&wfr, &optCnt, nInt, arIntDescr, arIntMesh, arInts, (void*)&gpu)); +#else ProcRes(srwlPropagElecField(&wfr, &optCnt, nInt, arIntDescr, arIntMesh, arInts)); //OC15082018 +#endif //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":srwlpy_PropagElecField :srwlPropagElecField", &start); @@ -5002,6 +5048,9 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) //PyErr_PrintEx(1); oWfr = 0; } +#ifdef _OFFLOAD_GPU //HG03012024 + srwlUtiGPUProc(0); //to free GPU +#endif DeallocOptCntArrays(&optCnt); ReleasePyBuffers(vBuf); @@ -5102,12 +5151,18 @@ static PyObject* srwlpy_CalcTransm(PyObject* self, PyObject* args) //HG27012021 ***************************************************************************/ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) { - PyObject *oData=0, *oMesh=0, *oDir=0; + //PyObject *oData=0, *oMesh=0, *oDir=0; + PyObject *oData=0, *oMesh=0, *oDir=0, *oDev=0; //HG03012024 vector vBuf; +#ifdef _OFFLOAD_GPU //HG03012024 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { - if(!PyArg_ParseTuple(args, "OOO:UtiFFT", &oData, &oMesh, &oDir)) throw strEr_BadArg_UtiFFT; + //if(!PyArg_ParseTuple(args, "OOO:UtiFFT", &oData, &oMesh, &oDir)) throw strEr_BadArg_UtiFFT; + if(!PyArg_ParseTuple(args, "OOO|O:UtiFFT", &oData, &oMesh, &oDir, &oDev)) throw strEr_BadArg_UtiFFT; //HG03012024 if((oData == 0) || (oMesh == 0) || (oDir == 0)) throw strEr_BadArg_UtiFFT; //int sizeVectBuf = (int)vBuf.size(); @@ -5143,7 +5198,12 @@ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) if(!PyNumber_Check(oDir)) throw strEr_BadArg_UtiFFT; int dir = (int)PyLong_AsLong(oDir); +#ifdef _OFFLOAD_GPU //HG03012024 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlUtiFFT(pcData, typeData, arMesh, nMesh, dir, (void*)&gpu)); +#else ProcRes(srwlUtiFFT(pcData, typeData, arMesh, nMesh, dir)); +#endif if(meshArType == 'l') UpdatePyListNum(oMesh, arMesh, nMesh); //04092016 } @@ -5153,6 +5213,9 @@ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) //if(vBuf.size() > 0) ReleasePyBuffers(vBuf); oData = 0; oMesh = 0; oDir = 0; } +#ifdef _OFFLOAD_GPU //HG03012024 + srwlUtiGPUProc(0); //to 
free GPU +#endif ReleasePyBuffers(vBuf); From 4a3a0be9e0cd93a80e5389c85d7e961c26db6a82 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 12:17:05 -0500 Subject: [PATCH 5/9] Fix gmfft formatting --- cpp/src/ext/genmath/gmfft.cpp | 368 +++++++++++++++++----------------- 1 file changed, 184 insertions(+), 184 deletions(-) diff --git a/cpp/src/ext/genmath/gmfft.cpp b/cpp/src/ext/genmath/gmfft.cpp index 3c242adc..43639845 100644 --- a/cpp/src/ext/genmath/gmfft.cpp +++ b/cpp/src/ext/genmath/gmfft.cpp @@ -228,15 +228,15 @@ void CGenMathFFT::NextCorrectNumberForFFT(long& n) int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, { //HG03082022 GPU can do an inplace fft without being given a temporary buffer FFT1DInfo.pOutData = FFT1DInfo.pInData; int result; - if (result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 - //if (result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; + if(result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 + //if(result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; }//) else #endif @@ -329,34 +329,34 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //ArrayShiftX = 0; ArrayShiftY = 0; m_ArrayShiftX = 0; m_ArrayShiftY = 0; //OC02022019 m_dArrayShiftX = 0; m_dArrayShiftY = 0; - if (FFT2DInfo.pData != 0) + if(FFT2DInfo.pData != 0) { - if (NeedsShiftBeforeX || NeedsShiftAfterX) + if(NeedsShiftBeforeX || NeedsShiftAfterX) { //ArrayShiftX = new float[Nx << 1]; //if(ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; 
} - if (NeedsShiftBeforeY || NeedsShiftAfterY) + if(NeedsShiftBeforeY || NeedsShiftAfterY) { //ArrayShiftY = new float[Ny << 1]; //if(ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; m_ArrayShiftY = new float[Ny << 1]; - if (m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_ArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; } } - else if (FFT2DInfo.pdData != 0) + else if(FFT2DInfo.pdData != 0) { - if (NeedsShiftBeforeX || NeedsShiftAfterX) + if(NeedsShiftBeforeX || NeedsShiftAfterX) { m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; } - if (NeedsShiftBeforeY || NeedsShiftAfterY) + if(NeedsShiftBeforeY || NeedsShiftAfterY) { m_dArrayShiftY = new double[Ny << 1]; - if (m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_dArrayShiftY == 0) return MEMORY_ALLOCATION_FAILURE; } } @@ -374,7 +374,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //#endif #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG02112021 { @@ -393,8 +393,8 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea #endif { #if _FFTW3 //OC28012019 - if (FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); - else if (FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 + if(FFT2DInfo.pData != 0) DataToFFT = (fftwf_complex*)(FFT2DInfo.pData); + else if(FFT2DInfo.pdData != 0) dDataToFFT = (fftw_complex*)(FFT2DInfo.pdData); //OC02022019 #else fftwnd_plan Plan2DFFT; @@ -422,21 +422,21 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) 
CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + if(DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 + else if(dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); #endif - if (NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 + if(NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 { //GPU_COND(pvGPU, { //OC06092023 //GPU_COND(pGpuUsage, { TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { + if(DataToFFT != 0) { m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); @@ -451,7 +451,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); } - else if (dDataToFFT != 0) { + else if(dDataToFFT != 0) { m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); @@ -470,10 +470,10 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& 
FFT2DInfo, fftwnd_plan* pPrecrea else #endif { - if (DataToFFT != 0) TreatShifts(DataToFFT); + if(DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif } } @@ -481,22 +481,22 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea bool alreadyNormalized = false; //HG17032022 //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 - if (FFT2DInfo.Dir > 0) + if(FFT2DInfo.Dir > 0) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG02112021 { - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) + if(pPrecreatedPlan2DFFT == 0) { - if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) { - if (Plan2DFFT_cu != NULL) + if(Plan2DFFT_cu != NULL) { cufftDestroy(Plan2DFFT_cu); Plan2DFFT_cu = NULL; @@ -511,21 +511,21 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan2DFFT_cu == 0) return ERROR_IN_FFT; auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); // if (res != CUFFT_SUCCESS) // printf("CUFFT Error: %d\r\n", res); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { 
- if (pdPrecreatedPlan2DFFT == 0) + if(pdPrecreatedPlan2DFFT == 0) { - if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) { - if (dPlan2DFFT_cu != NULL) + if(dPlan2DFFT_cu != NULL) { cufftDestroy(dPlan2DFFT_cu); dPlan2DFFT_cu = NULL; @@ -540,7 +540,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan2DFFT_cu == 0) return ERROR_IN_FFT; cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); } @@ -556,38 +556,38 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) { long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; fftwf_execute(Plan2DFFT); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); else dPlan2DFFT = 
*pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; + if(dPlan2DFFT == 0) return ERROR_IN_FFT; fftw_execute(dPlan2DFFT); } } #else - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { - if (DataToFFT != 0) + if(DataToFFT != 0) { //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -595,7 +595,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, (float)Mult); //OC06092023 RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, (float)Mult); //OC06092023 //HG04122023 } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -607,14 +607,14 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea else #endif { - if (DataToFFT != 0) + if(DataToFFT != 0) { RepairSignAfter2DFFT(DataToFFT); RotateDataAfter2DFFT(DataToFFT); } #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { RepairSignAfter2DFFT(dDataToFFT); RotateDataAfter2DFFT(dDataToFFT); @@ -625,18 +625,18 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, 
fftwnd_plan* pPrecrea else { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) { - if ((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + if(pPrecreatedPlan2DFFT == 0) { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) { - if (Plan2DFFT_cu != NULL){ + if(Plan2DFFT_cu != NULL){ cufftDestroy(Plan2DFFT_cu); Plan2DFFT_cu = NULL; } @@ -651,7 +651,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; - if (Plan2DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan2DFFT_cu == 0) return ERROR_IN_FFT; //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -659,14 +659,14 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { - if (pdPrecreatedPlan2DFFT == 0) { - if ((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + if(pdPrecreatedPlan2DFFT == 0) { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 //if 
(dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) { - if (dPlan2DFFT_cu != NULL){ + if(dPlan2DFFT_cu != NULL){ cufftDestroy(dPlan2DFFT_cu); dPlan2DFFT_cu = NULL; } @@ -680,7 +680,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; - if (dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan2DFFT_cu == 0) return ERROR_IN_FFT; //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); @@ -699,29 +699,29 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) { long iFFT = Nx * Ny * iHowMany; - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(DataToFFT); RepairSignAfter2DFFT(DataToFFT); fftwf_execute(Plan2DFFT); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { - if (pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if (dPlan2DFFT == 0) return ERROR_IN_FFT; + if(dPlan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(dDataToFFT); RepairSignAfter2DFFT(dDataToFFT); fftw_execute(dPlan2DFFT); } } #else - if (pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, 
Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); else Plan2DFFT = *pPrecreatedPlan2DFFT; - if (Plan2DFFT == 0) return ERROR_IN_FFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(DataToFFT); RepairSignAfter2DFFT(DataToFFT); fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); @@ -729,9 +729,9 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } - if (!alreadyNormalized){ + if(!alreadyNormalized){ #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { @@ -739,18 +739,18 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea // NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); //else if (dDataToFFT != 0) // NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); - if (DataToFFT != 0) //HG04122023 + if(DataToFFT != 0) //HG04122023 NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, Mult); - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); }//) else #endif { - if (DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); + if(DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); + else if(dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); #endif } } @@ -758,25 +758,25 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); - if (NeedsShiftAfterX) + if(NeedsShiftAfterX) 
{//OC02022019 - if (m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); } - if (NeedsShiftAfterY) + if(NeedsShiftAfterY) {//OC02022019 - if (m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); - else if (m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); + if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); } - if (NeedsShiftAfterX || NeedsShiftAfterY) + if(NeedsShiftAfterX || NeedsShiftAfterY) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG18072022 { TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; - if (DataToFFT != 0) { + if(DataToFFT != 0) { m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); @@ -792,7 +792,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); } - else if (dDataToFFT != 0) { + else 
if(dDataToFFT != 0) { m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); @@ -812,10 +812,10 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea else #endif { - if (DataToFFT != 0) TreatShifts(DataToFFT); + if(DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif } } @@ -825,16 +825,16 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //OC27102018 //SY: adopted for OpenMP #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG02112021 { - if (FFT2DInfo.pData != 0) + if(FFT2DInfo.pData != 0) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, DataToFFT, true, false); } - else if (FFT2DInfo.pdData != 0) + else if(FFT2DInfo.pdData != 0) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, dDataToFFT, true, false); @@ -844,25 +844,25 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea #endif { #if _FFTW3 //OC28012019 - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); + if(pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); } - else if (dDataToFFT != 0) //OC03022019 + else if(dDataToFFT != 0) //OC03022019 { - if (pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); + if(pdPrecreatedPlan2DFFT == 0) 
fftw_destroy_plan(dPlan2DFFT); } #else - if (pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); + if(pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); #endif } //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} - if (m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} - if (m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} - if (m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 - if (m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} + if(m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} + if(m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} + if(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 + if(m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} return 0; } @@ -899,22 +899,22 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 m_ArrayShiftX = 0; m_dArrayShiftX = 0; - if (NeedsShiftBeforeX || NeedsShiftAfterX) + if(NeedsShiftBeforeX || NeedsShiftAfterX) { - if (FFT1DInfo.pInData != 0) + if(FFT1DInfo.pInData != 0) { m_ArrayShiftX = new float[Nx << 1]; - if (m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; #ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) 
m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 #endif } - else if (FFT1DInfo.pdInData != 0) + else if(FFT1DInfo.pdInData != 0) { m_dArrayShiftX = new double[Nx << 1]; - if (m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + if(m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; #ifdef _OFFLOAD_GPU //OC05092023 m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); @@ -937,18 +937,18 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 // printf ("GPU: Make1DFFT\n"); //#endif #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //OC06092023 //GPU_COND(pGpuUsage, //HG20012022 { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) { DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); //DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) { dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * 
sizeof(double)); //OC06092023 dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); @@ -960,13 +960,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif { #ifdef _FFTW3 //OC28012019 - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) { DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) { dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); @@ -994,37 +994,37 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); - else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + if(DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); + else if(dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); #endif char t0SignMult = (FFT1DInfo.Dir > 0) ? 
-1 : 1; - if (NeedsShiftBeforeX) + if(NeedsShiftBeforeX) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); - if (DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + if(DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if(dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); }//) else #endif { //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); - if (DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); + if(DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); #ifdef _FFTW3 //OC27022019 - else if (dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); + else if(dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); #endif } } @@ -1037,32 +1037,32 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& 
FFT1DInfo, void* pvGPU) //OC0509 //double Mult = FFT1DInfo.xStep; double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; - if (FFT1DInfo.Dir > 0) //HG17112021 + if(FFT1DInfo.Dir > 0) //HG17112021 { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, { int arN[] = { (int)Nx }; //OC14052020 - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (PlanLen != Nx) { + if(PlanLen != Nx) { PlanLen = Nx; - if (Plan1DFFT_cu != NULL) + if(Plan1DFFT_cu != NULL) { cufftDestroy(Plan1DFFT_cu); Plan1DFFT_cu = NULL; } cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan1DFFT_cu == 0) return ERROR_IN_FFT; cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { - if (dPlanLen != Nx) { - if (dPlan1DFFT_cu != NULL) + if(dPlanLen != Nx) { + if(dPlan1DFFT_cu != NULL) { cufftDestroy(dPlan1DFFT_cu); dPlan1DFFT_cu = NULL; @@ -1070,7 +1070,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 dPlanLen = Nx; cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan1DFFT_cu == 0) return ERROR_IN_FFT; cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); } }//) @@ -1081,13 +1081,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #ifdef _FFTW3 //OC28012019 #ifdef _WITH_OMP //Still needs to be tested! 
- if (DataToFFT != 0) + if(DataToFFT != 0) { fftwf_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available fftwf_plan_with_nthreads(nthreads); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { fftw_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available @@ -1096,28 +1096,28 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif //ifndef _WITH_OMP int arN[] = { (int)Nx }; //OC14052020 //int arN[] = {Nx}; - if (DataToFFT != 0) + if(DataToFFT != 0) { //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; fftwf_execute(Plan1DFFT); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; + if(dPlan1DFFT == 0) return ERROR_IN_FFT; fftw_execute(dPlan1DFFT); } #else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) + if(DataToFFT == OutDataFFT) { flags |= FFTW_IN_PLACE; pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) } Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); @@ -1131,7 +1131,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 for (int i = 0; i < FFT1DInfo.HowMany; i++) { //SY: do not use OutDataFFT as scratch space if in-place - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + if(DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); } #endif @@ -1141,18 +1141,18 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 //srwlPrintTime("::Make1DFFT : fft dir>0",&start); #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { - if (OutDataFFT != 0) + if(OutDataFFT != 0) { RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); } - else if (dOutDataFFT != 0) + else if(dOutDataFFT != 0) { RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); @@ -1163,13 +1163,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 else #endif { - if (OutDataFFT != 0) + if(OutDataFFT != 0) { RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); } #ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) + else if(dOutDataFFT != 0) { RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); 
@@ -1181,44 +1181,44 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 { //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { int arN[] = { (int)Nx }; //OC14052020 //int arN[] = {Nx}; - if (DataToFFT != 0) + if(DataToFFT != 0) { - if (PlanLen != Nx) { + if(PlanLen != Nx) { PlanLen = Nx; HowMany = FFT1DInfo.HowMany; - if (Plan1DFFT_cu != NULL) + if(Plan1DFFT_cu != NULL) { cufftDestroy(Plan1DFFT_cu); Plan1DFFT_cu = NULL; } cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); } - if (Plan1DFFT_cu == 0) return ERROR_IN_FFT; + if(Plan1DFFT_cu == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { - if (dPlanLen != Nx) + if(dPlanLen != Nx) { dPlanLen = Nx; dHowMany = FFT1DInfo.HowMany; - if (dPlan1DFFT_cu != NULL) + if(dPlan1DFFT_cu != NULL) { cufftDestroy(dPlan1DFFT_cu); dPlan1DFFT_cu = NULL; } cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); } - if (dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + if(dPlan1DFFT_cu == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); @@ -1232,13 +1232,13 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #ifdef _WITH_OMP //Still needs to be tested! 
- if (DataToFFT != 0) + if(DataToFFT != 0) { fftwf_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available fftwf_plan_with_nthreads(nthreads); } - else if (dDataToFFT != 0) + else if(dDataToFFT != 0) { fftw_init_threads(); //initialize threading support int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available @@ -1248,32 +1248,32 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif int arN[] = { (int)Nx }; //OC14052020 //int arN[] = {Nx}; - if (DataToFFT != 0) + if(DataToFFT != 0) { //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); fftwf_execute(Plan1DFFT); } - else if (dDataToFFT != 0) //OC02022019 + else if(dDataToFFT != 0) //OC02022019 { dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - if (dPlan1DFFT == 0) return ERROR_IN_FFT; + if(dPlan1DFFT == 0) return ERROR_IN_FFT; RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); fftw_execute(dPlan1DFFT); } #else //ifndef _FFTW3 - if (DataToFFT == OutDataFFT) + if(DataToFFT == OutDataFFT) { flags |= FFTW_IN_PLACE; pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) } Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); - if (Plan1DFFT == 0) return ERROR_IN_FFT; + if(Plan1DFFT == 0) return ERROR_IN_FFT; //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); @@ -1289,10 +1289,10 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) -#pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) +#pragma omp parallel for if(omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) for (int i = 0; i < FFT1DInfo.HowMany; i++) { - if (DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); + if(DataToFFT == OutDataFFT) fftw_one(Plan1DFFT, DataToFFT + i * Nx, 0); else fftw_one(Plan1DFFT, DataToFFT + i * Nx, OutDataFFT + i * Nx); } #endif @@ -1302,25 +1302,25 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 //srwlPrintTime("::Make1DFFT : fft dir<0",&start); } - if (!alreadyNormalized) + if(!alreadyNormalized) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, { - if (OutDataFFT != 0) { + if(OutDataFFT != 0) { NormalizeDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); } - else if (dOutDataFFT != 0) + else if(dOutDataFFT != 0) NormalizeDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); }//) else #endif { - if (OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); + if(OutDataFFT != 0) NormalizeDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany, Mult); #ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); + else if(dOutDataFFT != 0) NormalizeDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany, Mult); #endif } } @@ -1328,29 +1328,29 @@ int 
CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : NormalizeDataAfter1DFFT",&start); - if (NeedsShiftAfterX) + if(NeedsShiftAfterX) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, { - if (m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_ArrayShiftX); //OC02022019 + else if(m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_After, FFT1DInfo.xStepTr, Nx, m_dArrayShiftX); - if (OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); - else if (dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + if(OutDataFFT != 0) TreatShift_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if(dOutDataFFT != 0) TreatShift_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); }//) else #endif { //FillArrayShift(t0SignMult*x0_After, FFT1DInfo.xStepTr); - if (m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 - else if (m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_ArrayShiftX); //OC02022019 + else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_After, FFT1DInfo.xStepTr, m_dArrayShiftX); - if (OutDataFFT != 0) TreatShift(OutDataFFT, FFT1DInfo.HowMany); + if(OutDataFFT != 0) TreatShift(OutDataFFT, 
FFT1DInfo.HowMany); #ifdef _FFTW3 //OC27022019 - else if (dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); + else if(dOutDataFFT != 0) TreatShift(dOutDataFFT, FFT1DInfo.HowMany); #endif } } @@ -1362,16 +1362,16 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 } #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) - if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 //GPU_COND(pvGPU, //GPU_COND(pGpuUsage, //HG20012022 { - if ((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, OutDataFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, OutDataFFT, true, false); } - else if ((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) { CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dOutDataFFT, true, false); //OC06092023 //CAuxGPU::MarkUpdated(pGpuUsage, dOutDataFFT, true, false); @@ -1403,7 +1403,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif } - if (m_ArrayShiftX != 0) + if(m_ArrayShiftX != 0) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 @@ -1411,7 +1411,7 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC0509 #endif delete[] m_ArrayShiftX; } - if (m_dArrayShiftX != 0) + if(m_dArrayShiftX != 0) { #ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 From 5ecedf46cffb099de11917072a325006bd49a63b Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 12:23:54 
-0500 Subject: [PATCH 6/9] Restore deleted change tag. --- cpp/src/ext/genmath/gmfft.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/ext/genmath/gmfft.cpp b/cpp/src/ext/genmath/gmfft.cpp index 43639845..58b766a2 100644 --- a/cpp/src/ext/genmath/gmfft.cpp +++ b/cpp/src/ext/genmath/gmfft.cpp @@ -360,7 +360,7 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea } } -#ifdef _FFTW3 +#ifdef _FFTW3 //OC28012019 fftwf_plan Plan2DFFT; fftw_plan dPlan2DFFT; fftwf_complex* DataToFFT = 0; From 44cce0811cf90d7c371c0f9ff19c48467bad09f5 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 13:43:51 -0500 Subject: [PATCH 7/9] Update ParseDeviceParam definition --- cpp/src/clients/python/srwlpy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/clients/python/srwlpy.cpp b/cpp/src/clients/python/srwlpy.cpp index 217fbe95..d568aafc 100644 --- a/cpp/src/clients/python/srwlpy.cpp +++ b/cpp/src/clients/python/srwlpy.cpp @@ -3327,15 +3327,15 @@ void ParseSructSmpObj3D(double**& arObjShapeDefs, int& nObj3D, PyObject* oListSh /************************************************************************//** * Convert Python device specification to C++ structure. ***************************************************************************/ -void ParseDeviceParam(PyObject* oDev, gpuUsageArg *pGpuUsage) //HG10202021 Convert Python device specification to C++ structure +void ParseDeviceParam(PyObject* oDev, TGPUUsageArg* pGpu) //HG10202021 Convert Python device specification to C++ structure { if (oDev != 0) { if (PyLong_Check(oDev)) { - pGpuUsage->deviceIndex = _PyLong_AsInt(oDev); + pGpu->deviceIndex = _PyLong_AsInt(oDev); return; } } - pGpuUsage->deviceIndex = 0; + pGpu->deviceIndex = 0; } #endif From 863ce5718ca988c715bdc9e83045bf67ff8f9ade Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 13:56:04 -0500 Subject: [PATCH 8/9] Update SRWLClientPython copy paths. 
--- cpp/vc/SRWLClientPython.vcxproj | 58 ++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/cpp/vc/SRWLClientPython.vcxproj b/cpp/vc/SRWLClientPython.vcxproj index 882e3575..25826e85 100644 --- a/cpp/vc/SRWLClientPython.vcxproj +++ b/cpp/vc/SRWLClientPython.vcxproj @@ -648,7 +648,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -674,7 +674,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -701,7 +701,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -728,7 +728,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -755,7 +755,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -782,7 +782,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -809,7 +809,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + 
$(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -854,7 +854,7 @@ Default - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -874,7 +874,7 @@ ..\..\..\Python35_x64\libs\python35.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -902,7 +902,7 @@ ..\..\..\Python36_x64\libs\python36.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -930,7 +930,7 @@ ..\..\..\Python38_x64\libs\python38.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -958,7 +958,7 @@ ..\..\..\Python39_x64\libs\python39.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1014,7 +1014,7 @@ ..\..\..\Python37_x64\libs\python37.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1052,7 +1052,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1083,7 +1083,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1114,7 +1114,7 @@ false - copy $(SolutionDir)srwlpy.pyd 
"$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1145,7 +1145,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1176,7 +1176,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1207,7 +1207,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1238,7 +1238,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1275,7 +1275,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1313,7 +1313,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1351,7 +1351,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1389,7 +1389,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1465,7 +1465,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1520,7 +1520,7 @@ ..\..\..\Python32\libs\python32.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1548,7 +1548,7 @@ 
..\..\..\Python27_x64\libs\python27.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1582,7 +1582,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1616,7 +1616,7 @@ MachineX64 - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" From 02637c5e57fd3d14a1b089584cb70baecd3a6877 Mon Sep 17 00:00:00 2001 From: Himanshu Goel Date: Thu, 4 Jan 2024 14:04:01 -0500 Subject: [PATCH 9/9] Update SRWLClientPython project. --- cpp/vc/SRWLClientPython.vcxproj | 202 +++++++++++++++++++++++++++ cpp/vc/SRWLClientPython.vcxproj.user | 18 +++ 2 files changed, 220 insertions(+) diff --git a/cpp/vc/SRWLClientPython.vcxproj b/cpp/vc/SRWLClientPython.vcxproj index 25826e85..ed3d9959 100644 --- a/cpp/vc/SRWLClientPython.vcxproj +++ b/cpp/vc/SRWLClientPython.vcxproj @@ -9,6 +9,14 @@ Debug_Py2x x64 + + Debug_Py3_11_cuda + Win32 + + + Debug_Py3_11_cuda + x64 + Debug_Py3_11 Win32 @@ -73,6 +81,14 @@ Release_Py2x x64 + + Release_Py3_11_cuda + Win32 + + + Release_Py3_11_cuda + x64 + Release_Py3_11 Win32 @@ -190,6 +206,12 @@ Unicode true + + DynamicLibrary + v143 + Unicode + true + DynamicLibrary v143 @@ -220,6 +242,11 @@ v143 Unicode + + DynamicLibrary + v143 + Unicode + DynamicLibrary v143 @@ -283,6 +310,13 @@ true false + + DynamicLibrary + v143 + Unicode + true + false + DynamicLibrary v143 @@ -317,6 +351,12 @@ Unicode false + + DynamicLibrary + v143 + Unicode + false + DynamicLibrary v143 @@ -353,6 +393,9 @@ + + + @@ -371,6 +414,9 @@ + + + @@ -401,6 +447,9 @@ + + + @@ -419,6 +468,9 @@ + + + @@ -456,6 +508,11 @@ $(Platform)\$(Configuration)\ true + + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + 
$(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ @@ -503,6 +560,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + true + srwlpy + .pyd + $(ProjectDir) $(Platform)\$(Configuration)\ @@ -559,6 +623,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + false + srwlpy + .pyd + $(ProjectDir) $(Platform)\$(Configuration)\ @@ -608,6 +679,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + false + srwlpy + .pyd + $(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ @@ -794,6 +872,33 @@ + + + Disabled + ..\src\lib;..\..\..\Python33\include;%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_WINDOWS;_USRDLL;SRWLIB_CLIENT;SRWLCLIENTPYTHON_EXPORTS;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + + + ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + MachineX86 + false + + + + + + Disabled @@ -997,6 +1102,34 @@ + + + X64 + + + Disabled + ..\src\lib;..\..\..\Python311_x64\include;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + WIN32;_OFFLOAD_GPU;_DEBUG;_WINDOWS;_USRDLL;SRWLIB_CLIENT;SRWLCLIENTPYTHON_EXPORTS;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + NotUsing + Level3 + ProgramDatabase + + + ..\..\..\Python311_x64\libs\python311.lib;$(CUDA_PATH)\lib\x64\cudart_static.lib;$(CUDA_PATH)\lib\x64\cudadevrt.lib;srw_x64.lib;%(AdditionalDependencies) + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd + LIBCMT;%(IgnoreSpecificDefaultLibraries) + true + Windows + MachineX64 + + + + + + X64 @@ -1241,6 +1374,37 @@ copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + + MaxSpeed + false + ..\src\lib;..\..\..\Python36\include;%(AdditionalIncludeDirectories) + 
WIN32;NDEBUG;_WINDOWS;_USRDLL;SRWLCLIENTPYTHON_EXPORTS;SRWLIB_CLIENT;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + false + Default + true + + + ..\..\..\Python36\libs\python36.lib;srw_win32.lib;%(AdditionalDependencies) + srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + true + true + MachineX86 + false + + + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + X64 @@ -1506,6 +1670,44 @@ copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + + X64 + + + MaxSpeed + false + ..\src\lib;..\..\..\Python311_x64\include;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + WIN32;_OFFLOAD_GPU;NDEBUG;_WINDOWS;_USRDLL;SRWLCLIENTPYTHON_EXPORTS;SRWLIB_CLIENT;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + MultiThreaded + false + + + Level3 + None + Speed + OnlyExplicitInline + true + false + true + Precise + + + ..\..\..\Python311_x64\libs\python311.lib;$(CUDA_PATH)\lib\x64\cudart_static.lib;$(CUDA_PATH)\lib\x64\cudadevrt.lib;srw_x64.lib;%(AdditionalDependencies) + srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + true + true + MachineX64 + Default + srwlpy.pgd + + + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + Disabled diff --git a/cpp/vc/SRWLClientPython.vcxproj.user b/cpp/vc/SRWLClientPython.vcxproj.user index 0e4ca8e2..c029e307 100644 --- a/cpp/vc/SRWLClientPython.vcxproj.user +++ b/cpp/vc/SRWLClientPython.vcxproj.user @@ -36,6 +36,12 @@ ..\..\env\work\srw_python WindowsLocalDebugger + + C:\SoftwareDevelopments\Python39_x64\python.exe + ELETTRA-CDI-Source-Test-Tandem-350-eV.py + ..\..\env\work\srw_python + WindowsLocalDebugger + ..\..\Python37_x64\python.exe SRWLIB_Example04_test_mi4d_resize_mesh.py @@ -84,6 +90,12 @@ split-delay-test-vcc.py ..\..\env\work\srw_python + + C:\SoftwareDevelopments\Python38_x64\python.exe + WindowsLocalDebugger + split-delay-test-vcc.py + 
..\..\env\work\srw_python + C:\SoftwareDevelopments\Python27_x64\python.exe WindowsLocalDebugger @@ -114,6 +126,12 @@ ..\..\env\work\srw_python WindowsLocalDebugger + + python + test_hdf5_convert.py + ..\..\env\work\srw_python + WindowsLocalDebugger + python smf-preliminary-03-an-2d-test-01.py