diff --git a/cpp/gcc/Makefile b/cpp/gcc/Makefile index 69864471..23797d0f 100644 --- a/cpp/gcc/Makefile +++ b/cpp/gcc/Makefile @@ -27,6 +27,12 @@ else endif endif +# HG30112023 +CUDA_PATH ?= /usr/local/cuda +CUDA_MATHLIBS_PATH ?= /usr/local/cuda +NVCC = $(CUDA_PATH)/bin/nvcc +NVCXX = $(CUDA_PATH)/bin/nvc++ + SRW_SRC_DEF= -D_GNU_SOURCE -D__USE_XOPEN2K8 -DFFTW_ENABLE_FLOAT -D_GM_WITHOUT_BASE -DSRWLIB_STATIC -DNO_TIMER -DANSI_DECLARATORS -DTRILIBRARY $(OSFLAG) SRW_INCLUDES= -I$(SRW_SRC_GEN_DIR) -I$(SRW_SRC_LIB_DIR) -I$(SH_SRC_PARSE_DIR) -I$(SH_SRC_GEN_MATH_DIR) $(SRW_SRC_DEF) SRW_CFLAGS= -O3 -fPIC @@ -35,6 +41,17 @@ LDFLAGS=-L$(LIB_DIR) -lm ifeq ($(MODE), omp) SRW_CFLAGS+= -D_WITH_OMP -fopenmp -Wno-write-strings LDFLAGS+= -lfftw +else #HG30112023 +ifeq ($(MODE), cuda) +CUDA_INCLUDES = -I$(CUDA_PATH)/include -I$(CUDA_MATHLIBS_PATH)/include +CUDA_LIBS = -L$(CUDA_PATH)/lib64 -L$(CUDA_MATHLIBS_PATH)/lib64 + +SRW_SRC_DEF += -D_OFFLOAD_GPU -DUSE_CUDA -D_FFTW3 +SRW_INCLUDES += $(CUDA_INCLUDES) +SRW_CFLAGS += -std=c++17 +LDFLAGS += $(CUDA_LIBS) -lcudart_static -lcudadevrt -lcufft -lrt +NVCFLAGS = -O3 -arch=sm_80 -dlto -rdc=true +CUDA_OBJ=gmfft_gpu.o srradstr_gpu.o sroptelm_gpu.o sroptdrf_gpu.o sroptgtr_gpu.o srradmnp_gpu.o else ifeq ($(MODE), 0) SRW_CFLAGS+= -D_FFTW3 @@ -43,6 +60,7 @@ else $(error Unknown SRW compilation option) endif endif +endif PYFLAGS=-I$(shell python -c "from __future__ import print_function; from sysconfig import get_paths as gp; print(gp()['include'])") PYFLAGS+=-L$(shell python -c "from __future__ import print_function; from sysconfig import get_paths as gp; import os; print(os.path.join(gp()['stdlib'], '../libs'))") @@ -71,6 +89,10 @@ OBJ += timerec.o track.o srerror.o # src/lib OBJ += srwlib.o +# HG30112023 +ifeq ($(MODE), cuda) +OBJ += auxgpu.o +endif PRG= libsrw.a @@ -89,6 +111,24 @@ PRG= libsrw.a %.o: $(SRW_SRC_GENESIS_DIR)/%.c $(CC) $(CFLAGS) -c $< +# HG30112023 +ifeq ($(MODE), cuda) +lib: $(CUDA_OBJ) $(OBJ) + $(NVCC) $(NVCFLAGS) 
-Xcompiler="$(SRW_CFLAGS)" -dlink -o srwl_link.o *.o $(LDFLAGS) + ar -cvq $(PRG) *.o + #cp $(PRG) $(LIB_DIR)/ + rm -f *.o + +%.o: $(SRW_SRC_LIB_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +%.o: $(SH_SRC_GEN_MATH_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +%.o: $(SRW_SRC_GEN_DIR)/%.cu + $(NVCC) -dc $(NVCFLAGS) $(SRW_INCLUDES) $(SRW_SRC_DEF) -Xcompiler="$(CFLAGS)" -c $< + +else lib: $(OBJ) ar -cvq $(PRG) *.o #cp $(PRG) $(LIB_DIR)/ @@ -102,6 +142,7 @@ lib: $(OBJ) %.o: $(SRW_SRC_GEN_DIR)/%.cu $(NVCC) -x=c++ -Xcompiler="$(CFLAGS)" -c $< +endif pylib: $(CXX) -shared $(CFLAGS) $(PYFLAGS) -o srwlpy.so $(SRW_SRC_DIR)/clients/python/srwlpy.cpp libsrw.a $(LDFLAGS) diff --git a/cpp/py/setup.py b/cpp/py/setup.py index 013e8824..075520e1 100644 --- a/cpp/py/setup.py +++ b/cpp/py/setup.py @@ -20,7 +20,11 @@ if 'MODE' in os.environ: sMode = str(os.environ['MODE']) - if sMode == 'omp': + if sMode == 'cuda': # HG30112023 + ext_kwargs.update({'libraries': ['srw', 'm', 'cudart_static', 'cudadevrt', 'cufft', 'fftw3f', 'fftw3', 'rt'], 'extra_compile_args': ['-O3', '-mavx2', '-fno-math-errno']}) + ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_PATH'])) + ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_MATHLIBS_PATH'])) + elif sMode == 'omp': #ext_kwargs.update({'extra_link_args': ['-fopenmp'], ext_kwargs.update({'libraries': ['srw', 'm', 'fftw'], #OC07022019 'extra_link_args': ['-fopenmp'], diff --git a/cpp/src/clients/python/srwlpy.cpp b/cpp/src/clients/python/srwlpy.cpp index f320cfc5..d568aafc 100644 --- a/cpp/src/clients/python/srwlpy.cpp +++ b/cpp/src/clients/python/srwlpy.cpp @@ -26,6 +26,10 @@ #include #include //OCTEST_161214 +#ifdef _OFFLOAD_GPU //HG30112023 +#include "auxgpu.h" +#endif + //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //#include @@ -3319,6 +3323,22 @@ void ParseSructSmpObj3D(double**& arObjShapeDefs, int& nObj3D, PyObject* oListSh } } +#ifdef _OFFLOAD_GPU //HG30112023 +/************************************************************************//** + * Convert Python device specification to C++ structure. + ***************************************************************************/ +void ParseDeviceParam(PyObject* oDev, TGPUUsageArg* pGpu) //HG10202021 Convert Python device specification to C++ structure +{ + if (oDev != 0) { + if (PyLong_Check(oDev)) { + pGpu->deviceIndex = _PyLong_AsInt(oDev); + return; + } + } + pGpu->deviceIndex = 0; +} +#endif + /************************************************************************//** * Updates Py List by numbers ***************************************************************************/ @@ -4617,18 +4637,24 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) { //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0; //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0; - PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0; //OC23022020 + //PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0; //OC23022020 + PyObject *oInt=0, *oWfr=0, *oPol=0, *oIntType=0, *oDepType=0, *oE=0, *oX=0, *oY=0, *oMeth=0, *oFldTrj=0, *oDev=0; //HG03012024 vector vBuf; SRWLWfr wfr; SRWLMagFldC *pMagCnt=0; //OC23022020 SRWLPrtTrj *pPrtTrj=0; +#ifdef _OFFLOAD_GPU //HG30112023 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { //if(!PyArg_ParseTuple(args, "OOOOOOOO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY)) throw strEr_BadArg_CalcIntFromElecField; //if(!PyArg_ParseTuple(args, "OOOOOOOO|O:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, 
&oX, &oY, &oMeth)) throw strEr_BadArg_CalcIntFromElecField; //OC13122019 //if(!PyArg_ParseTuple(args, "OOOOOOOO|O:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC23022020 - if(!PyArg_ParseTuple(args, "OOOOOOOO|OO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC03032021 (just formally corrected, according to number of arguments) + //if(!PyArg_ParseTuple(args, "OOOOOOOO|OO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj)) throw strEr_BadArg_CalcIntFromElecField; //OC03032021 (just formally corrected, according to number of arguments) + if(!PyArg_ParseTuple(args, "OOOOOOOO|OOO:CalcIntFromElecField", &oInt, &oWfr, &oPol, &oIntType, &oDepType, &oE, &oX, &oY, &oMeth, &oFldTrj, &oDev)) throw strEr_BadArg_CalcIntFromElecField; //HG03012024 if((oInt == 0) || (oWfr == 0) || (oPol == 0) || (oIntType == 0) || (oDepType == 0) || (oE == 0) || (oX == 0) || (oY == 0)) throw strEr_BadArg_CalcIntFromElecField; //char *arInt = (char*)GetPyArrayBuf(oInt, vBuf, PyBUF_WRITABLE, 0); @@ -4691,7 +4717,13 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) //ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y)); //ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth)); //OC13122019 + +#ifdef _OFFLOAD_GPU //HG30112023 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth, pFldTrj, (void*)&gpu)); +#else ProcRes(srwlCalcIntFromElecField(arInt, &wfr, pol, intType, depType, e, x, y, pMeth, pFldTrj)); //OC23022020 +#endif } catch(const char* erText) { @@ -4700,6 +4732,9 @@ static PyObject* srwlpy_CalcIntFromElecField(PyObject *self, PyObject *args) oInt = 0; } +#ifdef _OFFLOAD_GPU //HG30112023 + 
srwlUtiGPUProc(0); //to free GPU +#endif if(pMagCnt != 0) DeallocMagCntArrays(pMagCnt); ReleasePyBuffers(vBuf); EraseElementFromMap(&wfr, gmWfrPyPtr); @@ -4932,7 +4967,8 @@ static PyObject* srwlpy_SetRepresElecField(PyObject *self, PyObject *args) static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) { //PyObject *oWfr=0, *oOptCnt=0; - PyObject *oWfr=0, *oOptCnt=0, *oInt=0; //OC14082018 + //PyObject *oWfr=0, *oOptCnt=0, *oInt=0; //OC14082018 + PyObject *oWfr=0, *oOptCnt=0, *oInt=0, *oDev=0; //Hg03012024 vector vBuf; SRWLWfr wfr; @@ -4945,10 +4981,15 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) //float **arInts=0; char **arInts=0; +#ifdef _OFFLOAD_GPU //HG03012024 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { //if(!PyArg_ParseTuple(args, "OO:PropagElecField", &oWfr, &oOptCnt)) throw strEr_BadArg_PropagElecField; - if(!PyArg_ParseTuple(args, "OO|O:PropagElecField", &oWfr, &oOptCnt, &oInt)) throw strEr_BadArg_PropagElecField; //OC14082018 + //if(!PyArg_ParseTuple(args, "OO|O:PropagElecField", &oWfr, &oOptCnt, &oInt)) throw strEr_BadArg_PropagElecField; //OC14082018 + if(!PyArg_ParseTuple(args, "OO|OO:PropagElecField", &oWfr, &oOptCnt, &oInt, &oDev)) throw strEr_BadArg_PropagElecField; //HG03012024 if((oWfr == 0) || (oOptCnt == 0)) throw strEr_BadArg_PropagElecField; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -4981,7 +5022,12 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) } //ProcRes(srwlPropagElecField(&wfr, &optCnt)); +#ifdef _OFFLOAD_GPU //HG03012024 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlPropagElecField(&wfr, &optCnt, nInt, arIntDescr, arIntMesh, arInts, (void*)&gpu)); +#else ProcRes(srwlPropagElecField(&wfr, &optCnt, nInt, arIntDescr, arIntMesh, arInts)); //OC15082018 +#endif //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":srwlpy_PropagElecField :srwlPropagElecField", &start); @@ -5002,6 +5048,9 @@ static PyObject* srwlpy_PropagElecField(PyObject *self, PyObject *args) //PyErr_PrintEx(1); oWfr = 0; } +#ifdef _OFFLOAD_GPU //HG03012024 + srwlUtiGPUProc(0); //to free GPU +#endif DeallocOptCntArrays(&optCnt); ReleasePyBuffers(vBuf); @@ -5102,12 +5151,18 @@ static PyObject* srwlpy_CalcTransm(PyObject* self, PyObject* args) //HG27012021 ***************************************************************************/ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) { - PyObject *oData=0, *oMesh=0, *oDir=0; + //PyObject *oData=0, *oMesh=0, *oDir=0; + PyObject *oData=0, *oMesh=0, *oDir=0, *oDev=0; //HG03012024 vector vBuf; +#ifdef _OFFLOAD_GPU //HG03012024 + TGPUUsageArg gpu; + srwlUtiGPUProc(1); //to prepare GPU for calculations +#endif try { - if(!PyArg_ParseTuple(args, "OOO:UtiFFT", &oData, &oMesh, &oDir)) throw strEr_BadArg_UtiFFT; + //if(!PyArg_ParseTuple(args, "OOO:UtiFFT", &oData, &oMesh, &oDir)) throw strEr_BadArg_UtiFFT; + if(!PyArg_ParseTuple(args, "OOO|O:UtiFFT", &oData, &oMesh, &oDir, &oDev)) throw strEr_BadArg_UtiFFT; //HG03012024 if((oData == 0) || (oMesh == 0) || (oDir == 0)) throw strEr_BadArg_UtiFFT; //int sizeVectBuf = (int)vBuf.size(); @@ -5143,7 +5198,12 @@ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) if(!PyNumber_Check(oDir)) throw strEr_BadArg_UtiFFT; int dir = (int)PyLong_AsLong(oDir); +#ifdef _OFFLOAD_GPU //HG03012024 + ParseDeviceParam(oDev, &gpu); + ProcRes(srwlUtiFFT(pcData, typeData, arMesh, nMesh, dir, (void*)&gpu)); +#else ProcRes(srwlUtiFFT(pcData, typeData, arMesh, nMesh, dir)); +#endif if(meshArType == 'l') UpdatePyListNum(oMesh, arMesh, nMesh); //04092016 } @@ -5153,6 +5213,9 @@ static PyObject* srwlpy_UtiFFT(PyObject *self, PyObject *args) //if(vBuf.size() > 0) ReleasePyBuffers(vBuf); oData = 0; oMesh = 0; oDir = 0; } +#ifdef _OFFLOAD_GPU //HG03012024 + srwlUtiGPUProc(0); //to 
free GPU +#endif ReleasePyBuffers(vBuf); diff --git a/cpp/src/core/sroptang.h b/cpp/src/core/sroptang.h index c9f85485..2294731c 100644 --- a/cpp/src/core/sroptang.h +++ b/cpp/src/core/sroptang.h @@ -30,7 +30,8 @@ class srTOptAngle : public srTGenOptElem { AngY = InAngY; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual //HG30112023 { //return PropagateRadiationMeth_0(pRadAccessData); int res = 0; @@ -43,11 +44,14 @@ class srTOptAngle : public srTGenOptElem { //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG30112023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, 
pvGPU)) return result; //HG30112023 //consider programming Angle on angular side by simple change of limits //however note potential problems for many photon energies! @@ -131,7 +135,8 @@ class srTOptShift : public srTGenOptElem { ShiftY = InShiftY; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual //HG04122023 { //return PropagateRadiationMeth_0(pRadAccessData); int res = 0; diff --git a/cpp/src/core/sroptapt.h b/cpp/src/core/sroptapt.h index 7d1032dc..98f6598c 100644 --- a/cpp/src/core/sroptapt.h +++ b/cpp/src/core/sroptapt.h @@ -33,11 +33,13 @@ class srTAperture : public srTShapedOptElem { srTAperture () {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG30112023 { char &MethNo = ParPrecWfrPropag.MethNo; - if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) return PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG30112023 else if(MethNo == 1) return PropagateRadiationMeth_1(pRadAccessData); //else 
if(MethNo == 2) return PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); else if(MethNo == 2) return PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); @@ -47,11 +49,14 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf = 0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG30112023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 if(result = PropagateRadMoments(pRadAccessData, 0)) return result; SetNewNonZeroWfrLimits(pRadAccessData); @@ -76,11 +81,14 @@ class srTAperture : public srTShapedOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* 
pvGPU=0) //HG30112023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG30112023 SetNewNonZeroWfrLimits(pRadAccessData); return 0; diff --git a/cpp/src/core/sroptcnt.cpp b/cpp/src/core/sroptcnt.cpp index a8bb4278..9b6072b2 100644 --- a/cpp/src/core/sroptcnt.cpp +++ b/cpp/src/core/sroptcnt.cpp @@ -251,7 +251,8 @@ int srTCompositeOptElem::PropagateRadiationTest(srTSRWRadStructAccessData* pInRa //************************************************************************* -int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +//int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pvGPU) //HG30112023 //int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr) { //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: @@ -265,6 +266,9 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr int res = 0, elemCount = 0; bool propIntIsNeeded = (nInt != 0) && (arID != 0) && (arI != 0); //OC27082018 +#ifdef _OFFLOAD_GPU //HG30112023 + bool dataOnDevice = false; +#endif for(srTGenOptElemHndlList::iterator it = GenOptElemList.begin(); it != GenOptElemList.end(); ++it) { @@ -308,7 +312,16 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr if((::fabs(curPropResizeInst.pxd - 1.) > tolRes) || (::fabs(curPropResizeInst.pxm - 1.) > tolRes) || //(::fabs(curPropResizeInst.pzd - 1.) > tolRes) || (::fabs(curPropResizeInst.pzm - 1.) > tolRes)) (::fabs(curPropResizeInst.pzd - 1.) > tolRes) || (::fabs(curPropResizeInst.pzm - 1.) > tolRes) || (curPropResizeInst.ShiftTypeBeforeRes > 0)) //OC11072019 - if(res = RadResizeGen(wfr, curPropResizeInst)) return res; + { + //if(res = RadResizeGen(wfr, curPropResizeInst)) return res; + if(res = RadResizeGen(wfr, curPropResizeInst, pvGPU)) return res; //HG30112023 + +#ifdef _OFFLOAD_GPU //HG30112023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) { + dataOnDevice = true; + } +#endif + } //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: RadResizeGen",&start); @@ -325,14 +338,55 @@ int srTCompositeOptElem::PropagateRadiationGuided(srTSRWRadStructAccessData& wfr //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: precParWfrPropag",&start); +#ifdef _OFFLOAD_GPU //HG30112023 + TGPUUsageArg* pGPU = (TGPUUsageArg*)pvGPU; + if (CAuxGPU::GPUEnabled(pGPU)) { + if (dataOnDevice && (((srTGenOptElem*)it->rep)->SupportedFeatures() & 1) == 0) + { +//#if DEBUG +// printf("Element does not support GPU, transferring to CPU.\r\n"); +//#endif + if (wfr.pBaseRadX != NULL) + wfr.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadX, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + if (wfr.pBaseRadZ != NULL) + wfr.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadZ, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + dataOnDevice = false; + } + else if (!dataOnDevice && (((srTGenOptElem*)it->rep)->SupportedFeatures() & 1) == 1) + { + dataOnDevice = true; +//#if DEBUG +// printf("Element supports GPU, transferring...\r\n"); +//#endif + } + } +#endif + srTRadResizeVect auxResizeVect; - if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect)) return res; + //if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect)) return res; + if(res = ((srTGenOptElem*)(it->rep))->PropagateRadiation(&wfr, precParWfrPropag, auxResizeVect, pvGPU)) return res; //HG30112023 //maybe to use "PropagateRadiationGuided" for srTCompositeOptElem? //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("Iteration: PropagateRadiation",&start); - if(propIntIsNeeded) ExtractPropagatedIntensity(wfr, nInt, arID, arIM, arI, elemCount); + if(propIntIsNeeded) + { +#ifdef _OFFLOAD_GPU //HG09112022 If the data is on the GPU, transfer it to CPU and synchronize before extracting the intensity + TGPUUsageArg* pGPU = (TGPUUsageArg*)pvGPU; + if (CAuxGPU::GPUEnabled(pGPU)) { + if (dataOnDevice) + { + if (wfr.pBaseRadX != NULL) + wfr.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadX, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + if (wfr.pBaseRadZ != NULL) + wfr.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, wfr.pBaseRadZ, 2 * wfr.ne * wfr.nx * wfr.nz * sizeof(float)); + dataOnDevice = false; + } + } +#endif + ExtractPropagatedIntensity(wfr, nInt, arID, arIM, arI, elemCount); + } elemCount++; diff --git a/cpp/src/core/sroptcnt.h b/cpp/src/core/sroptcnt.h index 59095ccc..84f7e7d5 100644 --- a/cpp/src/core/sroptcnt.h +++ b/cpp/src/core/sroptcnt.h @@ -34,7 +34,8 @@ class srTCompositeOptElem : public srTGenOptElem { srTCompositeOptElem() {} int PropagateRadiationTest(srTSRWRadStructAccessData*, srTSRWRadStructAccessData*); - int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 + int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pvGPU=0); //HG01122023 + //int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //int PropagateRadiationGuided(srTSRWRadStructAccessData& wfr); int ExtractPropagatedIntensity(srTSRWRadStructAccessData& wfr, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, int elCnt, int indIntSartSearch=0); //27082018 @@ -47,7 +48,8 @@ class srTCompositeOptElem : public srTGenOptElem { GenOptElemList.push_back(OptElemHndl); } - int 
PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG01122023 { int AmOfElem = (int)GenOptElemList.size(); //OC110104 int ElemCount = 0; //OC110104 @@ -65,7 +67,8 @@ class srTCompositeOptElem : public srTGenOptElem { } //if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, MethNo, ResizeBeforeAndAfterVect)) return result; - if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect)) return result; + //if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect)) return result; + if(result = ((srTGenOptElem*)((*iter).rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect, pvGPU)) return result; //HG01122023 } ParPrecWfrPropag.UseResAfter = GenUseResAfter; //OC110104 return 0; diff --git a/cpp/src/core/sroptcryst.h b/cpp/src/core/sroptcryst.h index fd25308e..ee0ef80b 100644 --- a/cpp/src/core/sroptcryst.h +++ b/cpp/src/core/sroptcryst.h @@ -943,7 +943,8 @@ class srTOptCryst : public srTGenOptElem { return 0; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& 
ResBeforeAndAfterVect, void* pvGPU=0) //virtual in srTGenOptElem //HG01122023 { m_eStartAux = pRadAccessData->eStart; m_eStepAux = pRadAccessData->eStep; m_ne = pRadAccessData->ne; //required for RadPointModifier @@ -967,7 +968,8 @@ class srTOptCryst : public srTGenOptElem { } //return PropagateRadiationMeth_0(pRadAccessData); - return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + //return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pvGPU); //HG01122023 } //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem @@ -977,7 +979,8 @@ class srTOptCryst : public srTGenOptElem { //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU) //HG01122023 {//It works for many photon energies too (as in the case of Drift) //The "in-place" processing involving FFT for many photon energies greatly improves efficiency of the code for Time-/Frequency-Dependent simulations for FEL and pulsed lasers. 
int result; diff --git a/cpp/src/core/sroptdrf.cpp b/cpp/src/core/sroptdrf.cpp index e4018c6f..27246d8b 100644 --- a/cpp/src/core/sroptdrf.cpp +++ b/cpp/src/core/sroptdrf.cpp @@ -352,7 +352,8 @@ int srTDriftSpace::PropagateRadiationMeth_1(srTSRWRadStructAccessData* pRadAcces //int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {// e in eV; Length in m !!! int result; @@ -365,7 +366,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat SetupPropBufVars_PropToWaist(pRadAccessData, &BufVars); //SetupPropBufVars_PropToWaist(pRadAccessData); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //pBufVars->PassNo = 1; //OC06092019 //OC01102019 (restored) @@ -373,7 +375,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //PropBufVars.PassNo = 1; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC29082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC29082019 //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC240114 (commented-out) @@ 
-402,7 +405,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //To remove this? srTDataPtrsForWfrEdgeCorr DataPtrsForWfrEdgeCorr; - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG30112023 + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; #if !defined(_FFTW3) && defined(_WITH_OMP) //OC29082019 //OC04062020 @@ -423,9 +427,11 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat #else //OCTEST01102019: commented-out the above (to see if this will fix problem of TD calcs) FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //FFT2DInfo.pData = pRadAccessData->pBaseRadX; @@ -436,7 +442,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //To remove this? 
if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU); //HG30112023 DataPtrsForWfrEdgeCorr.DisposeData(); } @@ -455,7 +462,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat //PropBufVars.PassNo = 2; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC19032022 @@ -479,7 +487,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessDat } //************************************************************************* -int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData) //OC10112019 +//int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData) //OC10112019 +int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData, void *pvGPU) //HG30112023 {// e in eV; Length in m !!! 
int result = 0; @@ -488,7 +497,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru srTDriftPropBufVars BufVars; SetupPropBufVars_PropToWaistBeyondParax(pRadAccessData, &BufVars); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 pRadAccessData->TreatQuadPhaseTerm('r'); //OC17122019 //pRadAccessData->TreatQuadPhaseTermTerm('r'); @@ -509,7 +519,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru //pRadAccessData->xStart = (pRadAccessData->xStart)*InvLambdaM_d_Rx; //pRadAccessData->zStart = (pRadAccessData->zStart)*InvLambdaM_d_Rz; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 CGenMathFFT2DInfo FFT2DInfo; FFT2DInfo.xStep = pRadAccessData->xStep; @@ -547,9 +558,11 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru #else //OCTEST01102019: commented-out the above (to see if this will fix problem of TD calcs) FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //To remove this? 
@@ -597,7 +610,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStru //int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {//Should be very similar to PropagateRadiationSimple_PropToWaist, consider merging int result = 0; @@ -607,7 +621,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //OC01102019 (restored) SetupPropBufVars_PropFromWaist(pRadAccessData, &BufVars); //SetupPropBufVars_PropFromWaist(pRadAccessData); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //OC30082019: commented-out: not needed here, since it is set in ChooseLocalPropMode(...); is it thread-safe? //LocalPropMode = 2; // prop. 
from waist @@ -616,7 +631,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC01102019 (restored) BufVars.PassNo = 1; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //OC06092019 //pBufVars->PassNo = 1; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; @@ -638,7 +654,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //OCTEST (commented-out "edge correction") //OC01102019 (uncommented) srTDataPtrsForWfrEdgeCorr DataPtrsForWfrEdgeCorr; - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG30112023 CGenMathFFT2D FFT2D; @@ -666,16 +683,19 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //} #else FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG30112023 #endif //OCTEST (commented-out "edge correction") //OC01102019 (uncommented) 
if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU); //HG30112023 DataPtrsForWfrEdgeCorr.DisposeData(); } @@ -689,7 +709,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //if(result = TraverseRadZXE(pRadAccessData)) return result; //OC01102019 (restored) BufVars.PassNo = 2; - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //OC06092019 //pBufVars->PassNo = 2; //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; @@ -701,7 +722,8 @@ int srTDriftSpace::PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessD //int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars) //OC06092019 //OC01102019 (restored) -int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData) +//int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData) +int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG30112023 {// e in eV; Length in m !!! int result = 0; @@ -720,7 +742,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetupPropBufVars_AnalytTreatQuadPhaseTerm",&start); - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG30112023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 1",&start); @@ -731,7 +754,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 1; //Remove quadratic term from the Phase in coord. repres. //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -750,7 +774,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt pRadAccessData->WfrEdgeCorrShouldBeDone = 0; - if(result = SetRadRepres(pRadAccessData, 1)) return result; //To angular repres. + //if(result = SetRadRepres(pRadAccessData, 1)) return result; //To angular repres. + if(result = SetRadRepres(pRadAccessData, 1, 0, 0, pvGPU)) return result; //To angular repres. //HG30112023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 2",&start); @@ -761,7 +786,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 2; //Loop in angular repres. //if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: @@ -773,7 +799,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt pRadAccessData->zStartTr += zShift; } - if(result = SetRadRepres(pRadAccessData, 0)) return result; //Back to coord. repres. + //if(result = SetRadRepres(pRadAccessData, 0)) return result; //Back to coord. repres. + if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //Back to coord. repres. //HG30112023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AnalytTreatQuadPhaseTerm:SetRadRepres 3",&start); @@ -816,7 +843,8 @@ int srTDriftSpace::PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadSt //PropBufVars.PassNo = 3; //Add new quadratic term to the Phase in coord. repres. 
//if(result = TraverseRadZXE(pRadAccessData, pBufVars)) return result; //OC06092019 //OC01102019 (restored) - if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + //if(result = TraverseRadZXE(pRadAccessData, &BufVars)) return result; //OC30082019 + if(result = TraverseRadZXE(pRadAccessData, &BufVars, sizeof(srTDriftPropBufVars), pvGPU)) return result; //HG30112023 //if(result = TraverseRadZXE(pRadAccessData)) return result; //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: diff --git a/cpp/src/core/sroptdrf.h b/cpp/src/core/sroptdrf.h index 01b03722..c3a7509a 100644 --- a/cpp/src/core/sroptdrf.h +++ b/cpp/src/core/sroptdrf.h @@ -90,6 +90,7 @@ class srTDriftSpace : public srTGenOptElem { double Length; //OC06092019 (commented-out) //srTDriftPropBufVars PropBufVars; + int SupportedFeatures() override { return 1; } //HG01122023 Returns 1 if the element supports GPU propagation srTDriftSpace(double InLength =0., char InTreatPath =0) { @@ -109,7 +110,8 @@ class srTDriftSpace : public srTGenOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResizeBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG01122023 { //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //double start; @@ -159,7 +161,8 @@ class srTDriftSpace : public srTGenOptElem { //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, &BufVars); //OC06092019 //OC01102019 (restored) - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG01122023 else if(MethNo == 1) result = PropagateRadiationMeth_1(pRadAccessData); else if(MethNo == 2) result = PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResizeBeforeAndAfterVect); @@ -175,12 +178,14 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadAccessData, void* pvGPU=0) //HG01122023 {//it works for many photon energies too! 
int result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 //OC01102019 (restored) - if(result = PropagateRadiationSimple(pRadAccessData)) return result; + //if(result = PropagateRadiationSimple(pRadAccessData)) return result; + if(result = PropagateRadiationSimple(pRadAccessData, pvGPU)) return result; //HG01122023 if(result = PropagateRadMoments(pRadAccessData, 0)) return result; if(result = PropagateWaveFrontRadius(pRadAccessData)) return result; if(result = Propagate4x4PropMatr(pRadAccessData)) return result; @@ -189,7 +194,8 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf) //OC06092019 //OC01102019 (restored) - int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem + //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //virtual in srTGenOptElem + int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //virtual in srTGenOptElem //HG01122023 {//because for the Drift, the following works for many photon energies too! 
//return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); //OC251214 @@ -198,7 +204,8 @@ class srTDriftSpace : public srTGenOptElem { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //if((pBufVars->LocalPropMode == 0) || (pBufVars->LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pBuf); //OC06092019 //OC01102019 (restored) - if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + //if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0); + if((LocalPropMode == 0) || (LocalPropMode == 3) || (pRadAccessData->ne == 1)) return PropagateRadiationSingleE_Meth_0(pRadAccessData, 0, pvGPU); //HG01122023 else { pRadAccessData->SetNonZeroWavefrontLimitsToFullRange(); @@ -304,20 +311,26 @@ class srTDriftSpace : public srTGenOptElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG01122023 { //srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //OC06092019 //char LocalPropMode = pBufVars->LocalPropMode; //OC06092019 //OC01102019 (commented-out / restored) - if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData); + //if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData); + if(LocalPropMode == 0) return PropagateRadiationSimple_AngRepres(pRadAccessData, pvGPU); //HG01122023 //OC01102019 (restored) - else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData); + //else if(LocalPropMode == 1) return 
PropagateRadiationSimple_PropToWaist(pRadAccessData); + else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData, pvGPU); //HG01122023 - else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData); //OC10112019 + //else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData); //OC10112019 + else if(LocalPropMode == 11) return PropagateRadiationSimple_PropToWaistBeyondParax(pRadAccessData, pvGPU); //OC10112019 //HG01122023 - else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData); //OC240114 (added) - else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData); + //else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData); //OC240114 (added) + else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData, pvGPU); //OC240114 (added) //HG01122023 + //else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData); + else if(LocalPropMode == 3) return PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(pRadAccessData, pvGPU); //HG01122023 //OC06092019 //else if(LocalPropMode == 1) return PropagateRadiationSimple_PropToWaist(pRadAccessData, pBufVars); //else if(LocalPropMode == 2) return PropagateRadiationSimple_PropFromWaist(pRadAccessData, pBufVars); //OC240114 (added) @@ -329,7 +342,8 @@ class srTDriftSpace : public srTGenOptElem { else return 0; } - int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple_AngRepres(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU) //HG01122023 { //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //double start; @@ -351,13 +365,15 @@ class srTDriftSpace : public srTGenOptElem { if(pRadAccessData->Pres != 1) { - if(result = SetRadRepres(pRadAccessData, 1)) return result; + //if(result = SetRadRepres(pRadAccessData, 1)) return result; + if(result = SetRadRepres(pRadAccessData, 1, 0, 0, pvGPU)) return result; //HG01122023 } //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:SetRadRepres 1",&start); - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG01122023 //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:TraverseRadZXE",&start); @@ -368,7 +384,8 @@ class srTDriftSpace : public srTGenOptElem { pRadAccessData->zStartTr += zShift; } - if(result = SetRadRepres(pRadAccessData, 0)) return result; + //if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG01122023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":PropagateRadiationSimple_AngRepres:SetRadRepres 2",&start); @@ -390,11 +407,15 @@ class srTDriftSpace : public srTGenOptElem { } //OC01102019 (restored) - int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData); - int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData); //OC10112019 - - int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData); - int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData); + //int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 + //int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData); //OC10112019 + int PropagateRadiationSimple_PropToWaistBeyondParax(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //OC10112019 //HG01122023 + + //int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 + //int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData); + int PropagateRadiationSimple_AnalytTreatQuadPhaseTerm(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //HG01122023 //OC06092019 //int PropagateRadiationSimple_PropToWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars=0); //int PropagateRadiationSimple_PropFromWaist(srTSRWRadStructAccessData* pRadAccessData, srTDriftPropBufVars* pBufVars=0); @@ -553,6 +574,16 @@ class srTDriftSpace : public srTGenOptElem { void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //OC29082019 //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) + { + RadPointModifierPortable(EXZ, EPtrs, 
pBuf); //HG01122023 + } + +#ifdef _OFFLOAD_GPU //HG01122023 + int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pGPU=0) override; + + GPU_PORTABLE +#endif + void RadPointModifierPortable(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //HG01122023 { srTDriftPropBufVars* pBufVars = (srTDriftPropBufVars*)pBuf; //char LocalPropMode = pBufVars->LocalPropMode; @@ -568,6 +599,9 @@ class srTDriftSpace : public srTGenOptElem { //else if(LocalPropMode == 3) { RadPointModifier_AnalytTreatQuadPhaseTerm(EXZ, EPtrs); return;} } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_AngRepres(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {// e in eV; Length in m !!! // Operates on Angles side !!! @@ -599,6 +633,9 @@ class srTDriftSpace : public srTGenOptElem { *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropToWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC29082019 //void RadPointModifier_PropToWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) { @@ -661,6 +698,9 @@ class srTDriftSpace : public srTGenOptElem { } } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropToWaistBeyondParax(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC10112019 { double rx = EXZ.x, rz = EXZ.z; @@ -695,6 +735,9 @@ class srTDriftSpace : public srTGenOptElem { *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_PropFromWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, srTDriftPropBufVars* pBufVars) //OC30082019 //void RadPointModifier_PropFromWaist(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) { @@ -742,6 +785,9 @@ class srTDriftSpace : public srTGenOptElem { } } +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif void RadPointModifier_AnalytTreatQuadPhaseTerm(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, 
srTDriftPropBufVars* pBufVars) //OC30082019 //void RadPointModifier_AnalytTreatQuadPhaseTerm(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {//don't use RobsX, RobsZ directly here! diff --git a/cpp/src/core/sroptdrf_gpu.cu b/cpp/src/core/sroptdrf_gpu.cu new file mode 100644 index 00000000..7d98fa8a --- /dev/null +++ b/cpp/src/core/sroptdrf_gpu.cu @@ -0,0 +1,29 @@ +/************************************************************************//** + * File: sroptdrf_gpu.cu + * Description: Optical element: Drift space (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" + +#include +#include +#include +#include "sroptdrf.h" + +//Implementation of the RadPointModifier's GPU function for the srTDriftSpace class +int srTDriftSpace::RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, TGPUUsageArg *pGpu) +{ + return RadPointModifierParallelImpl(pRadAccessData, pBufVars, pBufVarsSz, this, pGpu); +} //HG03092022 +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptel2.cpp b/cpp/src/core/sroptel2.cpp index 0c426d5d..73d11027 100644 --- a/cpp/src/core/sroptel2.cpp +++ b/cpp/src/core/sroptel2.cpp @@ -37,7 +37,8 @@ double srTGenOptElem::CheckMemoryAvailable() //int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf) //OC06092019 //OC01102019 (restored) -int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) +//int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) +int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void *pvGPU) //HG30112023 
{//Moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 //This propagation method doesn't allow for true wavefront "resizing/resampling" //(which results in changing numbers of points) in "slices" vs photon energy. diff --git a/cpp/src/core/sroptelm.cpp b/cpp/src/core/sroptelm.cpp index 341d0bae..bde700d1 100644 --- a/cpp/src/core/sroptelm.cpp +++ b/cpp/src/core/sroptelm.cpp @@ -30,6 +30,10 @@ #include "sropthck.h" #include "sroptgrat.h" +#ifdef _OFFLOAD_GPU //HG01122023 +#include "auxgpu.h" +#endif + #ifdef _WITH_OMP //Pre-processor definition for compiling with OpenMP library #include "omp.h" #endif @@ -146,7 +150,8 @@ int srTGenOptElem::ExtraDataExpected(const char* sElemID) //OC01062020 //************************************************************************* -int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars) //OC29082019 +//int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars) //OC29082019 +int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, void* pvGPU) //OC29082019 //HG01122023 //int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData) { float *pEx0 = pRadAccessData->pBaseRadX; @@ -156,6 +161,15 @@ int srTGenOptElem::TraverseRadZXE(srTSRWRadStructAccessData* pRadAccessData, voi long long PerX = pRadAccessData->ne << 1; long long PerZ = PerX*pRadAccessData->nx; +#ifdef _OFFLOAD_GPU //HG01122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + if (RadPointModifierParallel(pRadAccessData, pBufVars, pBufVarsSz, (TGPUUsageArg*)pvGPU) == -1) //Try to call the GPU version, if it fails, call the CPU version + return TraverseRadZXE(pRadAccessData, pBufVars, pBufVarsSz, NULL); + return 0; + } +#endif + #ifndef _WITH_OMP //OC28102018 srTEFieldPtrs EFieldPtrs; @@ -731,7 +745,8 @@ int srTGenOptElem::RemoveSliceConstE_FromGenRadStruct(srTSRWRadStructAccessData* 
//************************************************************************* -int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr) +//int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr) +int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr, void* pvGPU) //HG01122023 { int result; @@ -849,7 +864,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->zStart; FFT1DInfo.Nx = pRadAccessData->nz; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dxFi != 0.) { @@ -889,7 +905,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->zStart; FFT1DInfo.Nx = pRadAccessData->nz; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dzSt != 0.) { @@ -913,7 +930,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->xStart; FFT1DInfo.Nx = pRadAccessData->nx; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } if(dzFi != 0.) 
{ @@ -936,7 +954,8 @@ int srTGenOptElem::SetupWfrEdgeCorrData(srTSRWRadStructAccessData* pRadAccessDat FFT1DInfo.xStart = pRadAccessData->xStart; FFT1DInfo.Nx = pRadAccessData->nx; CGenMathFFT1D FFT1D; - if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + //if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo)) return result; + if(result = FFT1D.Make1DFFT_InPlace(FFT1DInfo, pvGPU)) return result; //HG01122023 } DataPtrsForWfrEdgeCorr.WasSetup = 1; } @@ -1015,8 +1034,18 @@ int srTGenOptElem::SetupWfrEdgeCorrData1D(srTRadSect1D* pRadSect1D, float* pData //************************************************************************* -void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs) +//void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs) +void srTGenOptElem::MakeWfrEdgeCorrection(srTSRWRadStructAccessData* pRadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, void* pvGPU) //HG01122023 { + //HG23082022 Use GPU if requested +#ifdef _OFFLOAD_GPU + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + MakeWfrEdgeCorrection_GPU(pRadAccessData, pDataEx, pDataEz, DataPtrs, (TGPUUsageArg*)pvGPU); + return; + } +#endif + float *tEx = pDataEx, *tEz = pDataEz; double dxSt_dzSt = DataPtrs.dxSt*DataPtrs.dzSt; @@ -1204,7 +1233,8 @@ void srTGenOptElem::MakeWfrEdgeCorrection1D(srTRadSect1D* pRadSect1D, float* pDa //************************************************************************* //int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng) -int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, double* ar_zStartInSlicesE) +//int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, 
double* ar_zStartInSlicesE) +int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char CoordOrAng, double* ar_xStartInSlicesE, double* ar_zStartInSlicesE, void* pvGPU) //HG01122023 {// 0- to coord.; 1- to ang. int result; @@ -1247,7 +1277,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(CoordOrAng == 1) { - if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG01122023 } } @@ -1255,9 +1286,11 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char if(ar_zStartInSlicesE != 0) FFT2DInfo.yStart = *ar_zStartInSlicesE; FFT2DInfo.pData = pRadAccessData->pBaseRadX; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 FFT2DInfo.pData = pRadAccessData->pBaseRadZ; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 if(WfrEdgeCorrShouldBeTreated) { @@ -1265,7 +1298,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, pRadAccessData->pBaseRadX, pRadAccessData->pBaseRadZ, 
DataPtrsForWfrEdgeCorr, pvGPU); //HG01122023 DataPtrsForWfrEdgeCorr.DisposeData(); } } @@ -1309,7 +1343,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(CoordOrAng == 1) { - if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr)) return result; + //if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr)) return result; + if(result = SetupWfrEdgeCorrData(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr, pvGPU)) return result; //HG01122023 } } @@ -1318,9 +1353,11 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char if(ar_zStartInSlicesE != 0) FFT2DInfo.yStart = ar_zStartInSlicesE[ie]; FFT2DInfo.pData = AuxEx; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 FFT2DInfo.pData = AuxEz; - if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + //if(result = FFT2D.Make2DFFT(FFT2DInfo)) return result; + if(result = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return result; //HG01122023 if(WfrEdgeCorrShouldBeTreated) { @@ -1328,7 +1365,8 @@ int srTGenOptElem::SetRadRepres(srTSRWRadStructAccessData* pRadAccessData, char { if(DataPtrsForWfrEdgeCorr.WasSetup) { - MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr); + //MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr); + MakeWfrEdgeCorrection(pRadAccessData, AuxEx, AuxEz, DataPtrsForWfrEdgeCorr, pvGPU); //HG01122023 DataPtrsForWfrEdgeCorr.DisposeData(); } } @@ -2182,7 +2220,8 @@ void srTGenOptElem::FindMinMaxRatio(double* Arr1, double* Arr2, int n, double& M //************************************************************************* -int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct) +//int 
srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct) +int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessData, srTRadResize& RadResizeStruct, void* pvGPU) //HG01122023 { //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //double start; @@ -2257,7 +2296,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat SRWRadStructAccessData.zWfrMin += zShift; SRWRadStructAccessData.zWfrMax += zShift; } - if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres)) return result; + //if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres)) return result; + if(result = SetRadRepres(&SRWRadStructAccessData, ToRepres, 0, 0, pvGPU)) return result; //HG01122023 double pxmNew = RadResizeStruct.pxd, pxdNew = RadResizeStruct.pxm; double pzmNew = RadResizeStruct.pzd, pzdNew = RadResizeStruct.pzm; @@ -2537,7 +2577,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: copydata",&start); - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 0, pvGPU)) return result; //HG01122023 if(OldRadXCopy != 0) delete[] OldRadXCopy; if(OldRadZCopy != 0) delete[] OldRadZCopy; @@ -2602,7 +2643,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //Added by SY (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: TreatPolarizSepar-PrepareStructs",&start); - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct)) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 0, pvGPU)) return result; //HG01122023 //Added by SY (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime(":RadResizeGen: RadResizeCore 2",&start); @@ -2662,7 +2704,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat *(tBaseRadX++) = 0.; } SRWRadStructAccessData.pBaseRadX = OldRadXCopy; - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x')) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x')) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'x', pvGPU)) return result; //HG01122023 if(OldRadXCopy != 0) delete[] OldRadXCopy; } //Added by SY (for profiling?) at parallelizing SRW via OpenMP: @@ -2698,7 +2741,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat *(tBaseRadZ++) = 0.; } SRWRadStructAccessData.pBaseRadZ = OldRadZCopy; - if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z')) return result; + //if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z')) return result; + if(result = RadResizeCore(SRWRadStructAccessData, NewSRWRadStructAccessData, RadResizeStruct, 'z', pvGPU)) return result; //HG01122023 if(OldRadZCopy != 0) delete[] OldRadZCopy; } //Added by SY (for profiling?) 
at parallelizing SRW via OpenMP: @@ -2734,7 +2778,8 @@ int srTGenOptElem::RadResizeGen(srTSRWRadStructAccessData& SRWRadStructAccessDat //for(long j=0; j NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; - } + //SY: do we need this (always returns 0, updates some clock) + //if(result = srYield.Check()) return result; - int izcOld = int((zAbs - OldRadAccessData.zStart)*zStepInvOld + 1.E-06); + double zAbs = NewRadAccessData.zStart + iz*NewRadAccessData.zStep; - double zRel = zAbs - (OldRadAccessData.zStart + izcOld*OldRadAccessData.zStep); + char FieldShouldBeZeroedDueToZ = 0; + if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if((zAbs < NewRadAccessData.zWfrMin - DistAbsTol) || (zAbs > NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; + } - if(izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2.*OldRadAccessData.zStep;} - else if(izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep;} - else if(izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep;} - else izStOld = izcOld - 1; + int izcOld = int((zAbs - OldRadAccessData.zStart)*zStepInvOld + 1.E-06); - zRel *= zStepInvOld; + double zRel = zAbs - (OldRadAccessData.zStart + izcOld*OldRadAccessData.zStep); - int izcOld_mi_izStOld = izcOld - izStOld; - //long izPerZ_New = iz*PerZ_New; - long long izPerZ_New = iz*PerZ_New; + if(izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2.*OldRadAccessData.zStep;} + else if(izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep;} + else if(izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep;} + else izStOld = izcOld - 1; - float *pEX_StartForX_New = 0, *pEZ_StartForX_New = 0; - if(TreatPolCompX) pEX_StartForX_New = pEX0_New + izPerZ_New; - if(TreatPolCompZ) pEZ_StartForX_New = pEZ0_New + izPerZ_New; + zRel *= zStepInvOld; - for(int ix=ixStart; ix<=ixEnd; ix++) - { - //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; - long long ixPerX_New_p_Two_ie = ix*PerX_New 
+ Two_ie; - float *pEX_New = 0, *pEZ_New = 0; - if(TreatPolCompX) pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; - if(TreatPolCompZ) pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; + int izcOld_mi_izStOld = izcOld - izStOld; + //long izPerZ_New = iz*PerZ_New; + long long izPerZ_New = iz*PerZ_New; - double xAbs = NewRadAccessData.xStart + ix*NewRadAccessData.xStep; + float *pEX_StartForX_New = 0, *pEZ_StartForX_New = 0; + if(TreatPolCompX) pEX_StartForX_New = pEX0_New + izPerZ_New; + if(TreatPolCompZ) pEZ_StartForX_New = pEZ0_New + izPerZ_New; - char FieldShouldBeZeroedDueToX = 0; - if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + for(int ix=ixStart; ix<=ixEnd; ix++) { - if((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; - } - char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || FieldShouldBeZeroedDueToZ); + //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + long long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + float *pEX_New = 0, *pEZ_New = 0; + if(TreatPolCompX) pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; + if(TreatPolCompZ) pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; - int ixcOld = int((xAbs - OldRadAccessData.xStart)*xStepInvOld + 1.E-06); - double xRel = xAbs - (OldRadAccessData.xStart + ixcOld*OldRadAccessData.xStep); + double xAbs = NewRadAccessData.xStart + ix*NewRadAccessData.xStep; - if(ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2.*OldRadAccessData.xStep;} - else if(ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep;} - else if(ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep;} - else ixStOld = ixcOld - 1; + char FieldShouldBeZeroedDueToX = 0; + if(NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; + } + char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || 
FieldShouldBeZeroedDueToZ); - xRel *= xStepInvOld; + int ixcOld = int((xAbs - OldRadAccessData.xStart)*xStepInvOld + 1.E-06); + double xRel = xAbs - (OldRadAccessData.xStart + ixcOld*OldRadAccessData.xStep); - int ixcOld_mi_ixStOld = ixcOld - ixStOld; + if(ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2.*OldRadAccessData.xStep;} + else if(ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep;} + else if(ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep;} + else ixStOld = ixcOld - 1; - if((izStOld != izStOldPrev) || (ixStOld != ixStOldPrev)) - { - UseLowOrderInterp_PolCompX = 0; UseLowOrderInterp_PolCompZ = 0; + xRel *= xStepInvOld; - //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; - long long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + int ixcOld_mi_ixStOld = ixcOld - ixStOld; - if(TreatPolCompX) + if((izStOld != izStOldPrev) || (ixStOld != ixStOldPrev)) { - float* pExSt_Old = OldRadAccessData.pBaseRadX + TotOffsetOld; - GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + UseLowOrderInterp_PolCompX = 0; UseLowOrderInterp_PolCompZ = 0; - SetupCellDataI(AuxF, AuxFI); - UseLowOrderInterp_PolCompX = CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + long long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; - if(!UseLowOrderInterp_PolCompX) + if(TreatPolCompX) { - for(int i=0; i<2; i++) + float* pExSt_Old = OldRadAccessData.pBaseRadX + TotOffsetOld; + GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + + SetupCellDataI(AuxF, AuxFI); + UseLowOrderInterp_PolCompX = CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + + if(!UseLowOrderInterp_PolCompX) { - SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + for(int i=0; i<2; i++) + 
{ + SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + } + SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); } - SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); } - } - if(TreatPolCompZ) - { - float* pEzSt_Old = OldRadAccessData.pBaseRadZ + TotOffsetOld; - GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF+2); + if(TreatPolCompZ) + { + float* pEzSt_Old = OldRadAccessData.pBaseRadZ + TotOffsetOld; + GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF+2); - SetupCellDataI(AuxF+2, AuxFI+1); - UseLowOrderInterp_PolCompZ = CheckForLowOrderInterp(AuxF+2, AuxFI+1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02+2, InterpolAux02I+1); + SetupCellDataI(AuxF+2, AuxFI+1); + UseLowOrderInterp_PolCompZ = CheckForLowOrderInterp(AuxF+2, AuxFI+1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02+2, InterpolAux02I+1); - if(!UseLowOrderInterp_PolCompZ) - { - for(int i=0; i<2; i++) + if(!UseLowOrderInterp_PolCompZ) { - SetupInterpolAux02(AuxF+2+i, &InterpolAux01, InterpolAux02+2+i); + for(int i=0; i<2; i++) + { + SetupInterpolAux02(AuxF+2+i, &InterpolAux01, InterpolAux02+2+i); + } + SetupInterpolAux02(AuxFI+1, &InterpolAux01, InterpolAux02I+1); } - SetupInterpolAux02(AuxFI+1, &InterpolAux01, InterpolAux02I+1); } - } - ixStOldPrev = ixStOld; izStOldPrev = izStOld; - } - - if(TreatPolCompX) - { - if(UseLowOrderInterp_PolCompX) - { - InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); - InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + ixStOldPrev = ixStOld; izStOldPrev = izStOld; } - else + + if(TreatPolCompX) { - InterpolF(InterpolAux02, xRel, zRel, BufF, 0); - InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); - } + if(UseLowOrderInterp_PolCompX) + { + InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); + InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + } + else + { + InterpolF(InterpolAux02, xRel, zRel, BufF, 0); + InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); + 
} - (*BufFI) *= AuxFI->fNorm; - ImproveReAndIm(BufF, BufFI); + (*BufFI) *= AuxFI->fNorm; + ImproveReAndIm(BufF, BufFI); - if(FieldShouldBeZeroed) - { - *BufF = 0.; *(BufF+1) = 0.; - } + if(FieldShouldBeZeroed) + { + *BufF = 0.; *(BufF+1) = 0.; + } - *pEX_New = *BufF; - *(pEX_New+1) = *(BufF+1); - } - if(TreatPolCompZ) - { - if(UseLowOrderInterp_PolCompZ) - { - InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); - InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + *pEX_New = *BufF; + *(pEX_New+1) = *(BufF+1); } - else + if(TreatPolCompZ) { - InterpolF(InterpolAux02, xRel, zRel, BufF, 2); - InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); - } + if(UseLowOrderInterp_PolCompZ) + { + InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); + InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + } + else + { + InterpolF(InterpolAux02, xRel, zRel, BufF, 2); + InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); + } - (*(BufFI+1)) *= (AuxFI+1)->fNorm; - ImproveReAndIm(BufF+2, BufFI+1); + (*(BufFI+1)) *= (AuxFI+1)->fNorm; + ImproveReAndIm(BufF+2, BufFI+1); - if(FieldShouldBeZeroed) - { - *(BufF+2) = 0.; *(BufF+3) = 0.; - } + if(FieldShouldBeZeroed) + { + *(BufF+2) = 0.; *(BufF+3) = 0.; + } - *pEZ_New = *(BufF+2); - *(pEZ_New+1) = *(BufF+3); + *pEZ_New = *(BufF+2); + *(pEZ_New+1) = *(BufF+3); + } } } } @@ -3089,7 +3147,8 @@ int srTGenOptElem::RadResizeCore(srTSRWRadStructAccessData& OldRadAccessData, sr //sprintf(str,"%s %d",":RadResizeCore: cycles:",NewRadAccessData.ne); //srwlPrintTime(str,&start); - if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp); + //if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp); + if(WaveFrontTermWasTreated) TreatStronglyOscillatingTerm(NewRadAccessData, 'a', PolComp, -1, pvGPU); //HG01122023 //OC31102018: added by SY (for profiling?) 
at parallelizing SRW via OpenMP //srwlPrintTime(":RadResizeCore: TreatStronglyOscillatingTerm 2",&start); @@ -4503,7 +4562,8 @@ char srTGenOptElem::WaveFrontTermCanBeTreated(srTSRWRadStructAccessData& RadAcce //************************************************************************* -void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly) +//void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly) +void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, char PolComp, int ieOnly, void* pvGPU) //HG01122023 { //Later treat X and Z coordinates separately here!!! @@ -4634,6 +4694,14 @@ void srTGenOptElem::TreatStronglyOscillatingTerm(srTSRWRadStructAccessData& RadA ieStart = ieOnly; ieBefEnd = ieOnly + 1; } +#ifdef _OFFLOAD_GPU //HG01122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + TreatStronglyOscillatingTerm_GPU(RadAccessData, TreatPolCompX, TreatPolCompZ, ConstRx, ConstRz, ieStart, ieBefEnd, (TGPUUsageArg*)pvGPU); + return; + } +#endif + #ifdef _WITH_OMP //OC31102018: added by SY at parallelizing SRW via OpenMP #pragma omp parallel for #endif diff --git a/cpp/src/core/sroptelm.h b/cpp/src/core/sroptelm.h index 4e5c9445..a4ea6919 100644 --- a/cpp/src/core/sroptelm.h +++ b/cpp/src/core/sroptelm.h @@ -17,6 +17,7 @@ #include //required by some (buggy?) version of GCC #include //required? 
+ #include "gmtrans.h" #include "gmvect.h" @@ -43,6 +44,11 @@ #endif #endif +#ifdef _OFFLOAD_GPU +#include "auxgpu.h" +#include "sroptelm_gpu.h" +#endif + //************************************************************************* extern srTIntVect gVectWarnNos; @@ -119,7 +125,10 @@ class srTGenOptElem : public CGenObject { #endif } - virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&) { return 0;} + virtual int SupportedFeatures() { return 0; } //HG01122023 0=CPU only, 1=GPU supported + + //virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&) { return 0;} + virtual int PropagateRadiation(srTSRWRadStructAccessData*, srTParPrecWfrPropag&, srTRadResizeVect&, void* pvGPU=0) { return 0;} //HG01122023 virtual int PropagateRadMoments(srTSRWRadStructAccessData*, srTMomentsRatios*) { return 0;} virtual int PropagateWaveFrontRadius(srTSRWRadStructAccessData*) { return 0;} @@ -128,16 +137,21 @@ class srTGenOptElem : public CGenObject { //virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*, void* pBuf=0) { return 0;} //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*) { return 0;} + //virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*) { return 0;} + virtual int PropagateRadiationSimple(srTSRWRadStructAccessData*, void* pvGPU=0) { return 0;} //HG01122023 virtual int PropagateRadiationSimple1D(srTRadSect1D*) { return 0;} //virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData, void* pBuf=0) { return 0;} //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData) { return 0;} + //virtual int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData) { return 0;} + virtual 
int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadData, void* pvGPU=0) { return 0;} //HG01122023 virtual int RangeShouldBeAdjustedAtPropag() { return 1;} virtual int ResolutionShouldBeAdjustedAtPropag() { return 1;} +#ifdef _OFFLOAD_GPU //HG01122023 + virtual int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pvGPU=0) { return -1; } +#endif virtual void RadPointModifier(srTEXZ&, srTEFieldPtrs&, void* pBufVars=0) {} //OC29082019 //virtual void RadPointModifier(srTEXZ&, srTEFieldPtrs&) {} virtual void RadPointModifier1D(srTEXZ&, srTEFieldPtrs&, void* pBufVars=0) {}//OC06092019 @@ -182,7 +196,8 @@ class srTGenOptElem : public CGenObject { //virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0); //OC06092019 //OC01102019 (restored) - virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 + //virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 + virtual int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0); //moved from derived classes: loops over E, calls derived PropagateRadiationSingleE_Meth_0 //HG01122023 void FindWidestWfrMeshParam(vector& vRadSlices, srTSRWRadStructAccessData* pRad, bool keepConstNumPoints); int ReInterpolateWfrDataOnNewTransvMesh(vector& vRadSlices, srTSRWRadStructAccessData* pAuxRadSingleE, srTSRWRadStructAccessData* pRadRes); @@ -236,7 +251,8 @@ class srTGenOptElem : public CGenObject { int FillOutRadFromInRad(srTSRWRadStructAccessData*, srTSRWRadStructAccessData*); - int TraverseRadZXE(srTSRWRadStructAccessData*, void* pBufVars=0); //OC29082019 + int TraverseRadZXE(srTSRWRadStructAccessData*, void* 
pBufVars=0, long pBufVarsSz=0, void* pvGPU=0); //OC29082019 //HG01122023 + //int TraverseRadZXE(srTSRWRadStructAccessData*, void* pBufVars=0); //OC29082019 //int TraverseRadZXE(srTSRWRadStructAccessData*); int TraverseRad1D(srTRadSect1D*, void* pBufVars=0); //OC29082019 //int TraverseRad1D(srTRadSect1D*); @@ -258,41 +274,73 @@ class srTGenOptElem : public CGenObject { int RemoveSliceConstE_FromGenRadStruct(srTSRWRadStructAccessData*, long); //int SetRadRepres(srTSRWRadStructAccessData*, char); - int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0); + //int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0); + int SetRadRepres(srTSRWRadStructAccessData*, char, double* ar_xStartInSlicesE=0, double* ar_zStartInSlicesE=0, void* pvGPU=0); //HG01122023 int SetRadRepres1D(srTRadSect1D*, char); - int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); + int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023 + //int SetupWfrEdgeCorrData(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); //inline void SetupExpCorrArray(float*, long, double, double, double); inline void SetupExpCorrArray(float*, long long, double, double, double); - void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); + void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023 + //void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); +#ifdef _OFFLOAD_GPU //HG01122023 + void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU); +#endif int SetupWfrEdgeCorrData1D(srTRadSect1D*, float*, 
float*, srTDataPtrsForWfrEdgeCorr1D&); void MakeWfrEdgeCorrection1D(srTRadSect1D*, float*, float*, srTDataPtrsForWfrEdgeCorr1D&); int ComputeRadMoments(srTSRWRadStructAccessData*); - int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&); + int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&, void* pvGPU=0); //HG01122023 + //int RadResizeGen(srTSRWRadStructAccessData&, srTRadResize&); int RadResizeGenE(srTSRWRadStructAccessData&, srTRadResize&); - int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); + int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0, void* =0); //HG01122023 + //int RadResizeCore(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); +#ifdef _OFFLOAD_GPU //HG01122023 + int RadResizeCore_GPU(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, char =0, TGPUUsageArg* =0); +#endif int RadResizeCoreE(srTSRWRadStructAccessData&, srTSRWRadStructAccessData&, srTRadResize&, char =0); int RadResizeCore_OnlyLargerRange(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, srTRadResize& RadResizeStruct, char PolComp); int RadResizeCore_OnlyLargerRangeE(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, srTRadResize& RadResizeStruct, char PolComp); //inline void GetCellDataForInterpol(float*, long long , long long, srTInterpolAuxF*); +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif inline static void GetCellDataForInterpol(float*, long long, long long, srTInterpolAuxF*); //OC02022020 //inline void SetupCellDataI(srTInterpolAuxF*, srTInterpolAuxF*); +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE +#endif inline static void SetupCellDataI(srTInterpolAuxF*, srTInterpolAuxF*); //OC02022020 //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&); //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=true); //OC06012017 
(uncommented after some fixes in bool srTSRWRadStructAccessData::CheckIfQuadTermTreatIsBenefit(char, char)) //char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=false); //OC05012017 (changed to checkBenefit=false to resolve problem of resizing in near field at strong under-sampling) char WaveFrontTermCanBeTreated(srTSRWRadStructAccessData&, bool checkBenefit=false); //OC29032017 (changed again to checkBenefit=false to resolve problem of resizing of wiggler radiation at strong under-sampling, the ELETTRA SCW case) - void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1); + //void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1); + void TreatStronglyOscillatingTerm(srTSRWRadStructAccessData&, char, char =0, int ieOnly =-1, void* pvGPU=0); //HG01122023 +#ifdef _OFFLOAD_GPU //HG01122023 + void TreatStronglyOscillatingTerm_GPU(srTSRWRadStructAccessData& RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart, int ieBefEnd, TGPUUsageArg* pGPU); +#endif //void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, float*, float, float, float, float, char, char =0, int =-1); void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, double*, double, double, double, double, char, char =0, int =-1); //OC260114 //void TreatStronglyOscillatingTermIrregMesh(srTSRWRadStructAccessData&, double*, double, double, double, double, char, char =0, int =-1, double =1, double =1); //OC220214 void TreatStronglyOscillatingTermIrregMeshTrf(srTSRWRadStructAccessData& RadAccessData, char AddOrRem, double CrdTrf[2][3], char PolComp =0, int ieOnly =-1); //OC27122020 +#ifdef _OFFLOAD_GPU //HG01122023 + GPU_PORTABLE inline static void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); //OC02022020 + GPU_PORTABLE inline static void SetupInterpolAux02_LowOrder(srTInterpolAuxF*, srTInterpolAux01*, 
srTInterpolAux02*); //OC02022020 + GPU_PORTABLE inline static void InterpolF(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolFI(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolF_LowOrder(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline static void InterpolFI_LowOrder(srTInterpolAux02*, double, double, float*, int); //OC02022020 + GPU_PORTABLE inline double InterpLin(double r, double f1, double f2) { return f1 + r*(f2 - f1);} + GPU_PORTABLE inline static void ImproveReAndIm(float*, float*); //OC02022020 + GPU_PORTABLE inline static int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); //OC02022020 +#else //inline void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); inline static void SetupInterpolAux02(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); //OC02022020 //inline void SetupInterpolAux02_LowOrder(srTInterpolAuxF*, srTInterpolAux01*, srTInterpolAux02*); @@ -310,6 +358,7 @@ class srTGenOptElem : public CGenObject { inline static void ImproveReAndIm(float*, float*); //OC02022020 //inline int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); inline static int CheckForLowOrderInterp(srTInterpolAuxF*, srTInterpolAuxF*, int, int, srTInterpolAux01*, srTInterpolAux02*, srTInterpolAux02*); //OC02022020 +#endif int RadResizeGen1D(srTRadSect1D&, srTRadResize1D&); int RadResizeCore1D(srTRadSect1D&, srTRadSect1D&, srTRadResize1D&); @@ -346,6 +395,9 @@ class srTGenOptElem : public CGenObject { //inline void MultSquareMatrByVect(float**, float*, int, float*); inline void MultSquareMatrByVect(double**, double*, int, double*); //OC130311 +#ifdef _OFFLOAD_GPU //HG04122023 + GPU_PORTABLE +#endif inline void CosAndSin(double, float&, float&); 
inline void FindLowestAndUppestPoints(TVector3d&, TVector3d*, int, int&, int&); inline void ReflectVect(TVector3d& N, TVector3d& V); diff --git a/cpp/src/core/sroptelm_gpu.cu b/cpp/src/core/sroptelm_gpu.cu new file mode 100644 index 00000000..f9a65861 --- /dev/null +++ b/cpp/src/core/sroptelm_gpu.cu @@ -0,0 +1,587 @@ +/************************************************************************//** + * File: sroptelm_gpu.cu + * Description: Optical element (general CUDA functions) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" +#include +#include +#include +#include "sroptelm.h" +#include "sroptelm_gpu.h" + + +__global__ void TreatStronglyOscillatingTerm_Kernel(srTSRWRadStructAccessData RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + int ie = (blockIdx.z * blockDim.z + threadIdx.z) + ieStart; //ne range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz && ie < RadAccessData.ne + ieStart) + { + double ePh = RadAccessData.eStart + RadAccessData.eStep * (ie - ieStart); + if (RadAccessData.PresT == 1) + { + ePh = RadAccessData.avgPhotEn; //?? 
OC041108 + } + + double ConstRxE = ConstRx * ePh; + double ConstRzE = ConstRz * ePh; + if (RadAccessData.Pres == 1) + { + //double Lambda_m = 1.239854e-06/ePh; + double Lambda_m = 1.239842e-06 / ePh; + if (RadAccessData.PhotEnergyUnit == 1) Lambda_m *= 0.001; // if keV + + double Lambda_me2 = Lambda_m * Lambda_m; + ConstRxE *= Lambda_me2; + ConstRzE *= Lambda_me2; + } + + double z = (RadAccessData.zStart - RadAccessData.zc) + (iz * RadAccessData.zStep); + double PhaseAddZ = 0; + if (RadAccessData.WfrQuadTermCanBeTreatedAtResizeZ) PhaseAddZ = ConstRzE * z * z; + + double x = (RadAccessData.xStart - RadAccessData.xc) + (ix * RadAccessData.xStep); + double Phase = PhaseAddZ; + if (RadAccessData.WfrQuadTermCanBeTreatedAtResizeX) Phase += ConstRxE * x * x; + + float SinPh, CosPh; + sincosf(Phase, &SinPh, &CosPh); + + long long PerX = RadAccessData.ne << 1; + long long PerZ = PerX * RadAccessData.nx; + long long offset = ie * 2 + iz * PerZ + ix * PerX; + + if (TreatPolCompX) + { + float* pExRe = RadAccessData.pBaseRadX + offset; + float* pExIm = pExRe + 1; + double ExReNew = (*pExRe) * CosPh - (*pExIm) * SinPh; + double ExImNew = (*pExRe) * SinPh + (*pExIm) * CosPh; + *pExRe = (float)ExReNew; *pExIm = (float)ExImNew; + } + if (TreatPolCompZ) + { + float* pEzRe = RadAccessData.pBaseRadZ + offset; + float* pEzIm = pEzRe + 1; + double EzReNew = (*pEzRe) * CosPh - (*pEzIm) * SinPh; + double EzImNew = (*pEzRe) * SinPh + (*pEzIm) * CosPh; + *pEzRe = (float)EzReNew; *pEzIm = (float)EzImNew; + } + } +} + +void srTGenOptElem::TreatStronglyOscillatingTerm_GPU(srTSRWRadStructAccessData& RadAccessData, bool TreatPolCompX, bool TreatPolCompZ, double ConstRx, double ConstRz, int ieStart, int ieBefEnd, TGPUUsageArg* pGPU) +{ + if (RadAccessData.pBaseRadX != NULL) + { + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadX); + 
} + if (RadAccessData.pBaseRadZ != NULL) + { + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadZ); + } + + const int bs = 256; + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz, ieBefEnd - ieStart); + dim3 threads(bs, 1); + TreatStronglyOscillatingTerm_Kernel<<<blocks, threads>>>(RadAccessData, TreatPolCompX, TreatPolCompZ, ConstRx, ConstRz, ieStart); + + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); + +#ifndef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +__global__ void MakeWfrEdgeCorrection_Kernel(srTSRWRadStructAccessData RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr DataPtrs, float dxSt, float dxFi, float dzSt, float dzFi) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz) + { + //float dxSt = (float)DataPtrs.dxSt; + //float dxFi = (float)DataPtrs.dxFi; + //float dzSt =
(float)DataPtrs.dzSt; + //float dzFi = (float)DataPtrs.dzFi; + float dxSt_dzSt = dxSt * dzSt; + float dxSt_dzFi = dxSt * dzFi; + float dxFi_dzSt = dxFi * dzSt; + float dxFi_dzFi = dxFi * dzFi; + + long TwoNz = RadAccessData.nz << 1; + long PerX = 2; + long PerZ = PerX * RadAccessData.nx; + + float fSSExRe = DataPtrs.fxStzSt[0]; + float fSSExIm = DataPtrs.fxStzSt[1]; + float fSSEzRe = DataPtrs.fxStzSt[2]; + float fSSEzIm = DataPtrs.fxStzSt[3]; + + float fFSExRe = DataPtrs.fxFizSt[0]; + float fFSExIm = DataPtrs.fxFizSt[1]; + float fFSEzRe = DataPtrs.fxFizSt[2]; + float fFSEzIm = DataPtrs.fxFizSt[3]; + + float fSFExRe = DataPtrs.fxStzFi[0]; + float fSFExIm = DataPtrs.fxStzFi[1]; + float fSFEzRe = DataPtrs.fxStzFi[2]; + float fSFEzIm = DataPtrs.fxStzFi[3]; + + float fFFExRe = DataPtrs.fxFizFi[0]; + float fFFExIm = DataPtrs.fxFizFi[1]; + float fFFEzRe = DataPtrs.fxFizFi[2]; + float fFFEzIm = DataPtrs.fxFizFi[3]; + + float bRe, bIm, cRe, cIm; + + long long Two_iz = iz << 1; + long long Two_iz_p_1 = Two_iz + 1; + long long Two_ix = ix << 1; + long long Two_ix_p_1 = Two_ix + 1; + + float* tEx = pDataEx + iz * PerZ + ix * PerX, * tEz = pDataEz + iz * PerZ + ix * PerX; + float ExRe = *tEx, ExIm = *(tEx + 1); + float EzRe = *tEz, EzIm = *(tEz + 1); + + if (dxSt != 0.f) + { + float ExpXStRe = DataPtrs.ExpArrXSt[Two_ix], ExpXStIm = DataPtrs.ExpArrXSt[Two_ix_p_1]; + + bRe = DataPtrs.FFTArrXStEx[Two_iz]; bIm = DataPtrs.FFTArrXStEx[Two_iz_p_1]; + ExRe += (float)(dxSt * (ExpXStRe * bRe - ExpXStIm * bIm)); + ExIm += (float)(dxSt * (ExpXStRe * bIm + ExpXStIm * bRe)); + + bRe = DataPtrs.FFTArrXStEz[Two_iz]; bIm = DataPtrs.FFTArrXStEz[Two_iz_p_1]; + EzRe += (float)(dxSt * (ExpXStRe * bRe - ExpXStIm * bIm)); + EzIm += (float)(dxSt * (ExpXStRe * bIm + ExpXStIm * bRe)); + + if (dzSt != 0.f) + { + bRe = DataPtrs.ExpArrZSt[Two_iz], bIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + cRe = ExpXStRe * bRe - ExpXStIm * bIm; cIm = ExpXStRe * bIm + ExpXStIm * bRe; + + ExRe += (float)(dxSt_dzSt * (fSSExRe * 
cRe - fSSExIm * cIm)); + ExIm += (float)(dxSt_dzSt * (fSSExRe * cIm + fSSExIm * cRe)); + EzRe += (float)(dxSt_dzSt * (fSSEzRe * cRe - fSSEzIm * cIm)); + EzIm += (float)(dxSt_dzSt * (fSSEzRe * cIm + fSSEzIm * cRe)); + } + if (dzFi != 0.f) + { + bRe = DataPtrs.ExpArrZFi[Two_iz], bIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + cRe = ExpXStRe * bRe - ExpXStIm * bIm; cIm = ExpXStRe * bIm + ExpXStIm * bRe; + + ExRe -= (float)(dxSt_dzFi * (fSFExRe * cRe - fSFExIm * cIm)); + ExIm -= (float)(dxSt_dzFi * (fSFExRe * cIm + fSFExIm * cRe)); + EzRe -= (float)(dxSt_dzFi * (fSFEzRe * cRe - fSFEzIm * cIm)); + EzIm -= (float)(dxSt_dzFi * (fSFEzRe * cIm + fSFEzIm * cRe)); + } + } + if (dxFi != 0.f) + { + float ExpXFiRe = DataPtrs.ExpArrXFi[Two_ix], ExpXFiIm = DataPtrs.ExpArrXFi[Two_ix_p_1]; + + bRe = DataPtrs.FFTArrXFiEx[Two_iz]; bIm = DataPtrs.FFTArrXFiEx[Two_iz_p_1]; + ExRe -= (float)(dxFi * (ExpXFiRe * bRe - ExpXFiIm * bIm)); + ExIm -= (float)(dxFi * (ExpXFiRe * bIm + ExpXFiIm * bRe)); + + bRe = DataPtrs.FFTArrXFiEz[Two_iz]; bIm = DataPtrs.FFTArrXFiEz[Two_iz_p_1]; + EzRe -= (float)(dxFi * (ExpXFiRe * bRe - ExpXFiIm * bIm)); + EzIm -= (float)(dxFi * (ExpXFiRe * bIm + ExpXFiIm * bRe)); + + if (dzSt != 0.f) + { + bRe = DataPtrs.ExpArrZSt[Two_iz], bIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + cRe = ExpXFiRe * bRe - ExpXFiIm * bIm; cIm = ExpXFiRe * bIm + ExpXFiIm * bRe; + + ExRe -= (float)(dxFi_dzSt * (fFSExRe * cRe - fFSExIm * cIm)); + ExIm -= (float)(dxFi_dzSt * (fFSExRe * cIm + fFSExIm * cRe)); + EzRe -= (float)(dxFi_dzSt * (fFSEzRe * cRe - fFSEzIm * cIm)); + EzIm -= (float)(dxFi_dzSt * (fFSEzRe * cIm + fFSEzIm * cRe)); + } + if (dzFi != 0.f) + { + bRe = DataPtrs.ExpArrZFi[Two_iz], bIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + cRe = ExpXFiRe * bRe - ExpXFiIm * bIm; cIm = ExpXFiRe * bIm + ExpXFiIm * bRe; + + ExRe += (float)(dxFi_dzFi * (fFFExRe * cRe - fFFExIm * cIm)); + ExIm += (float)(dxFi_dzFi * (fFFExRe * cIm + fFFExIm * cRe)); + EzRe += (float)(dxFi_dzFi * (fFFEzRe * cRe - fFFEzIm * cIm)); + EzIm 
+= (float)(dxFi_dzFi * (fFFEzRe * cIm + fFFEzIm * cRe)); + } + } + if (dzSt != 0.f) + { + float ExpZStRe = DataPtrs.ExpArrZSt[Two_iz], ExpZStIm = DataPtrs.ExpArrZSt[Two_iz_p_1]; + + bRe = DataPtrs.FFTArrZStEx[Two_ix]; bIm = DataPtrs.FFTArrZStEx[Two_ix_p_1]; + ExRe += (float)(dzSt * (ExpZStRe * bRe - ExpZStIm * bIm)); + ExIm += (float)(dzSt * (ExpZStRe * bIm + ExpZStIm * bRe)); + + bRe = DataPtrs.FFTArrZStEz[Two_ix]; bIm = DataPtrs.FFTArrZStEz[Two_ix_p_1]; + EzRe += (float)(DataPtrs.dzSt * (ExpZStRe * bRe - ExpZStIm * bIm)); + EzIm += (float)(DataPtrs.dzSt * (ExpZStRe * bIm + ExpZStIm * bRe)); + } + if (dzFi != 0.f) + { + float ExpZFiRe = DataPtrs.ExpArrZFi[Two_iz], ExpZFiIm = DataPtrs.ExpArrZFi[Two_iz_p_1]; + + bRe = DataPtrs.FFTArrZFiEx[Two_ix]; bIm = DataPtrs.FFTArrZFiEx[Two_ix_p_1]; + ExRe -= (float)(dzFi * (ExpZFiRe * bRe - ExpZFiIm * bIm)); + ExIm -= (float)(dzFi * (ExpZFiRe * bIm + ExpZFiIm * bRe)); + + bRe = DataPtrs.FFTArrZFiEz[Two_ix]; bIm = DataPtrs.FFTArrZFiEz[Two_ix_p_1]; + EzRe -= (float)(dzFi * (ExpZFiRe * bRe - ExpZFiIm * bIm)); + EzIm -= (float)(dzFi * (ExpZFiRe * bIm + ExpZFiIm * bRe)); + } + + *tEx = ExRe; *(tEx + 1) = ExIm; + *tEz = EzRe; *(tEz + 1) = EzIm; + } +} + +void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU) +{ + pDataEx = (float*)CAuxGPU::ToDevice(pGPU, pDataEx, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + pDataEz = (float*)CAuxGPU::ToDevice(pGPU, pDataEz, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXStEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXStEx, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXStEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXStEz, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrXFiEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXFiEx, 2*RadAccessData->nz*sizeof(float)); + 
DataPtrs.FFTArrXFiEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrXFiEz, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.FFTArrZStEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZStEx, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZStEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZStEz, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZFiEx = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZFiEx, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.FFTArrZFiEz = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.FFTArrZFiEz, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrXSt = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrXSt, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrXFi = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrXFi, 2*RadAccessData->nx*sizeof(float)); + DataPtrs.ExpArrZSt = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrZSt, 2*RadAccessData->nz*sizeof(float)); + DataPtrs.ExpArrZFi = (float*)CAuxGPU::ToDevice(pGPU, DataPtrs.ExpArrZFi, 2*RadAccessData->nz*sizeof(float)); + + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pDataEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pDataEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXStEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXStEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXFiEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrXFiEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZStEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZStEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZFiEx); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.FFTArrZFiEz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrXSt); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrXFi); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrZSt); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, DataPtrs.ExpArrZFi); + + const int bs = 256; + dim3 blocks(RadAccessData->nx / bs + ((RadAccessData->nx & (bs - 1)) != 0), 
RadAccessData->nz); + dim3 threads(bs, 1); + MakeWfrEdgeCorrection_Kernel <<<blocks, threads>>> (*RadAccessData, pDataEx, pDataEz, DataPtrs, (float)DataPtrs.dxSt, (float)DataPtrs.dxFi, (float)DataPtrs.dzSt, (float)DataPtrs.dzFi); + + DataPtrs.FFTArrXStEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXStEx, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXStEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXStEz, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXFiEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXFiEx, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrXFiEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrXFiEz, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.FFTArrZStEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZStEx, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZStEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZStEz, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZFiEx = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZFiEx, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.FFTArrZFiEz = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.FFTArrZFiEz, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrXSt = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrXSt, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrXFi = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrXFi, 2*RadAccessData->nx*sizeof(float), true); + DataPtrs.ExpArrZSt = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrZSt, 2*RadAccessData->nz*sizeof(float), true); + DataPtrs.ExpArrZFi = (float*)CAuxGPU::ToHostAndFree(pGPU, DataPtrs.ExpArrZFi, 2*RadAccessData->nz*sizeof(float), true); + + CAuxGPU::MarkUpdated(pGPU, pDataEx, true, false); + CAuxGPU::MarkUpdated(pGPU, pDataEz, true, false); + +#ifdef _DEBUG + CAuxGPU::ToHostAndFree(pGPU, pDataEx, 2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + CAuxGPU::ToHostAndFree(pGPU, pDataEz, 
2*RadAccessData->ne*RadAccessData->nx*RadAccessData->nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +template <bool TreatPolCompX, bool TreatPolCompZ> __global__ void RadResizeCore_Kernel(srTSRWRadStructAccessData OldRadAccessData, srTSRWRadStructAccessData NewRadAccessData) +{ + int ixStart = int(NewRadAccessData.AuxLong1); + int ixEnd = int(NewRadAccessData.AuxLong2); + int izStart = int(NewRadAccessData.AuxLong3); + int izEnd = int(NewRadAccessData.AuxLong4); + + int ix = (blockIdx.x * blockDim.x + threadIdx.x) + ixStart; //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y) + izStart; //nz range + int ie = (blockIdx.z * blockDim.z + threadIdx.z); //ne range + + if (ix > ixEnd) return; + if (iz > izEnd) return; + + const double DistAbsTol = 1.E-10; + double xStepInvOld = 1./OldRadAccessData.xStep; + double zStepInvOld = 1./OldRadAccessData.zStep; + int nx_mi_1Old = OldRadAccessData.nx - 1; + int nz_mi_1Old = OldRadAccessData.nz - 1; + int nx_mi_2Old = nx_mi_1Old - 1; + int nz_mi_2Old = nz_mi_1Old - 1; + + //OC31102018: moved by SY at parallelizing SRW via OpenMP + //srTInterpolAux01 InterpolAux01; + //srTInterpolAux02 InterpolAux02[4], InterpolAux02I[2]; + //srTInterpolAuxF AuxF[4], AuxFI[2]; + //int ixStOld, izStOld, ixStOldPrev = -1000, izStOldPrev = -1000; + + //long PerX_New = NewRadAccessData.ne << 1; + //long PerZ_New = PerX_New*NewRadAccessData.nx; + long long PerX_New = NewRadAccessData.ne << 1; + long long PerZ_New = PerX_New*NewRadAccessData.nx; + + //long PerX_Old = PerX_New; + //long PerZ_Old = PerX_Old*OldRadAccessData.nx; + long long PerX_Old = PerX_New; + long long PerZ_Old = PerX_Old*OldRadAccessData.nx; + + float *pEX0_New = 0, *pEZ0_New = 0; + pEX0_New = NewRadAccessData.pBaseRadX; + pEZ0_New = NewRadAccessData.pBaseRadZ; + + float* pEX0_Old = 0, * pEZ0_Old = 0; + pEX0_Old = OldRadAccessData.pBaseRadX; + pEZ0_Old = OldRadAccessData.pBaseRadZ; + + + int ixStOld, izStOld, ixStOldPrev = 
-1000, izStOldPrev = -1000; + //SY: do we need this (always returns 0, updates some clock) + //if(result = srYield.Check()) return result; + + double zAbs = NewRadAccessData.zStart + iz * NewRadAccessData.zStep; + + char FieldShouldBeZeroedDueToZ = 0; + if (NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if ((zAbs < NewRadAccessData.zWfrMin - DistAbsTol) || (zAbs > NewRadAccessData.zWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToZ = 1; + } + + int izcOld = int((zAbs - OldRadAccessData.zStart) * zStepInvOld + 1.E-06); + + double zRel = zAbs - (OldRadAccessData.zStart + izcOld * OldRadAccessData.zStep); + + if (izcOld == nz_mi_1Old) { izStOld = izcOld - 3; zRel += 2. * OldRadAccessData.zStep; } + else if (izcOld == nz_mi_2Old) { izStOld = izcOld - 2; zRel += OldRadAccessData.zStep; } + else if (izcOld == 0) { izStOld = izcOld; zRel -= OldRadAccessData.zStep; } + else izStOld = izcOld - 1; + + zRel *= zStepInvOld; + + int izcOld_mi_izStOld = izcOld - izStOld; + //long izPerZ_New = iz*PerZ_New; + long long izPerZ_New = iz * PerZ_New; + + double xAbs = NewRadAccessData.xStart + ix * NewRadAccessData.xStep; + + char FieldShouldBeZeroedDueToX = 0; + if (NewRadAccessData.WfrEdgeCorrShouldBeDone) + { + if ((xAbs < NewRadAccessData.xWfrMin - DistAbsTol) || (xAbs > NewRadAccessData.xWfrMax + DistAbsTol)) FieldShouldBeZeroedDueToX = 1; + } + char FieldShouldBeZeroed = (FieldShouldBeZeroedDueToX || FieldShouldBeZeroedDueToZ); + + int ixcOld = int((xAbs - OldRadAccessData.xStart) * xStepInvOld + 1.E-06); + double xRel = xAbs - (OldRadAccessData.xStart + ixcOld * OldRadAccessData.xStep); + + if (ixcOld == nx_mi_1Old) { ixStOld = ixcOld - 3; xRel += 2. 
* OldRadAccessData.xStep; } + else if (ixcOld == nx_mi_2Old) { ixStOld = ixcOld - 2; xRel += OldRadAccessData.xStep; } + else if (ixcOld == 0) { ixStOld = ixcOld; xRel -= OldRadAccessData.xStep; } + else ixStOld = ixcOld - 1; + + xRel *= xStepInvOld; + + int ixcOld_mi_ixStOld = ixcOld - ixStOld; + + //or (int ie = 0; ie < NewRadAccessData.ne; ie++) + { + //OC31102018: modified by SY at OpenMP parallelization + //ixStOldPrev = -1000; izStOldPrev = -1000; + + //OC31102018: moved by SY at OpenMP parallelization + srTInterpolAux01 InterpolAux01; + srTInterpolAux02 InterpolAux02[4], InterpolAux02I[2]; + srTInterpolAuxF AuxF[4], AuxFI[2]; + ixStOldPrev = -1000; izStOldPrev = -1000; + float BufF[4], BufFI[2]; + char UseLowOrderInterp_PolCompX = 0, UseLowOrderInterp_PolCompZ = 0; + + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; + + float* pEX_StartForX_New = 0, * pEZ_StartForX_New = 0; + pEX_StartForX_New = pEX0_New + izPerZ_New; + pEZ_StartForX_New = pEZ0_New + izPerZ_New; + + //long ixPerX_New_p_Two_ie = ix*PerX_New + Two_ie; + long long ixPerX_New_p_Two_ie = ix * PerX_New + Two_ie; + float* pEX_New = 0, * pEZ_New = 0; + pEX_New = pEX_StartForX_New + ixPerX_New_p_Two_ie; + pEZ_New = pEZ_StartForX_New + ixPerX_New_p_Two_ie; + + //long TotOffsetOld = izStOld*PerZ_Old + ixStOld*PerX_Old + Two_ie; + long long TotOffsetOld = izStOld * PerZ_Old + ixStOld * PerX_Old + Two_ie; + + if (TreatPolCompX) + { + float* pExSt_Old = pEX0_Old + TotOffsetOld; + srTGenOptElem::GetCellDataForInterpol(pExSt_Old, PerX_Old, PerZ_Old, AuxF); + + srTGenOptElem::SetupCellDataI(AuxF, AuxFI); + UseLowOrderInterp_PolCompX = srTGenOptElem::CheckForLowOrderInterp(AuxF, AuxFI, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02, InterpolAux02I); + + if (!UseLowOrderInterp_PolCompX) + { + for (int i = 0; i < 2; i++) + { + srTGenOptElem::SetupInterpolAux02(AuxF + i, &InterpolAux01, InterpolAux02 + i); + } + srTGenOptElem::SetupInterpolAux02(AuxFI, &InterpolAux01, InterpolAux02I); 
+ } + + if (UseLowOrderInterp_PolCompX) + { + srTGenOptElem::InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 0); + srTGenOptElem::InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 0); + } + else + { + srTGenOptElem::InterpolF(InterpolAux02, xRel, zRel, BufF, 0); + srTGenOptElem::InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 0); + } + + (*BufFI) *= AuxFI->fNorm; + srTGenOptElem::ImproveReAndIm(BufF, BufFI); + + if (FieldShouldBeZeroed) + { + *BufF = 0.; *(BufF + 1) = 0.; + } + + *pEX_New = *BufF; + *(pEX_New + 1) = *(BufF + 1); + } + if (TreatPolCompZ) + { + float* pEzSt_Old = pEZ0_Old + TotOffsetOld; + srTGenOptElem::GetCellDataForInterpol(pEzSt_Old, PerX_Old, PerZ_Old, AuxF + 2); + + srTGenOptElem::SetupCellDataI(AuxF + 2, AuxFI + 1); + UseLowOrderInterp_PolCompZ = srTGenOptElem::CheckForLowOrderInterp(AuxF + 2, AuxFI + 1, ixcOld_mi_ixStOld, izcOld_mi_izStOld, &InterpolAux01, InterpolAux02 + 2, InterpolAux02I + 1); + + if (!UseLowOrderInterp_PolCompZ) + { + for (int i = 0; i < 2; i++) + { + srTGenOptElem::SetupInterpolAux02(AuxF + 2 + i, &InterpolAux01, InterpolAux02 + 2 + i); + } + srTGenOptElem::SetupInterpolAux02(AuxFI + 1, &InterpolAux01, InterpolAux02I + 1); + } + + if (UseLowOrderInterp_PolCompZ) + { + srTGenOptElem::InterpolF_LowOrder(InterpolAux02, xRel, zRel, BufF, 2); + srTGenOptElem::InterpolFI_LowOrder(InterpolAux02I, xRel, zRel, BufFI, 1); + } + else + { + srTGenOptElem::InterpolF(InterpolAux02, xRel, zRel, BufF, 2); + srTGenOptElem::InterpolFI(InterpolAux02I, xRel, zRel, BufFI, 1); + } + + (*(BufFI + 1)) *= (AuxFI + 1)->fNorm; + srTGenOptElem::ImproveReAndIm(BufF + 2, BufFI + 1); + + if (FieldShouldBeZeroed) + { + *(BufF + 2) = 0.; *(BufF + 3) = 0.; + } + + *pEZ_New = *(BufF + 2); + *(pEZ_New + 1) = *(BufF + 3); + } + } +} + +int srTGenOptElem::RadResizeCore_GPU(srTSRWRadStructAccessData& OldRadAccessData, srTSRWRadStructAccessData& NewRadAccessData, char PolComp, TGPUUsageArg* pGPU) +{ + char TreatPolCompX = ((PolComp == 0) || (PolComp == 
'x')); + char TreatPolCompZ = ((PolComp == 0) || (PolComp == 'z')); + + int nx = NewRadAccessData.AuxLong2 - NewRadAccessData.AuxLong1 + 1; + int nz = NewRadAccessData.AuxLong4 - NewRadAccessData.AuxLong3 + 1; + int ne = NewRadAccessData.ne; + OldRadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, OldRadAccessData.pBaseRadX, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float)); + OldRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, OldRadAccessData.pBaseRadZ, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float)); + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), true); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), true); + + CAuxGPU::EnsureDeviceMemoryReady(pGPU, OldRadAccessData.pBaseRadX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, OldRadAccessData.pBaseRadZ); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, NewRadAccessData.pBaseRadX); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, NewRadAccessData.pBaseRadZ); + + const int bs = 32; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz, ne); + dim3 threads(bs, 1); + + if (TreatPolCompX && TreatPolCompZ) RadResizeCore_Kernel<true, true> <<<blocks, threads>>> (OldRadAccessData, NewRadAccessData); + else if (TreatPolCompX) RadResizeCore_Kernel<true, false> <<<blocks, threads>>> (OldRadAccessData, NewRadAccessData); + else if (TreatPolCompZ) RadResizeCore_Kernel<false, true> <<<blocks, threads>>> (OldRadAccessData, NewRadAccessData); + + OldRadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, OldRadAccessData.pBaseRadX, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float), true); + OldRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, OldRadAccessData.pBaseRadZ, 2*OldRadAccessData.ne*OldRadAccessData.nx*OldRadAccessData.nz*sizeof(float), true); + 
//NewRadAccessData.pBaseRadX = CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float)); + //NewRadAccessData.pBaseRadZ = CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float)); + CAuxGPU::MarkUpdated(pGPU, NewRadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, NewRadAccessData.pBaseRadZ, true, false); +#ifndef _DEBUG + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, NewRadAccessData.pBaseRadX); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, NewRadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + NewRadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadX, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), false); + NewRadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, NewRadAccessData.pBaseRadZ, 2*NewRadAccessData.ne*NewRadAccessData.nx*NewRadAccessData.nz*sizeof(float), false); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); + +#endif + + return 0; +} + +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptelm_gpu.h b/cpp/src/core/sroptelm_gpu.h new file mode 100644 index 00000000..629e0c42 --- /dev/null +++ b/cpp/src/core/sroptelm_gpu.h @@ -0,0 +1,123 @@ +/************************************************************************//** + * File: sroptelm_gpu.h + * Description: Optical element (general CUDA header) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#ifndef __SROPTELMGPU_H +#define __SROPTELMGPU_H + +#include "cuda_runtime.h" +#include +#include +#include + +#ifdef __CUDACC__ 
+template <typename T> __global__ void RadPointModifierParallel_Kernel(srTSRWRadStructAccessData RadAccessData, void* pBufVars, T* tgt_obj) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz) + { + srTEFieldPtrs EPtrs; + srTEXZ EXZ; + EXZ.z = RadAccessData.zStart + iz * RadAccessData.zStep; + EXZ.x = RadAccessData.xStart + ix * RadAccessData.xStep; + + for (int ie = 0; ie < RadAccessData.ne; ie++) { + EXZ.e = RadAccessData.eStart + ie * RadAccessData.eStep; + EXZ.aux_offset = RadAccessData.ne * RadAccessData.nx * 2 * iz + RadAccessData.ne * 2 * ix + ie * 2; + if (RadAccessData.pBaseRadX != 0) + { + EPtrs.pExRe = RadAccessData.pBaseRadX + EXZ.aux_offset; + EPtrs.pExIm = EPtrs.pExRe + 1; + } + else + { + EPtrs.pExRe = 0; + EPtrs.pExIm = 0; + } + if (RadAccessData.pBaseRadZ != 0) + { + EPtrs.pEzRe = RadAccessData.pBaseRadZ + EXZ.aux_offset; + EPtrs.pEzIm = EPtrs.pEzRe + 1; + } + else + { + EPtrs.pEzRe = 0; + EPtrs.pEzIm = 0; + } + + tgt_obj->RadPointModifierPortable(EXZ, EPtrs, pBufVars); + } + } +} + +template <typename T> int RadPointModifierParallelImpl(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, T* tgt_obj, TGPUUsageArg* pGPU) +{ + const int bs = 256; + dim3 blocks(pRadAccessData->nx / bs + ((pRadAccessData->nx & (bs - 1)) != 0), pRadAccessData->nz); + dim3 threads(bs, 1); + + if (pRadAccessData->pBaseRadX != NULL) + { + pRadAccessData->pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, pRadAccessData->pBaseRadX, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pRadAccessData->pBaseRadX); + } + if (pRadAccessData->pBaseRadZ != NULL) + { + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, pRadAccessData->pBaseRadZ, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pRadAccessData->pBaseRadZ); + } 
+ + T* local_copy = (T*)CAuxGPU::ToDevice(pGPU, tgt_obj, sizeof(T)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, local_copy); + //cudaMalloc(&local_copy, sizeof(T)); + //cudaMemcpy(local_copy, tgt_obj, sizeof(T), cudaMemcpyHostToDevice); + + void* pBufVars_dev = NULL; + if (pBufVarsSz > 0){ + pBufVars_dev = CAuxGPU::ToDevice(pGPU, pBufVars, pBufVarsSz); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBufVars_dev); + } + RadPointModifierParallel_Kernel <<<blocks, threads>>> (*pRadAccessData, pBufVars_dev, local_copy); + //cudaDeviceSynchronize(); + //cudaFreeAsync(local_copy, 0); + if (pBufVarsSz > 0) CAuxGPU::ToHostAndFree(pGPU, pBufVars_dev, pBufVarsSz, true); + CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(T), true); + + CAuxGPU::MarkUpdated(pGPU, pRadAccessData->pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, pRadAccessData->pBaseRadZ, true, false); + +#ifndef _DEBUG + if (pRadAccessData->pBaseRadX != NULL) + pRadAccessData->pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, pRadAccessData->pBaseRadX); + if (pRadAccessData->pBaseRadZ != NULL) + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, pRadAccessData->pBaseRadZ); +#endif + +#ifdef _DEBUG + if (pRadAccessData->pBaseRadX != NULL) + pRadAccessData->pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, pRadAccessData->pBaseRadX, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + if (pRadAccessData->pBaseRadZ != NULL) + pRadAccessData->pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, pRadAccessData->pBaseRadZ, 2*pRadAccessData->ne*pRadAccessData->nx*pRadAccessData->nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif + + return 0; +} +#endif + +#endif //__SROPTELMGPU_H +#endif \ No newline at end of file diff --git a/cpp/src/core/sroptfoc.h index d2a05579..f950a775 100644 --- a/cpp/src/core/sroptfoc.h +++ b/cpp/src/core/sroptfoc.h @@ -153,7 +153,8 @@ class srTThinLens : public srTFocusingElem 
{ srTThinLens() {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -197,11 +198,14 @@ class srTThinLens : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 return 0; } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) diff --git a/cpp/src/core/sroptgrat.h b/cpp/src/core/sroptgrat.h index 487f805f..2c761663 100644 --- a/cpp/src/core/sroptgrat.h +++ b/cpp/src/core/sroptgrat.h @@ -96,7 +96,8 @@ class srTGrating : public srTShapedOptElem { m_PropWfrInPlace = true; //OC151008 
//previous electric field is NOT necessary for the propagation } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //char &MethNo = ParPrecWfrPropag.MethNo; SetupPropBufVars_Gen(pRadAccessData); diff --git a/cpp/src/core/sroptgtr.cpp b/cpp/src/core/sroptgtr.cpp index 7f348172..96681032 100644 --- a/cpp/src/core/sroptgtr.cpp +++ b/cpp/src/core/sroptgtr.cpp @@ -1172,7 +1172,7 @@ int srTGenTransmission::DetermineFocalDistByPropag1D(srTRadSect1D& Sect1D, doubl } //************************************************************************* - +/* HG01122023 Moved to header file to reduce code duplication for GPU support void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBufVars) //OC29082019 //void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) {// e in eV; Length in m !!! 
@@ -1338,7 +1338,7 @@ void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, voi float NewEzIm = (float)(T*((*(EPtrs.pEzRe))*SinPh + (*(EPtrs.pEzIm))*CosPh)); *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; } -} +} */ //************************************************************************* diff --git a/cpp/src/core/sroptgtr.h b/cpp/src/core/sroptgtr.h index 0cde61e5..34052e34 100644 --- a/cpp/src/core/sroptgtr.h +++ b/cpp/src/core/sroptgtr.h @@ -50,6 +50,8 @@ class srTGenTransmission : public srTFocusingElem { } } + int SupportedFeatures() override { return 1; } //HG01122023 =1 means that it supports GPU propagation + void EnsureTransmissionForField(); double DetermineAppropriatePhotEnergyForFocDistTest(double Rx, double Rz); int EstimateFocalDistancesAndCheckSampling(); @@ -79,7 +81,8 @@ class srTGenTransmission : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterArr) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterArr, void* pvGPU=0) //HG01122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -90,7 +93,8 @@ class srTGenTransmission : public srTFocusingElem { int result = 0; - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG01122023 else result = PropagateRadiationMeth_2(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterArr); 
//if(ParPrecWfrPropag.AnalTreatment == 1) @@ -104,25 +108,30 @@ class srTGenTransmission : public srTFocusingElem { //int PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAccessData) //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE) + //int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE) + int PropagateRadiationSingleE_Meth_0(srTSRWRadStructAccessData* pRadAccessData, srTSRWRadStructAccessData* pPrevRadDataSingleE, void* pvGPU) //HG01122023 { int result; if(result = PropagateRadMoments(pRadAccessData, 0)) return result; if(result = PropagateWaveFrontRadius(pRadAccessData)) return result; //if(result = PropagateRadiationSimple(pRadAccessData, pBuf)) return result; //OC06092019 //OC01102019 (restored) - if(result = PropagateRadiationSimple(pRadAccessData)) return result; + //if(result = PropagateRadiationSimple(pRadAccessData)) return result; + if(result = PropagateRadiationSimple(pRadAccessData, pvGPU)) return result; //HG01122023 if(result = Propagate4x4PropMatr(pRadAccessData)) return result; return 0; } //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG01122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - return TraverseRadZXE(pRadAccessData); + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + 
if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG01122023 + //return TraverseRadZXE(pRadAccessData); + return TraverseRadZXE(pRadAccessData, 0, 0, pvGPU); //HG01122023 } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) { @@ -131,8 +140,186 @@ class srTGenTransmission : public srTFocusingElem { return TraverseRad1D(pSect1D); } - void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC29082019 + void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0) //OC29082019 //HG01122023 + { + RadPointModifierPortable(EXZ, EPtrs, pBuf); + } + //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC29082019 //void RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs); + + +#ifdef _OFFLOAD_GPU //HG01122023 Brought from sroptgtr.cpp, to reduce code duplication for GPU port + int RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars=0, long pBufVarsSz=0, TGPUUsageArg* pGPU=0) override; + + GPU_PORTABLE +#endif + void RadPointModifierPortable(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBufVars) //OC29082019 + //void srTGenTransmission::RadPointModifier(srTEXZ& EXZ, srTEFieldPtrs& EPtrs) + {// e in eV; Length in m !!! + // Operates on Coord. side !!! 
+ //double xRel = EXZ.x - TransvCenPoint.x, zRel = EXZ.z - TransvCenPoint.y; + double xRel = EXZ.x, zRel = EXZ.z; //OC080311 + + long Ne = 1, Nemi2 = -1; + long iDimX = 0, iDimZ = 1; + if(GenTransNumData.AmOfDims == 3) + { + //Ne = (GenTransNumData.DimSizes)[0]; + Ne = (long)((GenTransNumData.DimSizes)[0]); //OC28042019 + Nemi2 = Ne - 2; + iDimX = 1; iDimZ = 2; + } + + //long Nx = (GenTransNumData.DimSizes)[0], Nz = (GenTransNumData.DimSizes)[1]; + //long Nx = (GenTransNumData.DimSizes)[iDimX], Nz = (GenTransNumData.DimSizes)[iDimZ]; //OC241112 + long Nx = (long)((GenTransNumData.DimSizes)[iDimX]), Nz = (long)((GenTransNumData.DimSizes)[iDimZ]); //OC28042019 + long Nxmi2 = Nx - 2, Nzmi2 = Nz - 2; + + //double xStart = (GenTransNumData.DimStartValues)[0], zStart = (GenTransNumData.DimStartValues)[1]; + //double xStep = (GenTransNumData.DimSteps)[0], zStep = (GenTransNumData.DimSteps)[1]; + double xStart = (GenTransNumData.DimStartValues)[iDimX], zStart = (GenTransNumData.DimStartValues)[iDimZ]; + double xStep = (GenTransNumData.DimSteps)[iDimX], zStep = (GenTransNumData.DimSteps)[iDimZ]; + + double xEnd = xStart + (Nx - 1)*xStep, zEnd = zStart + (Nz - 1)*zStep; + + double AbsTolX = xStep*0.001, AbsTolZ = zStep*0.001; // To steer + if(OuterTransmIs == 1) + { + if((xRel < xStart - AbsTolX) || (xRel > xEnd + AbsTolX) || (zRel < zStart - AbsTolZ) || (zRel > zEnd + AbsTolZ)) + { + if(EPtrs.pExRe != 0) { *(EPtrs.pExRe) = 0.; *(EPtrs.pExIm) = 0.;} + if(EPtrs.pEzRe != 0) { *(EPtrs.pEzRe) = 0.; *(EPtrs.pEzIm) = 0.;} + return; + } + } + + double xr = 0., zr = 0.; + double T = 1., Ph = 0.; + //char NotExactRightEdgeX = 1, NotExactRightEdgeZ = 1; + + long ix = long((xRel - xStart)/xStep); + if(::fabs(xRel - ((ix + 1)*xStep + xStart)) < 1.E-05*xStep) ix++; + + //if(ix < 0) { ix = 0; xr = 0.;} + //else if(ix > Nxmi2) { ix = Nx - 1; xr = 0.; NotExactRightEdgeX = 0;} + //else xr = (xRel - (ix*xStep + xStart))/xStep; + + if(ix < 0) ix = 0; //OC241112 + //else if(ix > Nxmi2) ix = 
Nxmi2; + //xr = (xRel - (ix*xStep + xStart))/xStep; + else if(ix > Nxmi2) { ix = Nxmi2; xr = 1.;} + else xr = (xRel - (ix*xStep + xStart))/xStep; + + long iz = long((zRel - zStart)/zStep); + if(::fabs(zRel - ((iz + 1)*zStep + zStart)) < 1.E-05*zStep) iz++; + + //if(iz < 0) { iz = 0; zr = 0.;} + //else if(iz > Nzmi2) { iz = Nz - 1; zr = 0.; NotExactRightEdgeZ = 0;} + //else zr = (zRel - (iz*zStep + zStart))/zStep; + + if(iz < 0) iz = 0; + //else if(iz > Nzmi2) iz = Nzmi2; + //zr = (zRel - (iz*zStep + zStart))/zStep; + else if(iz > Nzmi2) { iz = Nzmi2; zr = 1.;} + else zr = (zRel - (iz*zStep + zStart))/zStep; + + double xrzr = xr*zr; + if((GenTransNumData.AmOfDims == 2) || ((GenTransNumData.AmOfDims == 3) && (Ne == 1))) + { + //long zPer = Nx << 1; + long long zPer = Nx << 1; + + //DOUBLE *p00 = (DOUBLE*)(GenTransNumData.pData) + (iz*zPer + (ix << 1)); + //DOUBLE *p10 = p00 + 2, *p01 = p00 + zPer; + //DOUBLE *p11 = p01 + 2; + //DOUBLE *p00p1 = p00+1, *p10p1 = p10+1, *p01p1 = p01+1, *p11p1 = p11+1; + double *p00 = (double*)(GenTransNumData.pData) + (iz*zPer + (ix << 1)); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + double *p10 = p00 + 2, *p01 = p00 + zPer; + double *p11 = p01 + 2; + double *p00p1 = p00+1, *p10p1 = p10+1, *p01p1 = p01+1, *p11p1 = p11+1; + + //double Axz = 0., Ax = 0., Az = 0., Bxz = 0., Bx = 0., Bz = 0.; + //if(NotExactRightEdgeX && NotExactRightEdgeZ) { Axz = *p00 - *p01 - *p10 + *p11; Bxz = *p00p1 - *p01p1 - *p10p1 + *p11p1;} + //if(NotExactRightEdgeX) { Ax = (*p10 - *p00); Bx = (*p10p1 - *p00p1);} + //if(NotExactRightEdgeZ) { Az = (*p01 - *p00); Bz = (*p01p1 - *p00p1);} + + double Axz = *p00 - *p01 - *p10 + *p11, Bxz = *p00p1 - *p01p1 - *p10p1 + *p11p1; + double Ax = (*p10 - *p00), Bx = (*p10p1 - *p00p1); + double Az = (*p01 - *p00), Bz = (*p01p1 - *p00p1); + + T = Axz*xrzr + Ax*xr + Az*zr + *p00; + Ph = Bxz*xrzr + Bx*xr + Bz*zr + *p00p1; + + //OCTEST 04032019 + //T = *p00 + Ax*xr + Az*zr; + //Ph = *p00p1 + Bx*xr + Bz*zr; + + //OCTEST 
05032019 + //T = CGenMathInterp::InterpOnRegMesh2d(EXZ.x, EXZ.z, xStart, xStep, Nx, zStart, zStep, Nz, (double*)(GenTransNumData.pData), 3, 2); + //Ph = CGenMathInterp::InterpOnRegMesh2d(EXZ.x, EXZ.z, xStart, xStep, Nx, zStart, zStep, Nz, (double*)(GenTransNumData.pData) + 1, 3, 2); + //END OCTEST + } + else if(GenTransNumData.AmOfDims == 3) + {//bi-linear 3D interpolation + double eStart = (GenTransNumData.DimStartValues)[0]; + double eStep = (GenTransNumData.DimSteps)[0]; + + long ie = long((EXZ.e - eStart)/eStep + 1.e-10); + if(ie < 0) ie = 0; + else if(ie > Nemi2) ie = Nemi2; + + double er = (EXZ.e - (ie*eStep + eStart))/eStep; + //double erxr = er*xr, erzr = er*zr; + //double erxrzr = erxr*zr; + + //long xPer = Ne << 1; + //long zPer = Nx*xPer; + long long xPer = Ne << 1; + long long zPer = Nx*xPer; + //DOUBLE *p000 = (DOUBLE*)(GenTransNumData.pData) + (iz*zPer + ix*xPer + (ie << 1)); + //DOUBLE *p100 = p000 + 2, *p010 = p000 + xPer, *p001 = p000 + zPer; + //DOUBLE *p110 = p100 + xPer, *p101 = p100 + zPer, *p011 = p010 + zPer; + //DOUBLE *p111 = p110 + zPer; + double *p000 = (double*)(GenTransNumData.pData) + (iz*zPer + ix*xPer + (ie << 1)); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + double *p100 = p000 + 2, *p010 = p000 + xPer, *p001 = p000 + zPer; + double *p110 = p100 + xPer, *p101 = p100 + zPer, *p011 = p010 + zPer; + double *p111 = p110 + zPer; + + double one_mi_er = 1.- er, one_mi_xr = 1.- xr, one_mi_zr = 1.- zr; + double one_mi_er_one_mi_xr = one_mi_er*one_mi_xr, er_one_mi_xr = er*one_mi_xr; + double one_mi_er_xr = one_mi_er*xr, er_xr = er*xr; + T = ((*p000)*one_mi_er_one_mi_xr + (*p100)*er_one_mi_xr + (*p010)*one_mi_er_xr + (*p110)*er_xr)*one_mi_zr + + ((*p001)*one_mi_er_one_mi_xr + (*p101)*er_one_mi_xr + (*p011)*one_mi_er_xr + (*p111)*er_xr)*zr; + Ph = ((*(p000+1))*one_mi_er_one_mi_xr + (*(p100+1))*er_one_mi_xr + (*(p010+1))*one_mi_er_xr + (*(p110+1))*er_xr)*one_mi_zr + + ((*(p001+1))*one_mi_er_one_mi_xr + (*(p101+1))*er_one_mi_xr + 
(*(p011+1))*one_mi_er_xr + (*(p111+1))*er_xr)*zr; + + // inArFunc[] = {f(x0,y0,z0),f(x1,y0,z0),f(x0,y1,z0),f(x0,y0,z1),f(x1,y1,z0),f(x1,y0,z1),f(x0,y1,z1),f(x1,y1,z1)} //function values at the corners of the cube + //return inArFunc[0]*one_mi_xt*one_mi_yt*one_mi_zt + // + inArFunc[1]*xt*one_mi_yt*one_mi_zt + // + inArFunc[2]*one_mi_xt*yt*one_mi_zt + // + inArFunc[3]*one_mi_xt*one_mi_yt*zt + // + inArFunc[4]*xt*yt*one_mi_zt + // + inArFunc[5]*xt*one_mi_yt*zt + // + inArFunc[6]*one_mi_xt*yt*zt + // + inArFunc[7]*xt*yt*zt; + } + + if(OptPathOrPhase == 1) Ph *= EXZ.e*5.0676816042E+06; // TwoPi_d_Lambda_m + float CosPh, SinPh; CosAndSin(Ph, CosPh, SinPh); + if(EPtrs.pExRe != 0) + { + float NewExRe = (float)(T*((*(EPtrs.pExRe))*CosPh - (*(EPtrs.pExIm))*SinPh)); + float NewExIm = (float)(T*((*(EPtrs.pExRe))*SinPh + (*(EPtrs.pExIm))*CosPh)); + *(EPtrs.pExRe) = NewExRe; *(EPtrs.pExIm) = NewExIm; + } + if(EPtrs.pEzRe != 0) + { + float NewEzRe = (float)(T*((*(EPtrs.pEzRe))*CosPh - (*(EPtrs.pEzIm))*SinPh)); + float NewEzIm = (float)(T*((*(EPtrs.pEzRe))*SinPh + (*(EPtrs.pEzIm))*CosPh)); + *(EPtrs.pEzRe) = NewEzRe; *(EPtrs.pEzIm) = NewEzIm; + } + } + void RadPointModifier1D(srTEXZ& EXZ, srTEFieldPtrs& EPtrs, void* pBuf=0); //OC06092019 //void RadPointModifier1D(srTEXZ& EXZ, srTEFieldPtrs& EPtrs); diff --git a/cpp/src/core/sroptgtr_gpu.cu b/cpp/src/core/sroptgtr_gpu.cu new file mode 100644 index 00000000..9250740e --- /dev/null +++ b/cpp/src/core/sroptgtr_gpu.cu @@ -0,0 +1,32 @@ +/************************************************************************//** + * File: sroptgtr_gpu.cu + * Description: Optical element: Transmission (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU +#include "sroptgtr.h" +#include "cuda_runtime.h" 
+#include "device_launch_parameters.h" +#include "math_constants.h" + +#include +#include +#include + +int srTGenTransmission::RadPointModifierParallel(srTSRWRadStructAccessData* pRadAccessData, void* pBufVars, long pBufVarsSz, TGPUUsageArg* pGPU) +{ + GenTransNumData.pData = (char*)CAuxGPU::ToDevice(pGPU, GenTransNumData.pData, GenTransNumData.DimSizes[0] * (int)GenTransNumData.DimSizes[1] * (int)GenTransNumData.DimSizes[2] * sizeof(double) * 2); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, GenTransNumData.pData); + int retCode = RadPointModifierParallelImpl(pRadAccessData, pBufVars, pBufVarsSz, this, pGPU); + GenTransNumData.pData = (char*)CAuxGPU::ToHostAndFree(pGPU, GenTransNumData.pData, GenTransNumData.DimSizes[0] * (int)GenTransNumData.DimSizes[1] * (int)GenTransNumData.DimSizes[2] * sizeof(double) * 2, true); + return retCode; +} //HG03092022 +#endif \ No newline at end of file diff --git a/cpp/src/core/sropthck.h b/cpp/src/core/sropthck.h index 45f09323..cc7fe350 100644 --- a/cpp/src/core/sropthck.h +++ b/cpp/src/core/sropthck.h @@ -167,7 +167,8 @@ class srTMirror : public srTFocusingElem { //return true; } - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) //virtual in srTGenOptElem + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //virtual in srTGenOptElem //HG04122023 { m_ParPrecWfrPropag = ParPrecWfrPropag; //store for use in a composite prapagator (through drif space, etc.) 
@@ -206,7 +207,8 @@ class srTMirror : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { if(m_propMeth == 1) return PropagateRadiationSimple_ThinElem(pRadAccessData); else if(m_propMeth == 2) return PropagateRadiationSimple_LocRayTracing(pRadAccessData); diff --git a/cpp/src/core/sroptpsh.h b/cpp/src/core/sroptpsh.h index ab0ac787..181df8df 100644 --- a/cpp/src/core/sroptpsh.h +++ b/cpp/src/core/sroptpsh.h @@ -75,7 +75,8 @@ class srTPhaseShift : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { char &MethNo = ParPrecWfrPropag.MethNo; @@ -86,7 +87,8 @@ class srTPhaseShift : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; srTWaveAccessData PhShWaveAccessData; @@ -94,8 +96,10 @@ class srTPhaseShift 
: public srTFocusingElem { //tPhaseShiftData = (DOUBLE*)(PhShWaveAccessData.pWaveData); tPhaseShiftData = (double*)(PhShWaveAccessData.pWaveData); //OC26112019 (related to SRW port to IGOR XOP8 on Mac) - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 //srTSend Send; //if(result = Send.FinishWorkingWithWave(&PhShWaveAccessData)) return result; diff --git a/cpp/src/core/sroptsmr.h b/cpp/src/core/sroptsmr.h index 9d36eb82..f0d1e6b1 100644 --- a/cpp/src/core/sroptsmr.h +++ b/cpp/src/core/sroptsmr.h @@ -67,10 +67,12 @@ class srTSpherMirror : public srTFocusingElem { void SetupSpherMirrorApprox(); //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, MethNo, ResBeforeAndAfterVect); - if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); + //if(UseSpherMirrorApprox) 
return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect); + if(UseSpherMirrorApprox) return ((srTGenOptElem*)(SpherMirrorApproxHndl.rep))->PropagateRadiation(pRadAccessData, ParPrecWfrPropag, ResBeforeAndAfterVect, pvGPU); //HG04122023 else { char &MethNo = ParPrecWfrPropag.MethNo; diff --git a/cpp/src/core/sroptwgr.h b/cpp/src/core/sroptwgr.h index c9be6164..580b97d3 100644 --- a/cpp/src/core/sroptwgr.h +++ b/cpp/src/core/sroptwgr.h @@ -134,7 +134,8 @@ class srTWaveguideRect : public srTShapedOptElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResizeBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResizeBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //Checks current sampling "resolution" in hor. and vert. directions //Makes necessary sampling for propag. through the waveguide (fit the waveguide with approx. 
the same resolution, include all harmonics until the cut-off) @@ -151,7 +152,8 @@ class srTWaveguideRect : public srTShapedOptElem { if(result = PropagateRadiationSimple_AngRepres(&AuxWfrData)) return result; srTRectAperture RectAp(Dx, Dz, TransvCenPoint.x, TransvCenPoint.y); - if(result = RectAp.TraverseRadZXE(&AuxWfrData)) return result; + //if(result = RectAp.TraverseRadZXE(&AuxWfrData)) return result; + if(result = RectAp.TraverseRadZXE(&AuxWfrData, 0, 0, pvGPU)) return result; //HG04122023 if(result = CopyElecFieldDataForOut(AuxWfrData, *pRadAccessData)) return result; AuxWfrData.DeleteElecFieldArrays(); //deletes Ex, Ez only diff --git a/cpp/src/core/sroptzp.h b/cpp/src/core/sroptzp.h index 813974de..68ae1ee7 100644 --- a/cpp/src/core/sroptzp.h +++ b/cpp/src/core/sroptzp.h @@ -100,7 +100,8 @@ class srTZonePlate : public srTFocusingElem { srTZonePlate() {} //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically @@ -111,7 +112,8 @@ class srTZonePlate : public srTFocusingElem { int result = 0; - if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + //if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData); + if(MethNo == 0) result = PropagateRadiationMeth_0(pRadAccessData, pvGPU); //HG04122023 //else return PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); else result = PropagateRadiationMeth_2(pRadAccessData, 
ParPrecWfrPropag, ResBeforeAndAfterVect); @@ -125,11 +127,14 @@ class srTZonePlate : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - if(result = TraverseRadZXE(pRadAccessData)) return result; + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //if(result = TraverseRadZXE(pRadAccessData)) return result; + if(result = TraverseRadZXE(pRadAccessData, 0, 0, pvGPU)) return result; //HG04122023 return 0; } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) diff --git a/cpp/src/core/sroptzps.h b/cpp/src/core/sroptzps.h index 792d02c6..e5409814 100644 --- a/cpp/src/core/sroptzps.h +++ b/cpp/src/core/sroptzps.h @@ -80,7 +80,8 @@ class srTZonePlateSpec : public srTFocusingElem { } //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, int MethNo, srTRadResizeVect& ResBeforeAndAfterVect) - int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + //int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect) + int PropagateRadiation(srTSRWRadStructAccessData* pRadAccessData, srTParPrecWfrPropag& ParPrecWfrPropag, srTRadResizeVect& ResBeforeAndAfterVect, void* pvGPU=0) //HG04122023 { char &MethNo = ParPrecWfrPropag.MethNo; //if(MethNo == 2) return 
PropagateRadiationMeth_2(pRadAccessData, ResBeforeAndAfterVect); @@ -90,11 +91,14 @@ class srTZonePlateSpec : public srTFocusingElem { //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pBuf=0) //OC06092019 //OC01102019 (restored) - int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + //int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData) + int PropagateRadiationSimple(srTSRWRadStructAccessData* pRadAccessData, void* pvGPU=0) //HG04122023 { int result; - if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; - return TraverseRadZXE(pRadAccessData); + //if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0)) return result; + if(pRadAccessData->Pres != 0) if(result = SetRadRepres(pRadAccessData, 0, 0, 0, pvGPU)) return result; //HG04122023 + //return TraverseRadZXE(pRadAccessData); + return TraverseRadZXE(pRadAccessData, 0, 0, pvGPU); //HG04122023 } int PropagateRadiationSimple1D(srTRadSect1D* pSect1D) { diff --git a/cpp/src/core/srradmnp.cpp b/cpp/src/core/srradmnp.cpp index 711bc2b4..ff3597b8 100644 --- a/cpp/src/core/srradmnp.cpp +++ b/cpp/src/core/srradmnp.cpp @@ -676,7 +676,9 @@ int srTRadGenManip::ExtractSingleElecIntensity1DvsZ(srTRadExtract& RadExtract) //************************************************************************* -int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, gpuUsageArg *pGpuUsage) //HG30112023 +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract, void* pvGPU) //HG02122023 { int PolCom = RadExtract.PolarizCompon; int Int_or_ReE = RadExtract.Int_or_Phase; @@ -690,6 +692,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) float *pI = 0, *pI1 = 0, *pI2 = 0, *pI3 = 0; 
//OC17042020 double *pId = 0, *pI1d = 0, *pI2d = 0, *pI3d = 0; long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz; + //long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz, nwfr = RadAccessData.nwfr; //HG30112023 //float *pI = 0; //DOUBLE *pId = 0; //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) @@ -720,6 +723,7 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long long PerZ = PerX*RadAccessData.nx; long long PerX = ((long long)ne) << 1; //OC18042020 long long PerZ = PerX*nx; + long long PerWfr = PerZ*nz; //long ie0=0, ie1=0; long long ie0=0, ie1=0; //OC26042019 @@ -754,173 +758,181 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ(srTRadExtract& RadExtract) //long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 //long izPerZ = 0; - long long izPerZ = 0; long ix, ie; - for(long long iz=0; iz 0) //OC08052021 - { - if(pI != 0) - { - float newI = (float)(((*pI)*iter + resInt)*inv_iter_p_1); - *(pI++) = newI; - } - if(pId != 0) + + if(iter == 0) //OC08052021 { - double newI = ((*pId)*iter + resInt)*inv_iter_p_1; - *(pId++) = newI; + //OC140813 + if(pI != 0) *(pI++) = (float)resInt; + if(pId != 0) *(pId++) = resInt; //OC18042020 + //if(pId != 0) *(pId++) = (double)resInt; + if(allStokesReq) //OC18042020 + { + if(RadExtract.pExtractedData != 0) + { + *(pI1++) = (float)resInt1; *(pI2++) = (float)resInt2; *(pI3++) = (float)resInt3; + } + else + { + *(pI1d++) = resInt1; *(pI2d++) = resInt2; *(pI3d++) = resInt3; + } + } } - if(allStokesReq) + else if(iter > 0) //OC08052021 { - if(RadExtract.pExtractedData != 0) + if(pI != 0) { - float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); - float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); - float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); - *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; + float newI = (float)(((*pI)*iter + 
resInt)*inv_iter_p_1); + *(pI++) = newI; } - else + if(pId != 0) { - double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; - double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; - double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; - *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; + double newI = ((*pId)*iter + resInt)*inv_iter_p_1; + *(pId++) = newI; } - } - } - else //OC08052021 - { - if(pI != 0) *(pI++) += (float)resInt; - if(pId != 0) *(pId++) += resInt; - if(allStokesReq) - { - if(RadExtract.pExtractedData != 0) + if(allStokesReq) { - *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; + if(RadExtract.pExtractedData != 0) + { + float newI1 = (float)(((*pI1)*iter + resInt1)*inv_iter_p_1); + float newI2 = (float)(((*pI2)*iter + resInt2)*inv_iter_p_1); + float newI3 = (float)(((*pI3)*iter + resInt3)*inv_iter_p_1); + *(pI1++) = newI1; *(pI2++) = newI2; *(pI3++) = newI3; + } + else + { + double newI1 = ((*pI1d)*iter + resInt1)*inv_iter_p_1; + double newI2 = ((*pI2d)*iter + resInt2)*inv_iter_p_1; + double newI3 = ((*pI3d)*iter + resInt3)*inv_iter_p_1; + *(pI1d++) = newI1; *(pI2d++) = newI2; *(pI3d++) = newI3; + } } - else + } + else //OC08052021 + { + if(pI != 0) *(pI++) += (float)resInt; + if(pId != 0) *(pId++) += resInt; + if(allStokesReq) { - *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; + if(RadExtract.pExtractedData != 0) + { + *(pI1++) += (float)resInt1; *(pI2++) += (float)resInt2; *(pI3++) += (float)resInt3; + } + else + { + *(pI1d++) += resInt1; *(pI2d++) += resInt2; *(pI3d++) += resInt3; + } } } - } - //ixPerX += PerX; - pEx_St += PerX; - pEz_St += PerX; - pEx_Fi += PerX; - pEz_Fi += PerX; + pEx_St += PerX; + pEz_St += PerX; + pEx_Fi += PerX; + pEz_Fi += PerX; + } + izPerZ += PerZ; } - izPerZ += PerZ; } if(arAuxInt != 0) delete[] arAuxInt; //OC150813 return 0; @@ -1570,7 +1582,8 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsZ(srTRadExtract& RadExtrac 
//************************************************************************* -int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) +//int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract) +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtract, void* pvGPU) //HG30112023 {//OC13122019 //This assumes "normal" data alignment in the complex "matrix" E(x,y)*E*(x',y') int res = 0; @@ -2107,154 +2120,137 @@ int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ(srTRadExtract& RadExtra if(DontNeedInterp) { - for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) - //for(long long it=0; it<=(itEnd-itStart); it++) //OC03032021 (to enable partial update of MI/CSD) - //for(long long it=0; it 0) - { - //double iter_p_1 = iter + 1; //OC20012020 - //long long iter_p_1 = iter + 1; - pMI[0] = (float)((pMI[0]*iter + ReMI)*inv_iter_p_1); //OC08052021 - pMI[1] = (float)((pMI[1]*iter + ImMI)*inv_iter_p_1); - //pMI[0] = (float)((pMI[0]*iter + ReMI)/iter_p_1); - //pMI[1] = (float)((pMI[1]*iter + ImMI)/iter_p_1); - } - else + float *pMI = pMI0 + (it - itStart)*PerArg; //OC16042021 + //float *pMI = pMI0 + it*PerArg; + for(long long i=0; i<=it; i++) { - pMI[0] += (float)ReMI; - pMI[1] += (float)ImMI; - } + //if(res = MutualIntensityComponent(pEx, pExT, pEz, pEzT, PolCom, iter, pMI)) return res; - pEx += PerX; pEz += PerX; - pMI += 2; - } + double ExRe = 0., ExIm = 0., EzRe = 0., EzIm = 0.; + double ExReT = 0., ExImT = 0., EzReT = 0., EzImT = 0.; + if(EhOK) { ExRe = *pEx; ExIm = *(pEx + 1); ExReT = *pExT; ExImT = *(pExT + 1); } + if(EvOK) { EzRe = *pEz; EzIm = *(pEz + 1); EzReT = *pEzT; EzImT = *(pEzT + 1); } + double ReMI = 0., ImMI = 0.; - pEx = pExInit0; - pEz = pEzInit0; - pExT += PerX; pEzT += PerX; - } - if(iter == 0) //OC16102021 - {//Setting to 0 symmetrical part of MI data (to avoid having garbage there) + switch(PolCom) + { + case 0: // Lin. Hor. 
+ { + ReMI = ExRe*ExReT + ExIm*ExImT; + ImMI = ExIm*ExReT - ExRe*ExImT; + break; + } + case 1: // Lin. Vert. + { + ReMI = EzRe*EzReT + EzIm*EzImT; + ImMI = EzIm*EzReT - EzRe*EzImT; + break; + } + case 2: // Linear 45 deg. + { + double ExRe_p_EzRe = ExRe + EzRe, ExIm_p_EzIm = ExIm + EzIm; + double ExRe_p_EzReT = ExReT + EzReT, ExIm_p_EzImT = ExImT + EzImT; + ReMI = 0.5*(ExRe_p_EzRe*ExRe_p_EzReT + ExIm_p_EzIm*ExIm_p_EzImT); + ImMI = 0.5*(ExIm_p_EzIm*ExRe_p_EzReT - ExRe_p_EzRe*ExIm_p_EzImT); + break; + } + case 3: // Linear 135 deg. + { + double ExRe_mi_EzRe = ExRe - EzRe, ExIm_mi_EzIm = ExIm - EzIm; + double ExRe_mi_EzReT = ExReT - EzReT, ExIm_mi_EzImT = ExImT - EzImT; + ReMI = 0.5*(ExRe_mi_EzRe*ExRe_mi_EzReT + ExIm_mi_EzIm*ExIm_mi_EzImT); + ImMI = 0.5*(ExIm_mi_EzIm*ExRe_mi_EzReT - ExRe_mi_EzRe*ExIm_mi_EzImT); + break; + } + case 5: // Circ. Left //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 4: // Circ. Right + { + double ExRe_mi_EzIm = ExRe - EzIm, ExIm_p_EzRe = ExIm + EzRe; + double ExRe_mi_EzImT = ExReT - EzImT, ExIm_p_EzReT = ExImT + EzReT; + ReMI = 0.5*(ExRe_mi_EzIm*ExRe_mi_EzImT + ExIm_p_EzRe*ExIm_p_EzReT); + ImMI = 0.5*(ExIm_p_EzRe*ExRe_mi_EzImT - ExRe_mi_EzIm*ExIm_p_EzReT); + break; + } + case 4: // Circ. Right //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 5: // Circ. 
Left + { + double ExRe_p_EzIm = ExRe + EzIm, ExIm_mi_EzRe = ExIm - EzRe; + double ExRe_p_EzImT = ExReT + EzImT, ExIm_mi_EzReT = ExImT - EzReT; + ReMI = 0.5*(ExRe_p_EzIm*ExRe_p_EzImT + ExIm_mi_EzRe*ExIm_mi_EzReT); + ImMI = 0.5*(ExIm_mi_EzRe*ExRe_p_EzImT - ExRe_p_EzIm*ExIm_mi_EzReT); + break; + } + case -1: // s0 + { + ReMI = ExRe*ExReT + ExIm*ExImT + EzRe*EzReT + EzIm*EzImT; + ImMI = ExIm*ExReT - ExRe*ExImT + EzIm*EzReT - EzRe*EzImT; + break; + } + case -2: // s1 + { + ReMI = ExRe*ExReT + ExIm*ExImT - (EzRe*EzReT + EzIm*EzImT); + ImMI = ExIm*ExReT - ExRe*ExImT - (EzIm*EzReT - EzRe*EzImT); + break; + } + case -3: // s2 + { + ReMI = ExImT*EzIm + ExIm*EzImT + ExReT*EzRe + ExRe*EzReT; + ImMI = ExReT*EzIm - ExRe*EzImT - ExImT*EzRe + ExIm*EzReT; + break; + } + case -4: // s3 + { + ReMI = ExReT*EzIm + ExRe*EzImT - ExImT*EzRe - ExIm*EzReT; + ImMI = ExIm*EzImT - ExImT*EzIm - ExReT*EzRe + ExRe*EzReT; + break; + } + default: // total mutual intensity, same as s0 + { + ReMI = ExRe*ExReT + ExIm*ExImT + EzRe*EzReT + EzIm*EzImT; + ImMI = ExIm*ExReT - ExRe*ExImT + EzIm*EzReT - EzRe*EzImT; + break; + //return CAN_NOT_EXTRACT_MUT_INT; + } + } + if(iter == 0) + { + pMI[0] = (float)ReMI; + pMI[1] = (float)ImMI; + } + else if(iter > 0) + { + //double iter_p_1 = iter + 1; //OC20012020 + //long long iter_p_1 = iter + 1; + pMI[0] = (float)((pMI[0]*iter + ReMI)*inv_iter_p_1); //OC08052021 + pMI[1] = (float)((pMI[1]*iter + ImMI)*inv_iter_p_1); + //pMI[0] = (float)((pMI[0]*iter + ReMI)/iter_p_1); + //pMI[1] = (float)((pMI[1]*iter + ImMI)/iter_p_1); + } + else + { + pMI[0] += (float)ReMI; + pMI[1] += (float)ImMI; + } - for(long long it=itStart; it<=itEnd; it++) //OC16042021 (to enable partial update of MI/CSD) - { - float *pMI = pMI0 + (it - itStart)*(PerArg + 2) + 2; //OC29042022 (?) 
- //float *pMI = pMI0 + (it - itStart)*PerArg; - for(long long i=it+1; i<=itEnd; i++) - //for(long long i=0; i<=it; i++) - { - *(pMI++) = 0.; - *(pMI++) = 0.; + pEx += PerX; pEz += PerX; + pMI += 2; } + + pEx = pExInit0; + pEz = pEzInit0; + pExT += PerX; pEzT += PerX; } } } @@ -3536,8 +3532,7 @@ void srTRadGenManip::MutualIntSumPart(srTWaveAccessData* pwI1, srTWaveAccessData long long itStart = pwI2->itStart; if(itStart < 0) itStart = 0; long long itFin = pwI2->itFin; - if(itFin < 0) itFin = nxnz - 1; //OC04102021 - //if(itFin < 0) itFin = nxnz; + if(itFin < 0) itFin = nxnz; double aux; //OC27042021 @@ -3724,7 +3719,6 @@ void srTRadGenManip::MutualIntFillHalfHermit(srTWaveAccessData* pwI) *pMIt = -imMI; //Hermitian matrix property } } - //int aha = 1; } else if(pDataD != 0) { @@ -3793,22 +3787,18 @@ void srTRadGenManip::MutualIntTreatComQuadPhTerm(srTWaveAccessData* pwI, double* for(long long izt=0; izt +#include +#include +#include +#include +#include "srradmnp.h" +#include "gmmeth.h" + +template +__global__ void ExtractSingleElecIntensity2DvsXZ_Kernel(srTRadExtract RadExtract, srTSRWRadStructAccessData RadAccessData, srTRadGenManip *obj, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, int Int_or_ReE) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < RadAccessData.nx && iz < RadAccessData.nz) + { + //int PolCom = RadExtract.PolarizCompon; + + //bool allStokesReq = (PolCom == -5); //OC18042020 + + float* pI = 0, * pI1 = 0, * pI2 = 0, * pI3 = 0; //OC17042020 + double* pId = 0, * pI1d = 0, * pI2d = 0, * pI3d = 0; + long ne = RadAccessData.ne, nx = RadAccessData.nx, nz = RadAccessData.nz; + //float *pI = 0; + //DOUBLE *pId = 0; + //double *pId = 0; //OC26112019 (related to SRW port to IGOR XOP8 on Mac) + long long nxnz = ((long long)nx) * ((long long)nz); + if (Int_or_ReE != 2) + { + pI = RadExtract.pExtractedData; + if (allStokesReq) //OC17042020 + { + 
pI1 = pI + nxnz; pI2 = pI1 + nxnz; pI3 = pI2 + nxnz; + } + } + else + { + pId = RadExtract.pExtractedDataD; + if (allStokesReq) //OC17042020 + { + pI1d = pId + nxnz; pI2d = pI1d + nxnz; pI3d = pI2d + nxnz; + } + } + + float* pEx0 = RadAccessData.pBaseRadX; + float* pEz0 = RadAccessData.pBaseRadZ; + + //long PerX = RadAccessData.ne << 1; + //long PerZ = PerX*RadAccessData.nx; + //long long PerX = RadAccessData.ne << 1; + //long long PerZ = PerX*RadAccessData.nx; + long long PerX = ((long long)ne) << 1; //OC18042020 + long long PerZ = PerX * nx; + + //bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (ne > 1); //OC18042020 + double resInt, resInt1, resInt2, resInt3; + double ConstPhotEnInteg = 1.; + long long Two_ie0 = ie0 << 1, Two_ie1 = ie1 << 1; //OC26042019 + long ie; + + long offset = iz * PerZ + ix * PerX; + long offsetDiv2 = offset >> 1; + + float* pEx_StartForX = pEx0 + offset; + float* pEz_StartForX = pEz0 + offset; + if (pI != 0) + { + pI += offsetDiv2; + if (allStokesReq) + { + pI1 += offsetDiv2; + pI2 += offsetDiv2; + pI3 += offsetDiv2; + } + } + + if (pId != 0) + { + pId += offsetDiv2; + if (allStokesReq) + { + pI1d += offsetDiv2; + pI2d += offsetDiv2; + pI3d += offsetDiv2; + } + } + + //long ixPerX = 0; + + float* pEx_St = pEx_StartForX + Two_ie0; + float* pEz_St = pEz_StartForX + Two_ie0; + float* pEx_Fi = pEx_StartForX + Two_ie1; + float* pEz_Fi = pEz_StartForX + Two_ie1; + + if (intOverEnIsRequired) //OC140813 + {//integrate over photon energy / time + double* tInt = arAuxInt; + float* pEx_StAux = pEx_St; + float* pEz_StAux = pEz_St; + + if (!allStokesReq) //OC17042020 + { + for (ie = 0; ie < ne; ie++) //OC18042020 + //for(int ie=0; ieIntensityComponent(pEx_StAux, pEz_StAux, PolCom, Int_or_ReE); + pEx_StAux += 2; + pEz_StAux += 2; + } + resInt = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); //OC18042020 + //resInt = ConstPhotEnInteg*CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, 
RadAccessData.ne, RadAccessData.eStep); + } + else + { + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -1, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -2, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt1 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -3, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt2 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + + tInt = arAuxInt; pEx_StAux = pEx_St; pEz_StAux = pEz_St; + for (ie = 0; ie < ne; ie++) + { + *(tInt++) = obj->IntensityComponent(pEx_StAux, pEz_StAux, -4, Int_or_ReE); + pEx_StAux += 2; pEz_StAux += 2; + } + resInt3 = ConstPhotEnInteg * CGenMathMeth::Integ1D_FuncDefByArray(arAuxInt, ne, RadAccessData.eStep); + } + } + else + { + if (!allStokesReq) //OC18042020 + { + resInt = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, PolCom, Int_or_ReE); + } + else //OC18042020 + { + resInt = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -1, Int_or_ReE); + resInt1 = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -2, Int_or_ReE); + resInt2 = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -3, Int_or_ReE); + resInt3 = obj->IntensityComponentSimpleInterpol(pEx_St, pEx_Fi, pEz_St, pEz_Fi, InvStepRelArg, -4, Int_or_ReE); + } + } + //OC140813 + if (pI != 0) *pI = (float)resInt; + if (pId != 0) *pId = resInt; 
//OC18042020 + //if(pId != 0) *(pId++) = (double)resInt; + if (allStokesReq) //OC18042020 + { + if (RadExtract.pExtractedData != 0) + { + *pI1 = (float)resInt1; *pI2 = (float)resInt2; *pI3 = (float)resInt3; + } + else + { + *pI1d = resInt1; *pI2d = resInt2; *pI3d = resInt3; + } + } + } +} + +template +static inline void ExtractSingleElecIntensity2DvsXZ_GPUSub(dim3 &blocks, dim3 &threads, srTRadExtract RadExtract, srTSRWRadStructAccessData RadAccessData, srTRadGenManip *local_copy, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, int Int_or_ReE) +{ + switch(RadExtract.PolarizCompon) + { + case 5: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 4: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 3: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 2: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 1: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case 0: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -1: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -2: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -3: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + case -4: 
ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + default: ExtractSingleElecIntensity2DvsXZ_Kernel<<>>(RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); break; + } +} + +int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtract, double* arAuxInt, long long ie0, long long ie1, double InvStepRelArg, TGPUUsageArg* pGPU) +{ + srTSRWRadStructAccessData& RadAccessData = *((srTSRWRadStructAccessData*)(hRadAccessData.ptr())); + + const int bs = 256; + dim3 blocks(RadAccessData.nx / bs + ((RadAccessData.nx & (bs - 1)) != 0), RadAccessData.nz); + dim3 threads(bs, 1); + + if (RadAccessData.pBaseRadX != NULL) + { + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadX); + } + if (RadAccessData.pBaseRadZ != NULL) + { + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadAccessData.pBaseRadZ); + } + + srTRadGenManip *local_copy = (srTRadGenManip*)CAuxGPU::ToDevice(pGPU, this, sizeof(srTRadGenManip)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, local_copy); + + arAuxInt = (double*)CAuxGPU::ToDevice(pGPU, arAuxInt, RadAccessData.ne*sizeof(double)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, arAuxInt); + + bool allStokesReq = (RadExtract.PolarizCompon == -5); + bool intOverEnIsRequired = (RadExtract.Int_or_Phase == 7) && (RadAccessData.ne > 1); + + int Int_or_ReE = RadExtract.Int_or_Phase; + if (Int_or_ReE == 7) Int_or_ReE = 0; //OC150813: time/phot. 
energy integrated single-e intensity requires "normal" intensity here + + if (allStokesReq) + if (intOverEnIsRequired) + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + if (intOverEnIsRequired) + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + else + ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); + + CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(srTRadGenManip), true); + CAuxGPU::ToHostAndFree(pGPU, arAuxInt, RadAccessData.ne*sizeof(double), true); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); + CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); + +#ifndef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ); +#endif + +#ifdef _DEBUG + if (RadAccessData.pBaseRadX != NULL) + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + if (RadAccessData.pBaseRadZ != NULL) + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif + return 0; +} + +template +__global__ void ExtractSingleElecMutualIntensityVsXZ_Kernel(const float* __restrict__ pEx0, const float* 
__restrict__ pEz0, float* __restrict__ pMI0, long nxnz, long itStart, long itEnd, long PerX, long iter0) +{ + //Calculate coordinates as the typical triangular matrix + int i0 = (blockIdx.x * blockDim.x + threadIdx.x); //<=nxnz range + int it0_0 = (blockIdx.y * blockDim.y + threadIdx.y); //nxnz/(2*itPerBlk) range + long iter = iter0; + + if (i0 > nxnz) return; + if (it0_0 > nxnz / 2) return; + + for (int it0 = it0_0 * itPerBlk; it0 < it0_0 * itPerBlk + itPerBlk; it0++) + { + long it = it0; + long i = i0; + if (i0 > it0) //If the coordinates are past the triangular bounds, switch to the lower half of the triangle + { + it = nxnz - it0 - 1; + i = i0 - (it0 + 1); + } + + if (it >= itEnd) { + return; + } + + //float* pMI = pMI0 + it0 * (nxnz << 1) + (i0 << 1); //Compact representation coordinates + float* pMI = pMI0 + (it - itStart) * (nxnz << 1) + (i << 1); //Full representation coordinates + const float* pEx = pEx0 + i * PerX; + const float* pEz = pEz0 + i * PerX; + const float* pExT = pEx0 + (it - itStart) * PerX; + const float* pEzT = pEz0 + (it - itStart) * PerX; + + float ExRe = 0., ExIm = 0., EzRe = 0., EzIm = 0.; + float ExReT = 0., ExImT = 0., EzReT = 0., EzImT = 0.; + + { + if (EhOK) + { + ExRe = *pEx; ExIm = *(pEx + 1); + if (i != (it - itStart)) { + ExReT = *pExT; ExImT = *(pExT + 1); + } + else { + ExReT = ExRe; + ExImT = ExIm; + } + } + if (EvOK) { + EzRe = *pEz; EzIm = *(pEz + 1); + if (i != (it - itStart)) { + EzReT = *pEzT; EzImT = *(pEzT + 1); + } + else { + EzReT = EzRe; + EzImT = EzIm; + } + } + } + float ReMI = 0., ImMI = 0.; + + switch (PolCom) + { + case 0: // Lin. Hor. + { + ReMI = ExRe * ExReT + ExIm * ExImT; + ImMI = ExIm * ExReT - ExRe * ExImT; + break; + } + case 1: // Lin. Vert. + { + ReMI = EzRe * EzReT + EzIm * EzImT; + ImMI = EzIm * EzReT - EzRe * EzImT; + break; + } + case 2: // Linear 45 deg. 
+ { + float ExRe_p_EzRe = ExRe + EzRe, ExIm_p_EzIm = ExIm + EzIm; + float ExRe_p_EzReT = ExReT + EzReT, ExIm_p_EzImT = ExImT + EzImT; + ReMI = 0.5f * (ExRe_p_EzRe * ExRe_p_EzReT + ExIm_p_EzIm * ExIm_p_EzImT); + ImMI = 0.5f * (ExIm_p_EzIm * ExRe_p_EzReT - ExRe_p_EzRe * ExIm_p_EzImT); + break; + } + case 3: // Linear 135 deg. + { + float ExRe_mi_EzRe = ExRe - EzRe, ExIm_mi_EzIm = ExIm - EzIm; + float ExRe_mi_EzReT = ExReT - EzReT, ExIm_mi_EzImT = ExImT - EzImT; + ReMI = 0.5f * (ExRe_mi_EzRe * ExRe_mi_EzReT + ExIm_mi_EzIm * ExIm_mi_EzImT); + ImMI = 0.5f * (ExIm_mi_EzIm * ExRe_mi_EzReT - ExRe_mi_EzRe * ExIm_mi_EzImT); + break; + } + case 5: // Circ. Left //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 4: // Circ. Right + { + float ExRe_mi_EzIm = ExRe - EzIm, ExIm_p_EzRe = ExIm + EzRe; + float ExRe_mi_EzImT = ExReT - EzImT, ExIm_p_EzReT = ExImT + EzReT; + ReMI = 0.5f * (ExRe_mi_EzIm * ExRe_mi_EzImT + ExIm_p_EzRe * ExIm_p_EzReT); + ImMI = 0.5f * (ExIm_p_EzRe * ExRe_mi_EzImT - ExRe_mi_EzIm * ExIm_p_EzReT); + break; + } + case 4: // Circ. Right //OC08092019: corrected to be in compliance with definitions for right-hand frame (x,z,s) and with corresponding definition and calculation of Stokes params + //case 5: // Circ. 
Left + { + float ExRe_p_EzIm = ExRe + EzIm, ExIm_mi_EzRe = ExIm - EzRe; + float ExRe_p_EzImT = ExReT + EzImT, ExIm_mi_EzReT = ExImT - EzReT; + ReMI = 0.5f * (ExRe_p_EzIm * ExRe_p_EzImT + ExIm_mi_EzRe * ExIm_mi_EzReT); + ImMI = 0.5f * (ExIm_mi_EzRe * ExRe_p_EzImT - ExRe_p_EzIm * ExIm_mi_EzReT); + break; + } + case -1: // s0 + { + ReMI = ExRe * ExReT + ExIm * ExImT + EzRe * EzReT + EzIm * EzImT; + ImMI = ExIm * ExReT - ExRe * ExImT + EzIm * EzReT - EzRe * EzImT; + break; + } + case -2: // s1 + { + ReMI = ExRe * ExReT + ExIm * ExImT - (EzRe * EzReT + EzIm * EzImT); + ImMI = ExIm * ExReT - ExRe * ExImT - (EzIm * EzReT - EzRe * EzImT); + break; + } + case -3: // s2 + { + ReMI = ExImT * EzIm + ExIm * EzImT + ExReT * EzRe + ExRe * EzReT; + ImMI = ExReT * EzIm - ExRe * EzImT - ExImT * EzRe + ExIm * EzReT; + break; + } + case -4: // s3 + { + ReMI = ExReT * EzIm + ExRe * EzImT - ExImT * EzRe - ExIm * EzReT; + ImMI = ExIm * EzImT - ExImT * EzIm - ExReT * EzRe + ExRe * EzReT; + break; + } + default: // total mutual intensity, same as s0 + { + ReMI = ExRe * ExReT + ExIm * ExImT + EzRe * EzReT + EzIm * EzImT; + ImMI = ExIm * ExReT - ExRe * ExImT + EzIm * EzReT - EzRe * EzImT; + break; + //return CAN_NOT_EXTRACT_MUT_INT; + } + } + + if (gt1_iter > 0) + { + pMI[0] = (pMI[0] * iter + (float)ReMI) / (float)(iter + 1.); + pMI[1] = (pMI[1] * iter + (float)ImMI) / (float)(iter + 1.); + } + else if (gt1_iter == 0) + { + pMI[0] = (float)ReMI; + pMI[1] = (float)ImMI; + } + else + { + pMI[0] += (float)ReMI; + pMI[1] += (float)ImMI; + } + } +} + +template +int ExtractSingleElecMutualIntensityVsXZ_GPUSub(float* pEx, float* pEz, float* pMI0, long nx, long nz, long ne, long itStart, long itEnd, long PerX, long iter, bool EhOK, bool EvOK, TGPUUsageArg* pGPU) +{ + long long nxnz = ((long long)nx) * ((long long)nz); + + const int itPerBlk = 1; + dim3 threads = dim3(48, 16, 1); + dim3 grid = dim3((nxnz + 1) / threads.x + (threads.x > 1), (nxnz / 2) / (threads.y * itPerBlk) + (threads.y > 1), 1); + 
+ pEx = (float*)CAuxGPU::ToDevice(pGPU, pEx, nxnz*2*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEx); + + pEz = (float*)CAuxGPU::ToDevice(pGPU, pEz, nxnz*2*sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEz); + + pMI0 = (float*)CAuxGPU::ToDevice(pGPU, pMI0, (itEnd - itStart)*nxnz*2*sizeof(float)); + + if (EhOK) + { + if (EvOK) ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + } + else + { + if (EvOK) ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + else ExtractSingleElecMutualIntensityVsXZ_Kernel << > > (pEx, pEz, pMI0, nxnz, itStart, itEnd, PerX, iter); + } + + pEx = (float*)CAuxGPU::ToHostAndFree(pGPU, pEx, nxnz * 2 * sizeof(float), true); + pEz = (float*)CAuxGPU::ToHostAndFree(pGPU, pEz, nxnz * 2 * sizeof(float), true); + + CAuxGPU::MarkUpdated(pGPU, pMI0, true, false); + +#ifdef _DEBUG + if (pMI0 != NULL) + pMI0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pMI0, (itEnd - itStart)*ne*nx*nz*2*sizeof(float)); + + cudaStreamSynchronize(0); + auto err = cudaGetLastError(); + printf("%s\r\n", cudaGetErrorString(err)); +#endif + return 0; +} + +int srTRadGenManip::ExtractSingleElecMutualIntensityVsXZ_GPU(float* pEx, float* pEz, float* pMI0, long nx, long nz, long ne, long itStart, long itEnd, long PerX, long iter, int PolCom, bool EhOK, bool EvOK, TGPUUsageArg* pGPU) +{ + if (iter > 0) + { + switch (PolCom) + { + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + 
case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 1>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + } + } + else if (iter == 0) + { + switch (PolCom) + { + case 0: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 0, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 1, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 2, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 3, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 4, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case 5: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< 5, 0>(pEx, 
pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -1: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -1, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -2: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -2, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -3: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -3, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + case -4: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -4, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + default: return ExtractSingleElecMutualIntensityVsXZ_GPUSub< -5, 0>(pEx, pEz, pMI0, nx, nz, ne, itStart, itEnd, PerX, iter, EhOK, EvOK, pGPU); + } + } +} + +#endif \ No newline at end of file diff --git a/cpp/src/core/srradstr.cpp b/cpp/src/core/srradstr.cpp index d2571a41..f7c7a87d 100644 --- a/cpp/src/core/srradstr.cpp +++ b/cpp/src/core/srradstr.cpp @@ -2700,7 +2700,8 @@ void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin() //************************************************************************* -void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) +//void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) +void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz, void* pvGPU) //HG02122023 {// sx < 0 means mirroring should be done vs x // sz < 0 means mirroring should be done vs z //long PerX = ne << 1; @@ -2711,6 +2712,14 @@ void srTSRWRadStructAccessData::MirrorFieldData(int sx, int sz) float *pEX0 = pBaseRadX; float *pEZ0 = pBaseRadZ; +#ifdef _OFFLOAD_GPU //HG02122023 + if (CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) + { + MirrorFieldData_GPU(sx, sz, (TGPUUsageArg*)pvGPU); + return; + } +#endif + if((sx > 0) && (sz > 0)) return; //no mirroring is necessary else if((sx < 0) && (sz > 0)) //mirroring with respect to x { diff --git a/cpp/src/core/srradstr.h b/cpp/src/core/srradstr.h index 
f9b78ba8..dee50e5f 100644 --- a/cpp/src/core/srradstr.h +++ b/cpp/src/core/srradstr.h @@ -32,6 +32,10 @@ #include "srigorre.h" #endif +#ifdef _OFFLOAD_GPU //OC28072023 +#include "auxgpu.h" //HG04122023 +#endif + #include "srobject.h" //************************************************************************* @@ -72,8 +76,8 @@ class srTSRWRadStructAccessData : public CGenObject { waveHndl wRad, wRadX, wRadZ; int hStateRadX, hStateRadZ; double eStep, eStart, xStep, xStart, zStep, zStart; - long ne, nx, nz; - //long long ne, nx, nz; //OC26042019 + long ne, nx, nz; //OC03082023 (rolled back) + //long long ne, nx, nz; //HG //OC26042019 double xStartTr, zStartTr; bool UseStartTrToShiftAtChangingRepresToCoord; @@ -242,7 +246,16 @@ class srTSRWRadStructAccessData : public CGenObject { void CheckAndSubtractPhaseTermsLin(double newXc, double newZc); void CheckAndResetPhaseTermsLin(); void EstimateOversamplingFactors(double& estimOverSampX, double& estimOverSampZ); - void MirrorFieldData(int sx, int sz); + + void MirrorFieldData(int sx, int sz, void* pvGPU=0); //OC28072023 + //void MirrorFieldData(int sx, int sz); + +#ifdef _OFFLOAD_GPU + void MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg* pGPU); //OC03082023 + //void MirrorFieldData_GPU(int sx, int sz, void* pGpuUsage); //HG28072023 + void MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, TGPUUsageArg* pGPU); //OC03082023 + //void MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, void* pGpuUsage); //HG28072023 +#endif int SetupWfrEdgeCorrData(float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrsForWfrEdgeCorr); void MakeWfrEdgeCorrection(float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs); @@ -491,12 +504,33 @@ class srTSRWRadStructAccessData : public CGenObject { } } - void MultiplyElFieldByPhaseLin(double xMult, double zMult) + void MultiplyElFieldByPhaseLin(double xMult, double zMult, void* pvGPU=0) //OC28072023 + //void MultiplyElFieldByPhaseLin(double xMult, double 
zMult) { bool RadXisDefined = (pBaseRadX != 0); bool RadZisDefined = (pBaseRadZ != 0); if((!RadXisDefined) && (!RadZisDefined)) return; +#ifdef _OFFLOAD_GPU //OC28072023 + //TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + //GPU_COND(pvGPU, + //{ + // MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + // //MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + // return; + //} + + //if(pvGPU != 0) //HG02122023 Null check is already done by CAuxGPU::GPUEnabled + //{ + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if(CAuxGPU::GPUEnabled(pGPU)) + { + MultiplyElFieldByPhaseLin_GPU(xMult, zMult, pGPU); + return; //HG02122023 + } + //} +#endif + float *tEx = pBaseRadX; float *tEz = pBaseRadZ; diff --git a/cpp/src/core/srradstr_gpu.cu b/cpp/src/core/srradstr_gpu.cu new file mode 100644 index 00000000..658d39e0 --- /dev/null +++ b/cpp/src/core/srradstr_gpu.cu @@ -0,0 +1,330 @@ +/************************************************************************//** + * File: srradstr_gpu.cu + * Description: Auxiliary structures for various SR calculation methods (CUDA implementation) + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifdef _OFFLOAD_GPU + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "math_constants.h" +#include +#include +#include +#include "srradstr.h" + +__global__ void MultiplyElFieldByPhaseLin_Kernel(double xMult, double zMult, float* pBaseRadX, float* pBaseRadZ, int nx, int nz, int ne, float xStart, float zStart, float xStep, float zStep) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < nx && iz < nz) + { + bool RadXisDefined = (pBaseRadX != 0); + bool RadZisDefined = (pBaseRadZ != 0); + + double z = zStart + iz * zStep; + double x = 
xStart + ix * xStep; + double dPhZ = zMult * z; + double dPh = dPhZ + xMult * x; + double cosPh, sinPh; + sincos(dPh, &sinPh, &cosPh); + + long long offset = iz * nx * ne * 2 + ix * ne * 2; + float* tEx = pBaseRadX + offset; + float* tEz = pBaseRadZ + offset; + for (int ie = 0; ie < ne; ie++) + { + if (RadXisDefined) + { + //*(tEx++) *= a; *(tEx++) *= a; + double newReEx = (*tEx) * cosPh - (*(tEx + 1)) * sinPh; + double newImEx = (*tEx) * sinPh + (*(tEx + 1)) * cosPh; + *(tEx++) = (float)newReEx; *(tEx++) = (float)newImEx; + } + if (RadZisDefined) + { + //*(tEz++) *= a; *(tEz++) *= a; + double newReEz = (*tEz) * cosPh - (*(tEz + 1)) * sinPh; + double newImEz = (*tEz) * sinPh + (*(tEz + 1)) * cosPh; + *(tEz++) = (float)newReEz; *(tEz++) = (float)newImEz; + } + } + } +} + +void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, TGPUUsageArg* pGPU) //OC03082023 +//void srTSRWRadStructAccessData::MultiplyElFieldByPhaseLin_GPU(double xMult, double zMult, void* pGpuUsage) +{ + //TGPUUsageArg *pGpuUsage_ = (TGPUUsageArg*)pGpuUsage; //OC03082023 (commented-out) + if (pBaseRadX != NULL) + { + pBaseRadX = (float*)CAuxGPU::ToDevice(pGPU, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadX = (float*)CAuxGPU::ToDevice(pGpuUsage_, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBaseRadX); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pBaseRadX); + } + if (pBaseRadZ != NULL) + { + pBaseRadZ = (float*)CAuxGPU::ToDevice(pGPU, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadZ = (float*)CAuxGPU::ToDevice(pGpuUsage_, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pBaseRadZ); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pBaseRadZ); + } + + const int bs = 256; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); + dim3 threads(bs, 1); + MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, 
pBaseRadZ, nx, nz, ne, (float)xStart, (float)zStart, (float)xStep, (float)zStep); + //MultiplyElFieldByPhaseLin_Kernel<<>> (xMult, zMult, pBaseRadX, pBaseRadZ, nz, nx, ne, zStart, zStep, xStart, xStep); + + if (pBaseRadX != NULL) + CAuxGPU::MarkUpdated(pGPU, pBaseRadX, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pBaseRadX, true, false); + if (pBaseRadZ != NULL) + CAuxGPU::MarkUpdated(pGPU, pBaseRadZ, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pBaseRadZ, true, false); + +#ifdef _DEBUG + if (pBaseRadX != NULL) + pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pBaseRadX, nz * nx * ne * 2 * sizeof(float)); + if (pBaseRadZ != NULL) + pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pBaseRadZ, nz * nx * ne * 2 * sizeof(float)); + cudaStreamSynchronize(0); + //auto err = cudaGetLastError(); + //printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +template __global__ void MirrorFieldData_Kernel(long nx, long nz, long ne, float* pEX0, float* pEZ0) { + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //nx range + int iz = (blockIdx.y * blockDim.y + threadIdx.y); //nz range + + if (ix < nx && iz < nz) + { + long long PerX = ne << 1; + long long PerZ = PerX * nx; + float buf; + + if (mode == 0) + { + if (ix >= (nx >> 1)) + return; + + long long nx_mi_1 = nx - 1; //OC26042019 + for (long long ie = 0; ie < ne; ie++) + { + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; //OC26042019 + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + 
ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + else if (mode == 1) + { + if (iz >= (nz >> 1)) + return; + + long long nz_mi_1 = nz - 1; //OC26042019 + for (long long ie = 0; ie < ne; ie++) + { + //long Two_ie = ie << 1; + long long Two_ie = ie << 1; + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long rev_izPerZ = (nz_mi_1 - iz)*PerZ; + long long rev_izPerZ = (nz_mi_1 - iz) * PerZ; + float* rev_pEX_StartForX = pEX0 + rev_izPerZ; + float* rev_pEZ_StartForX = pEZ0 + rev_izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + float* rev_pEX = rev_pEX_StartForX + ixPerX_p_Two_ie; + float* rev_pEZ = rev_pEZ_StartForX + ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + else if (mode == 2) + { + if (iz >= (nz >> 1)) + return; + + long long nx_mi_1 = nx - 1; //OC26042019 + long long nz_mi_1 = nz - 1; + for (long long ie = 0; ie < ne; ie++) //OC26042019 + //for(long ie=0; ie> 1); iz++) + long long Two_ie = ie << 1; //OC26042019 + + //long izPerZ = iz*PerZ; + long long izPerZ = iz * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + 
float* pEZ_StartForX = pEZ0 + izPerZ; + + //long rev_izPerZ = (nz_mi_1 - iz)*PerZ; + long long rev_izPerZ = (nz_mi_1 - iz) * PerZ; + float* rev_pEX_StartForX = pEX0 + rev_izPerZ; + float* rev_pEZ_StartForX = pEZ0 + rev_izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = rev_pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = rev_pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + + if (((nz >> 1) << 1) != nz) + { + //long izPerZ = ((nz >> 1) + 1)*PerZ; + long long izPerZ = ((nz >> 1) + 1) * PerZ; + float* pEX_StartForX = pEX0 + izPerZ; + float* pEZ_StartForX = pEZ0 + izPerZ; + + //long ixPerX_p_Two_ie = ix*PerX + Two_ie; + long long ixPerX_p_Two_ie = ix * PerX + Two_ie; + float* pEX = pEX_StartForX + ixPerX_p_Two_ie; + float* pEZ = pEZ_StartForX + ixPerX_p_Two_ie; + + //long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix)*PerX + Two_ie; + long long rev_ixPerX_p_Two_ie = (nx_mi_1 - ix) * PerX + Two_ie; + float* rev_pEX = pEX_StartForX + rev_ixPerX_p_Two_ie; + float* rev_pEZ = pEZ_StartForX + rev_ixPerX_p_Two_ie; + + if (pEX0 != 0) + { + buf = *rev_pEX; *(rev_pEX++) = *pEX; *(pEX++) = buf; + buf = *rev_pEX; *rev_pEX = *pEX; *pEX = buf; + } + if (pEZ0 != 0) + { + buf = *rev_pEZ; *(rev_pEZ++) = *pEZ; *(pEZ++) = buf; + buf = *rev_pEZ; *rev_pEZ = *pEZ; *pEZ = buf; + } + } + } + } + } +} + +void srTSRWRadStructAccessData::MirrorFieldData_GPU(int sx, int sz, TGPUUsageArg* pGPU) //OC03082023 +//void srTSRWRadStructAccessData::MirrorFieldData_GPU(int 
sx, int sz, void* pGpuUsage) +{ + //TGPUUsageArg *pGpuUsage_ = (TGPUUsageArg*)pGpuUsage; //OC03082023 (commented-out) + float *pEX0 = pBaseRadX; + float *pEZ0 = pBaseRadZ; + + if (pEX0 != NULL) + { + pEX0 = (float*)CAuxGPU::ToDevice(pGPU, pEX0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEX0 = (float*)CAuxGPU::ToDevice(pGpuUsage_, pEX0, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEX0); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pEX0); + } + if (pEZ0 != NULL) + { + pEZ0 = (float*)CAuxGPU::ToDevice(pGPU, pEZ0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEZ0 = (float*)CAuxGPU::ToDevice(pGpuUsage_, pEZ0, nz * nx * ne * 2 * sizeof(float)); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, pEZ0); //OC03082023 + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage_, pEZ0); + } + + const int bs = 256; + dim3 blocks(nx / bs + ((nx & (bs - 1)) != 0), nz); + dim3 threads(bs, 1); + + if ((sx > 0) && (sz > 0)) + return; + else if ((sx < 0) && (sz > 0)) + MirrorFieldData_Kernel<0> <<>>(nx, nz, ne, pEX0, pEZ0); + else if ((sx > 0) && (sz < 0)) + MirrorFieldData_Kernel<1> <<>> (nx, nz, ne, pEX0, pEZ0); + else + MirrorFieldData_Kernel<2> <<>> (nx, nz, ne, pEX0, pEZ0); + + if (pEX0 != NULL) + CAuxGPU::MarkUpdated(pGPU, pEX0, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pEX0, true, false); + if (pEZ0 != NULL) + CAuxGPU::MarkUpdated(pGPU, pEZ0, true, false); //OC03082023 + //CAuxGPU::MarkUpdated(pGpuUsage_, pEZ0, true, false); + +#ifdef _DEBUG + if (pEX0 != NULL) + pEX0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pEX0, nz * nx * ne * 2 * sizeof(float)); //OC03082023 + //pEX0 = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pEX0, nz * nx * ne * 2 * sizeof(float)); + if (pEZ0 != NULL) + pEZ0 = (float*)CAuxGPU::ToHostAndFree(pGPU, pEZ0, nz * nx * ne * 2 * sizeof(float)); + //pEZ0 = (float*)CAuxGPU::ToHostAndFree(pGpuUsage_, pEZ0, nz * nx * ne * 2 * sizeof(float)); + cudaStreamSynchronize(0); + //auto err = cudaGetLastError(); 
+ //printf("%s\r\n", cudaGetErrorString(err)); +#endif +} + +#endif \ No newline at end of file diff --git a/cpp/src/core/srstraux.h b/cpp/src/core/srstraux.h index ad7dec02..49aff638 100644 --- a/cpp/src/core/srstraux.h +++ b/cpp/src/core/srstraux.h @@ -203,6 +203,9 @@ struct srTStokesC { struct srTEFieldPtrs { float *pExRe, *pExIm, *pEzRe, *pEzIm; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTEFieldPtrs(float* In_pExRe =0, float* In_pExIm =0, float* In_pEzRe =0, float* In_pEzIm =0) { pExRe = In_pExRe; pExIm = In_pExIm; pEzRe = In_pEzRe; pEzIm = In_pEzIm; @@ -1588,6 +1591,9 @@ struct srTInterpolAux01 { double cAx2z0, cAx2z1, cAx2z2, cAx2z3, cAx3z0, cAx3z1, cAx3z2, cAx3z3; double cLAx1z0, cLAx0z1, cLAx1z1; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTInterpolAux01() { cAx0z1 = 0.1666666667; @@ -1654,10 +1660,18 @@ struct srTInterpolAuxF { float f03, f13, f23, f33; float fAvg, fNorm; + +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void SetUpAvg() { fAvg = (float)(0.0625*(f00 + f10 + f20 + f30 + f01 + f11 + f21 + f31 + f02 + f12 + f22 + f32 + f03 + f13 + f23 + f33)); } + +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void NormalizeByAvg() { const float CritNorm = 1.; @@ -1724,11 +1738,17 @@ struct srTDataPtrsForWfrEdgeCorr { double dxSt, dxFi, dzSt, dzFi, dx, dz; char WasSetup; +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif srTDataPtrsForWfrEdgeCorr() { InitializeAll(); } +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void InitializeAll() { ExpArrXSt = ExpArrXFi = 0; @@ -1747,6 +1767,9 @@ struct srTDataPtrsForWfrEdgeCorr { } WasSetup = 0; } +#ifdef _OFFLOAD_GPU //HG02122023 + GPU_PORTABLE +#endif void DisposeData() { if(ExpArrXSt != 0) delete[] ExpArrXSt; diff --git a/cpp/src/ext/genmath/gmfft.cpp b/cpp/src/ext/genmath/gmfft.cpp index dbea0340..58b766a2 100644 --- a/cpp/src/ext/genmath/gmfft.cpp +++ b/cpp/src/ext/genmath/gmfft.cpp @@ -132,6 +132,23 @@ long CGenMathFFT::LenGoodNum1000s = 11; long 
CGenMathFFT::GoodNum10000s[] = { 0,479,636,743,830,900,960,1017,1064,1109,1150 }; long CGenMathFFT::LenGoodNum10000s = 11; +#ifdef _OFFLOAD_GPU +long CGenMathFFT1D::PlanLen; +long CGenMathFFT1D::dPlanLen; +long CGenMathFFT1D::HowMany; +long CGenMathFFT1D::dHowMany; +cufftHandle CGenMathFFT1D::Plan1DFFT_cu; +cufftHandle CGenMathFFT1D::dPlan1DFFT_cu; +#endif + +#ifdef _OFFLOAD_GPU +long CGenMathFFT2D::PlanNx; +long CGenMathFFT2D::PlanNy; +long CGenMathFFT2D::dPlanNx; +long CGenMathFFT2D::dPlanNy; +cufftHandle CGenMathFFT2D::Plan2DFFT_cu; +cufftHandle CGenMathFFT2D::dPlan2DFFT_cu; +#endif //************************************************************************* void CGenMathFFT::NextCorrectNumberForFFT(long& n) @@ -206,22 +223,38 @@ void CGenMathFFT::NextCorrectNumberForFFT(long& n) } //************************************************************************* - -int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG18072022 +int CGenMathFFT1D::Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC06092023 { - //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; - long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); - float* AuxDataCont = new float[TotAmOfPo]; - if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; - FFT1DInfo.pOutData = AuxDataCont; +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid useless operations / calls at execution on CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, + { + //HG03082022 GPU can do an inplace fft without being given a temporary buffer + FFT1DInfo.pOutData = FFT1DInfo.pInData; + int result; + if(result = Make1DFFT(FFT1DInfo, pvGPU)) return result; //OC06092023 + //if(result = Make1DFFT(FFT1DInfo, pGpuUsage)) return result; + }//) + else 
+#endif + { + //long TotAmOfPo = (FFT1DInfo.Nx << 1)*FFT1DInfo.HowMany; + long long TotAmOfPo = ((long long)(FFT1DInfo.Nx << 1))*((long long)FFT1DInfo.HowMany); + float* AuxDataCont = new float[TotAmOfPo]; + if(AuxDataCont == 0) return MEMORY_ALLOCATION_FAILURE; + FFT1DInfo.pOutData = AuxDataCont; - int result; - if(result = Make1DFFT(FFT1DInfo)) return result; + int result; + if(result = Make1DFFT(FFT1DInfo)) return result; - float *tOut = FFT1DInfo.pInData, *t = AuxDataCont; - for(int ix=0; ix 0)? -1 : 1; @@ -345,164 +408,471 @@ int CGenMathFFT2D::Make2DFFT(CGenMathFFT2DInfo& FFT2DInfo, fftwnd_plan* pPrecrea //if(NeedsShiftBeforeY) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep); if(NeedsShiftBeforeX) {//OC02022019 - if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); + if(m_ArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) + FillArrayShift('x', t0SignMult*x0_Before, FFT2DInfo.xStep, m_dArrayShiftX); } if(NeedsShiftBeforeY) {//OC02022019 - if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); + if(m_ArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) + FillArrayShift('y', t0SignMult*y0_Before, FFT2DInfo.yStep, m_dArrayShiftY); } - if(NeedsShiftBeforeX || NeedsShiftBeforeY) + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); //OC06092023 + else if(dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, 
DataToFFT); + //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + if(NeedsShiftBeforeX || NeedsShiftBeforeY) //HG02112021 { - if(DataToFFT != 0) TreatShifts(DataToFFT); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + { + //GPU_COND(pvGPU, { //OC06092023 + //GPU_COND(pGpuUsage, { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if(DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, NeedsShiftBeforeX, NeedsShiftBeforeY, m_ArrayShiftX, m_ArrayShiftY); + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if(dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + 
CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, NeedsShiftBeforeX, NeedsShiftBeforeY, m_dArrayShiftX, m_dArrayShiftY); + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }//) + else +#endif + { + if(DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif + } } + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; + double Mult = FFT2DInfo.xStep * FFT2DInfo.yStep * FFT2DInfo.ExtraMult; //OC20112017 if(FFT2DInfo.Dir > 0) { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 { - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT, DataToFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else 
Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; + if(DataToFFT != 0) + { + if(pPrecreatedPlan2DFFT == 0) + { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if(Plan2DFFT_cu != NULL) + { + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if(Plan2DFFT_cu == 0) return ERROR_IN_FFT; - fftwf_execute(Plan2DFFT); - } - else if(dDataToFFT != 0) + auto res = cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_FORWARD); +// if (res != CUFFT_SUCCESS) +// printf("CUFFT Error: %d\r\n", res); + } + else if(dDataToFFT != 0) + { + if(pdPrecreatedPlan2DFFT == 0) + { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if(dPlan2DFFT_cu != NULL) + { + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, 0, 0, 0, 0, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); + 
//cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if(dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_FORWARD); + } + }//) + else +#endif { - if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT, dDataToFFT, FFTW_FORWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if(dPlan2DFFT == 0) return ERROR_IN_FFT; + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#if _FFTW3 //OC28012019 - fftw_execute(dPlan2DFFT); - } + for(long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if(DataToFFT != 0) + { + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; + + fftwf_execute(Plan2DFFT); + } + else if(dDataToFFT != 0) + { + if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_FORWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if(dPlan2DFFT == 0) return ERROR_IN_FFT; + + fftw_execute(dPlan2DFFT); + } + } #else - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_FORWARD, FFTW_IN_PLACE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif + } - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + 
if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 { - RepairSignAfter2DFFT(DataToFFT); - RotateDataAfter2DFFT(DataToFFT); - } + if(DataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + //RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, (float)Mult); //OC06092023 + RepairSignAndRotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, (float)Mult); //OC06092023 //HG04122023 + } + else if(dDataToFFT != 0) + { + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + RepairSignAndRotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); //HG04122023 + } + alreadyNormalized = true; + }//) + else +#endif + { + if(DataToFFT != 0) + { + RepairSignAfter2DFFT(DataToFFT); + RotateDataAfter2DFFT(DataToFFT); + } #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) - { - RepairSignAfter2DFFT(dDataToFFT); - RotateDataAfter2DFFT(dDataToFFT); - } + else if(dDataToFFT != 0) + { + RepairSignAfter2DFFT(dDataToFFT); + RotateDataAfter2DFFT(dDataToFFT); + } #endif + } } else { - //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - //OC27102018 - //SY: adopted for OpenMP -#ifdef _FFTW3 //OC28012019 - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 { - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT, DataToFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + 
if(DataToFFT != 0) + { + if(pPrecreatedPlan2DFFT == 0) { + if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny))) //OC06092023 //HG04122023 + //if((Plan2DFFT_cu == NULL) || (!(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany))) //OC06092023 + //if (Plan2DFFT_cu == NULL | !(PlanNx == Nx && PlanNy == Ny && HowMany == FFT2DInfo.howMany)) + { + if(Plan2DFFT_cu != NULL){ + cufftDestroy(Plan2DFFT_cu); + Plan2DFFT_cu = NULL; + } + + PlanNx = Nx; + PlanNy = Ny; + //HowMany = FFT2DInfo.howMany; //HG04122023 (Commented out) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, 1); //HG04122023 + //cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, FFT2DInfo.howMany); + //cufftPlan2d(&Plan2DFFT_cu, Nx, Ny, CUFFT_C2C); + } + } + else Plan2DFFT_cu = *(cufftHandle*)pPrecreatedPlan2DFFT; + if(Plan2DFFT_cu == 0) return ERROR_IN_FFT; + + //RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany); + RotateDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); //HG04122023 + RepairSignAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny); + cufftExecC2C(Plan2DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)DataToFFT, CUFFT_INVERSE); + } + else if(dDataToFFT != 0) + { + if(pdPrecreatedPlan2DFFT == 0) { + if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny))) //OC06092023 //HG04122023 + //if((dPlan2DFFT_cu == NULL) || (!(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany))) //OC06092023 + //if (dPlan2DFFT_cu == NULL | !(dPlanNx == Nx && dPlanNy == Ny && dHowMany == FFT2DInfo.howMany)) + { + if(dPlan2DFFT_cu != NULL){ + cufftDestroy(dPlan2DFFT_cu); + dPlan2DFFT_cu = NULL; + } + + dPlanNx = Nx; + dPlanNy = Ny; + //dHowMany = FFT2DInfo.howMany; //HG04122023 (Commented out) + int plan_shape[2]; plan_shape[0] = Nx; plan_shape[1] = Ny; + 
cufftPlanMany(&Plan2DFFT_cu, 2, plan_shape, NULL, 0, 0, NULL, 0, 0, CUFFT_Z2Z, FFT2DInfo.howMany); + //cufftPlan2d(&dPlan2DFFT_cu, Nx, Ny, CUFFT_Z2Z); + } + } + else dPlan2DFFT_cu = *(cufftHandle*)pdPrecreatedPlan2DFFT; + if(dPlan2DFFT_cu == 0) return ERROR_IN_FFT; + + //RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + //RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany); + RotateDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny); + RepairSignAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny); + cufftExecZ2Z(dPlan2DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dDataToFFT, CUFFT_INVERSE); + } + }//) + else +#endif + { + //Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); + //OC27102018 + //SY: adopted for OpenMP +#ifdef _FFTW3 //OC28012019 + for (long iHowMany = 0; iHowMany < FFT2DInfo.howMany; iHowMany++) + { + long iFFT = Nx * Ny * iHowMany; + if(DataToFFT != 0) + { + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftwf_plan_dft_2d(Ny, Nx, DataToFFT + iFFT, DataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else Plan2DFFT = *pPrecreatedPlan2DFFT; + if(Plan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(DataToFFT); + RepairSignAfter2DFFT(DataToFFT); + fftwf_execute(Plan2DFFT); + } + else if(dDataToFFT != 0) + { + if(pdPrecreatedPlan2DFFT == 0) dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT + iFFT, dDataToFFT + iFFT, FFTW_BACKWARD, FFTW_ESTIMATE); + else dPlan2DFFT = *pdPrecreatedPlan2DFFT; + if(dPlan2DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter2DFFT(dDataToFFT); + RepairSignAfter2DFFT(dDataToFFT); + fftw_execute(dPlan2DFFT); + } + } +#else + if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); else Plan2DFFT = *pPrecreatedPlan2DFFT; if(Plan2DFFT == 0) return ERROR_IN_FFT; RotateDataAfter2DFFT(DataToFFT); RepairSignAfter2DFFT(DataToFFT); - fftwf_execute(Plan2DFFT); - } - else if(dDataToFFT != 0) - { - if(pdPrecreatedPlan2DFFT == 0) 
dPlan2DFFT = fftw_plan_dft_2d(Ny, Nx, dDataToFFT, dDataToFFT, FFTW_BACKWARD, FFTW_ESTIMATE); - else dPlan2DFFT = *pdPrecreatedPlan2DFFT; - if(dPlan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(dDataToFFT); - RepairSignAfter2DFFT(dDataToFFT); - fftw_execute(dPlan2DFFT); - } -#else - if(pPrecreatedPlan2DFFT == 0) Plan2DFFT = fftw2d_create_plan(Ny, Nx, FFTW_BACKWARD, FFTW_IN_PLACE); - else Plan2DFFT = *pPrecreatedPlan2DFFT; - if(Plan2DFFT == 0) return ERROR_IN_FFT; - RotateDataAfter2DFFT(DataToFFT); - RepairSignAfter2DFFT(DataToFFT); - fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); + fftwnd(Plan2DFFT, 1, DataToFFT, 1, 0, DataToFFT, 1, 0); #endif + } } - //double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep; - double Mult = FFT2DInfo.xStep*FFT2DInfo.yStep*FFT2DInfo.ExtraMult; //OC20112017 - - if(DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); + if(!alreadyNormalized){ +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + //if (DataToFFT != 0) + // NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + //else if (dDataToFFT != 0) + // NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, Mult); + if(DataToFFT != 0) //HG04122023 + NormalizeDataAfter2DFFT_GPU((float*)DataToFFT, Nx, Ny, Mult); + else if(dDataToFFT != 0) + NormalizeDataAfter2DFFT_GPU((double*)dDataToFFT, Nx, Ny, Mult); + }//) + else +#endif + { + if(DataToFFT != 0) NormalizeDataAfter2DFFT(DataToFFT, Mult); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); + else if(dDataToFFT != 0) NormalizeDataAfter2DFFT(dDataToFFT, Mult); #endif + } + } //if(NeedsShiftAfterX) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr); //if(NeedsShiftAfterY) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr); - if(NeedsShiftAfterX) + + 
if(NeedsShiftAfterX) {//OC02022019 - if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult*x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); + if(m_ArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift('x', t0SignMult * x0_After, FFT2DInfo.xStepTr, m_dArrayShiftX); } - if(NeedsShiftAfterY) + if(NeedsShiftAfterY) {//OC02022019 - if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); - else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult*y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); + if(m_ArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_ArrayShiftY); + else if(m_dArrayShiftY != 0) FillArrayShift('y', t0SignMult * y0_After, FFT2DInfo.yStepTr, m_dArrayShiftY); } - if(NeedsShiftAfterX || NeedsShiftAfterY) + if(NeedsShiftAfterX || NeedsShiftAfterY) { - if(DataToFFT != 0) TreatShifts(DataToFFT); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG18072022 + { + TGPUUsageArg *pGPU = (TGPUUsageArg*)pvGPU; + if(DataToFFT != 0) { + m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_ArrayShiftY); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), false); + //m_ArrayShiftY = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftX); + 
//CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_ArrayShiftY); + //TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); + TreatShifts2D_GPU((float*)DataToFFT, Nx, Ny, NeedsShiftAfterX, NeedsShiftAfterY, m_ArrayShiftX, m_ArrayShiftY); //HG04122023 + m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //OC06092023 + m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGPU, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftY = (float*)CAuxGPU::ToHostAndFree(pGpuUsage, m_ArrayShiftY, (Ny << 1) * sizeof(float), true); + } + else if(dDataToFFT != 0) { + m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftX); + CAuxGPU::EnsureDeviceMemoryReady(pGPU, m_dArrayShiftY); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), false); + //m_dArrayShiftY = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), false); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftX); + //CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, m_dArrayShiftY); + //TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, FFT2DInfo.howMany, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); + TreatShifts2D_GPU((double*)dDataToFFT, Nx, Ny, NeedsShiftAfterX, NeedsShiftAfterY, m_dArrayShiftX, m_dArrayShiftY); //HG04122023 + m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //OC06092023 + m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGPU, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + 
//m_dArrayShiftX = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftY = (double*)CAuxGPU::ToHostAndFree(pGpuUsage, m_dArrayShiftY, (Ny << 1) * sizeof(double), true); + } + }//) + else +#endif + { + if(DataToFFT != 0) TreatShifts(DataToFFT); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 + else if(dDataToFFT != 0) TreatShifts(dDataToFFT); //OC02022019 #endif + } } //OC_NERSC: to comment-out the following line for NERSC (to avoid crash with "python-mpi") //fftwnd_destroy_plan(Plan2DFFT); //OC27102018 //SY: adopted for OpenMP - -#ifdef _FFTW3 //OC28012019 - if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG02112021 { - if(pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); - } - else if(dDataToFFT != 0) //OC03022019 + if(FFT2DInfo.pData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, DataToFFT, true, false); //OC06092023 + //CAuxGPU::MarkUpdated(pGpuUsage, DataToFFT, true, false); + } + else if(FFT2DInfo.pdData != 0) + { + CAuxGPU::MarkUpdated((TGPUUsageArg*)pvGPU, dDataToFFT, true, false); //OC06092023 + //CAuxGPU::MarkUpdated(pGpuUsage, dDataToFFT, true, false); + } + }//) + else +#endif { - if(pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); - } +#if _FFTW3 //OC28012019 + if(DataToFFT != 0) + { + if(pPrecreatedPlan2DFFT == 0) fftwf_destroy_plan(Plan2DFFT); + } + else if(dDataToFFT != 0) //OC03022019 + { + if(pdPrecreatedPlan2DFFT == 0) fftw_destroy_plan(dPlan2DFFT); + } #else - if(pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); + if(pPrecreatedPlan2DFFT == 0) fftwnd_destroy_plan(Plan2DFFT); #endif + } //if(ArrayShiftX != 0) { delete[] ArrayShiftX; ArrayShiftX = 0;} //if(ArrayShiftY != 0) { delete[] ArrayShiftY; ArrayShiftY = 0;} - if(m_ArrayShiftX != 0) { delete[] 
m_ArrayShiftX; m_ArrayShiftX = 0;} - if(m_ArrayShiftY != 0) { delete[] m_ArrayShiftY; m_ArrayShiftY = 0;} - if(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX; m_dArrayShiftX = 0;} //OC02022019 - if(m_dArrayShiftY != 0) { delete[] m_dArrayShiftY; m_dArrayShiftY = 0;} - + if(m_ArrayShiftX != 0) { delete[] m_ArrayShiftX;} + if(m_ArrayShiftY != 0) { delete[] m_ArrayShiftY;} + if(m_dArrayShiftX != 0) { delete[] m_dArrayShiftX;} //OC02022019 + if(m_dArrayShiftY != 0) { delete[] m_dArrayShiftY;} + return 0; } //************************************************************************* //Forward FFT: Int f(x)*exp(-i*2*Pi*qx*x)dx //Backward FFT: Int f(qx)*exp(i*2*Pi*qx*x)dqx -int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) +//int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, gpuUsageArg *pGpuUsage) //HG20012022 +int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU) //OC05092023 {// Assumes Nx, Ny even ! //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //double start; @@ -535,254 +905,454 @@ int CGenMathFFT1D::Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo) { m_ArrayShiftX = new float[Nx << 1]; if(m_ArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 (check for memory leak / misuse!) 
+ m_ArrayShiftX = (float*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); + //m_ArrayShiftX = (float*)CAuxGPU::ToDevice(pGpuUsage, m_ArrayShiftX, (Nx << 1) * sizeof(float), true); //HG20012022 +#endif } else if(FFT1DInfo.pdInData != 0) { m_dArrayShiftX = new double[Nx << 1]; if(m_dArrayShiftX == 0) return MEMORY_ALLOCATION_FAILURE; + +#ifdef _OFFLOAD_GPU //OC05092023 + m_dArrayShiftX = (double*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); + //m_dArrayShiftX = (double*)CAuxGPU::ToDevice(pGpuUsage, m_dArrayShiftX, (Nx << 1) * sizeof(double), true); //HG20012022 +#endif } } #ifdef _FFTW3 //OC28012019 fftwf_plan Plan1DFFT; - fftwf_complex *DataToFFT=0, *OutDataFFT=0; //, *pOutDataFFT=0; + fftwf_complex* DataToFFT = 0, * OutDataFFT = 0; //, *pOutDataFFT=0; fftw_plan dPlan1DFFT; - fftw_complex *dDataToFFT=0, *dOutDataFFT=0; //, *pdOutDataFFT=0; + fftw_complex* dDataToFFT = 0, * dOutDataFFT = 0; //, *pdOutDataFFT=0; +#endif - if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) +//HG20012022 +//#ifdef _DEBUG +// if (pGpuUsage != NULL) +// printf ("GPU: Make1DFFT\n"); +//#endif +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, //OC06092023 + //GPU_COND(pGpuUsage, //HG20012022 { - DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); - OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); - //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call - } - else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); //OC06092023 + OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), 
true); + //DataToFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float)); + //OutDataFFT = (fftwf_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(float), true); + } + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); //OC06092023 + dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice((TGPUUsageArg*)pvGPU, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + //dDataToFFT = (fftw_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pdInData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double)); + //dOutDataFFT = (fftw_complex*)CAuxGPU::ToDevice(pGpuUsage, FFT1DInfo.pdOutData, FFT1DInfo.Nx * FFT1DInfo.HowMany * 2 * sizeof(double), true); + } + }//) + else +#endif { - dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); - dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); - //pdOutDataFFT = dOutDataFFT; - } +#ifdef _FFTW3 //OC28012019 + if((FFT1DInfo.pInData != 0) && (FFT1DInfo.pOutData != 0)) + { + DataToFFT = (fftwf_complex*)(FFT1DInfo.pInData); + OutDataFFT = (fftwf_complex*)(FFT1DInfo.pOutData); + //pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + } + else if((FFT1DInfo.pdInData != 0) && (FFT1DInfo.pdOutData != 0)) + { + dDataToFFT = (fftw_complex*)(FFT1DInfo.pdInData); + dOutDataFFT = (fftw_complex*)(FFT1DInfo.pdOutData); + //pdOutDataFFT = dOutDataFFT; + } #else - fftw_plan Plan1DFFT; - FFTW_COMPLEX *DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); - FFTW_COMPLEX *OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); - FFTW_COMPLEX *pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call -/** - Pointed-out by Sergey Yakubov (E-XFEL). 
- From FFTW 2.1.5 docs: - void fftw(fftw_plan plan, int howmany, - fftw_complex *in, int istride, int idist, - fftw_complex *out, int ostride, int odist); - ... - out, ostride and odist describe the output array(s). The format is the same as for the input array. - In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. - If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, - that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. - In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). -**/ -#endif - - char t0SignMult = (FFT1DInfo.Dir > 0)? -1 : 1; - if(NeedsShiftBeforeX) + fftw_plan Plan1DFFT; + FFTW_COMPLEX* DataToFFT = (FFTW_COMPLEX*)(FFT1DInfo.pInData); + FFTW_COMPLEX* OutDataFFT = (FFTW_COMPLEX*)(FFT1DInfo.pOutData); + FFTW_COMPLEX* pOutDataFFT = OutDataFFT; //OC03092016 to be used solely in fftw call + /** + Pointed-out by Sergey Yakubov (E-XFEL). + From FFTW 2.1.5 docs: + void fftw(fftw_plan plan, int howmany, + fftw_complex *in, int istride, int idist, + fftw_complex *out, int ostride, int odist); + ... + out, ostride and odist describe the output array(s). The format is the same as for the input array. + In-place transforms: If the plan specifies an in-place transform, ostride and odist are always ignored. + If out is NULL, out is ignored, too. Otherwise, out is interpreted as a pointer to an array of n complex numbers, + that FFTW will use as temporary space to perform the in-place computation. out is used as scratch space and its contents destroyed. + In this case, out must be an ordinary array whose elements are contiguous in memory (no striding). 
+ **/ +#endif + } + +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, DataToFFT); + else if(dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady((TGPUUsageArg*)pvGPU, dDataToFFT); + //if (DataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, DataToFFT); + //else if (dDataToFFT != 0) CAuxGPU::EnsureDeviceMemoryReady(pGpuUsage, dDataToFFT); +#endif + + char t0SignMult = (FFT1DInfo.Dir > 0) ? -1 : 1; + if(NeedsShiftBeforeX) { - //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); - if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); - else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 + { + if(m_ArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift_GPU(t0SignMult * x0_Before, FFT1DInfo.xStep, Nx, m_dArrayShiftX); - if(DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); + if(DataToFFT != 0) TreatShift_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx, m_ArrayShiftX); + else if(dDataToFFT != 0) TreatShift_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx, m_dArrayShiftX); + }//) + else +#endif + { + //FillArrayShift(t0SignMult*x0_Before, FFT1DInfo.xStep); + if(m_ArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_ArrayShiftX); + else if(m_dArrayShiftX != 0) FillArrayShift(t0SignMult * x0_Before, FFT1DInfo.xStep, m_dArrayShiftX); + + if(DataToFFT != 0) TreatShift(DataToFFT, FFT1DInfo.HowMany); #ifdef _FFTW3 //OC27022019 - else if(dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); + else if(dDataToFFT != 0) TreatShift(dDataToFFT, FFT1DInfo.HowMany); #endif + } } //Added by 
S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: //srwlPrintTime("::Make1DFFT : before fft",&start); int flags = FFTW_ESTIMATE; //OC30012019 + bool alreadyNormalized = false; //HG17032022 + //double Mult = FFT1DInfo.xStep; + double Mult = FFT1DInfo.xStep * FFT1DInfo.MultExtra; - if(FFT1DInfo.Dir > 0) + if(FFT1DInfo.Dir > 0) //HG17112021 { - //int flags = FFTW_ESTIMATE; +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, + { + int arN[] = { (int)Nx }; //OC14052020 + if(DataToFFT != 0) + { + if(PlanLen != Nx) { + PlanLen = Nx; + if(Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if(Plan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_FORWARD); + } + else if(dDataToFFT != 0) //OC02022019 + { + if(dPlanLen != Nx) { + if(dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + dPlanLen = Nx; + cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if(dPlan1DFFT_cu == 0) return ERROR_IN_FFT; + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_FORWARD); + } + }//) + else +#endif + { + //int flags = FFTW_ESTIMATE; #ifdef _FFTW3 //OC28012019 #ifdef _WITH_OMP //Still needs to be tested! 
- if(DataToFFT != 0) - { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if(dDataToFFT != 0) //OC02022019 - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } + if(DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if(dDataToFFT != 0) //OC02022019 + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } #endif //ifndef _WITH_OMP - - int arN[] = {(int)Nx}; //OC14052020 - //int arN[] = {Nx}; - if(DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 - if(Plan1DFFT == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if(dDataToFFT != 0) //OC02022019 - { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); - if(dPlan1DFFT == 0) return ERROR_IN_FFT; - fftw_execute(dPlan1DFFT); - } + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if(DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); //OC02022019 + if(Plan1DFFT == 
0) return ERROR_IN_FFT; + fftwf_execute(Plan1DFFT); + } + else if(dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_FORWARD, flags); + if(dPlan1DFFT == 0) return ERROR_IN_FFT; + fftw_execute(dPlan1DFFT); + } #else //ifndef _FFTW3 - if(DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); - if(Plan1DFFT == 0) return ERROR_IN_FFT; + if(DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_FORWARD, flags); + if(Plan1DFFT == 0) return ERROR_IN_FFT; - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir>0",&start); #ifndef _WITH_OMP //OC27102018 //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) - #pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for(int i=0; i0",&start); - if(OutDataFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 { - RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); - } - -#ifdef _FFTW3 //OC27022019 - else if(dOutDataFFT != 0) + if(OutDataFFT != 0) + { + 
RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, (float)Mult); //OC06092023 + //RepairAndRotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((float*)OutDataFFT, FFT1DInfo.HowMany, Nx); + } + else if(dOutDataFFT != 0) + { + RepairAndRotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx, Mult); + //RepairSignAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + //RotateDataAfter1DFFT_GPU((double*)dOutDataFFT, FFT1DInfo.HowMany, Nx); + } + alreadyNormalized = true; + }//) + else +#endif { - RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); - } + if(OutDataFFT != 0) + { + RepairSignAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(OutDataFFT, FFT1DInfo.HowMany); + } +#ifdef _FFTW3 //OC27022019 + else if(dOutDataFFT != 0) + { + RepairSignAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT(dOutDataFFT, FFT1DInfo.HowMany); + } #endif + } } else { //int flags = FFTW_ESTIMATE; //OC30012019 (commented-out) -#ifdef _FFTW3 //OC28012019 -#ifdef _WITH_OMP - - //Still needs to be tested! 
- if(DataToFFT != 0) +#ifdef _OFFLOAD_GPU //OC06092023 (to avoid #include "auxgpu.h" for CPU) + if(CAuxGPU::GPUEnabled((TGPUUsageArg*)pvGPU)) //HG04122023 + //GPU_COND(pvGPU, + //GPU_COND(pGpuUsage, //HG20012022 { - fftwf_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftwf_plan_with_nthreads(nthreads); - } - else if(dDataToFFT != 0) - { - fftw_init_threads(); //initialize threading support - int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available - fftw_plan_with_nthreads(nthreads); - } - -#endif - - int arN[] = {(int)Nx}; //OC14052020 - //int arN[] = {Nx}; - if(DataToFFT != 0) - { - //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 - if(Plan1DFFT == 0) return ERROR_IN_FFT; + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if(DataToFFT != 0) + { + if(PlanLen != Nx) { + PlanLen = Nx; + HowMany = FFT1DInfo.HowMany; + if(Plan1DFFT_cu != NULL) + { + cufftDestroy(Plan1DFFT_cu); + Plan1DFFT_cu = NULL; + } + cufftPlanMany(&Plan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_C2C, FFT1DInfo.HowMany); + } + if(Plan1DFFT_cu == 0) return ERROR_IN_FFT; - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RotateDataAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((float*)DataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecC2C(Plan1DFFT_cu, (cufftComplex*)DataToFFT, (cufftComplex*)OutDataFFT, CUFFT_INVERSE); + } + else if(dDataToFFT != 0) //OC02022019 + { + if(dPlanLen != Nx) + { + dPlanLen = Nx; + dHowMany = FFT1DInfo.HowMany; + if(dPlan1DFFT_cu != NULL) + { + cufftDestroy(dPlan1DFFT_cu); + dPlan1DFFT_cu = NULL; + } + 
cufftPlanMany(&dPlan1DFFT_cu, 1, arN, NULL, 1, Nx, NULL, 1, Nx, CUFFT_Z2Z, FFT1DInfo.HowMany); + } + if(dPlan1DFFT_cu == 0) return ERROR_IN_FFT; - fftwf_execute(Plan1DFFT); - } - else if(dDataToFFT != 0) //OC02022019 + RotateDataAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + RepairSignAfter1DFFT_GPU((double*)dDataToFFT, FFT1DInfo.HowMany, Nx); + cufftExecZ2Z(dPlan1DFFT_cu, (cufftDoubleComplex*)dDataToFFT, (cufftDoubleComplex*)dOutDataFFT, CUFFT_INVERSE); + } + }//) + else +#endif { - dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); - if(dPlan1DFFT == 0) return ERROR_IN_FFT; +#ifdef _FFTW3 //OC28012019 +#ifdef _WITH_OMP - RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); - RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + //Still needs to be tested! + if(DataToFFT != 0) + { + fftwf_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftwf_plan_with_nthreads(nthreads); + } + else if(dDataToFFT != 0) + { + fftw_init_threads(); //initialize threading support + int nthreads = omp_get_max_threads(); //detect number of OpenMP threads that are available + fftw_plan_with_nthreads(nthreads); + } - fftw_execute(dPlan1DFFT); - } +#endif + int arN[] = { (int)Nx }; //OC14052020 + //int arN[] = {Nx}; + if(DataToFFT != 0) + { + //Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, pOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + Plan1DFFT = fftwf_plan_many_dft(1, arN, FFT1DInfo.HowMany, DataToFFT, NULL, 1, Nx, OutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); //OC02022019 + if(Plan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + fftwf_execute(Plan1DFFT); + } + else if(dDataToFFT != 0) //OC02022019 + { + dPlan1DFFT = fftw_plan_many_dft(1, arN, FFT1DInfo.HowMany, 
dDataToFFT, NULL, 1, Nx, dOutDataFFT, NULL, 1, Nx, FFTW_BACKWARD, flags); + if(dPlan1DFFT == 0) return ERROR_IN_FFT; + RotateDataAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + RepairSignAfter1DFFT(dDataToFFT, FFT1DInfo.HowMany); + fftw_execute(dPlan1DFFT); + } #else //ifndef _FFTW3 - if(DataToFFT == OutDataFFT) - { - flags |= FFTW_IN_PLACE; - pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) - } - Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); - if(Plan1DFFT == 0) return ERROR_IN_FFT; + if(DataToFFT == OutDataFFT) + { + flags |= FFTW_IN_PLACE; + pOutDataFFT = 0; //OC03092016 (see FFTW 2.1.5 doc clause above) + } + Plan1DFFT = fftw_create_plan(Nx, FFTW_BACKWARD, flags); + if(Plan1DFFT == 0) return ERROR_IN_FFT; - //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: - //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); + //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP: + //srwlPrintTime("::Make1DFFT : fft create plan dir<0",&start); - RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); + RotateDataAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : rotate dir<0",&start); - RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); - //srwlPrintTime("::Make1DFFT : repair dir<0",&start); + RepairSignAfter1DFFT(DataToFFT, FFT1DInfo.HowMany); + //srwlPrintTime("::Make1DFFT : repair dir<0",&start); #ifndef _WITH_OMP //OC27102018 //fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, OutDataFFT, 1, Nx); - fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 + fftw(Plan1DFFT, FFT1DInfo.HowMany, DataToFFT, 1, Nx, pOutDataFFT, 1, Nx); //OC03092016 #else //OC27102018 //SY: split one call into many (for OpenMP) - #pragma omp parallel for if (omp_get_num_threads()==1) // to avoid nested multi-threading (just in case) - for(int i=0; i +#include +#include +#include "gmfft.h" + +#define GMFFT_BLOCK_SIZE 256 
+ +template __global__ void RepairSignAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 4 + 2; //Nx range + + if (ix < Nx2) + { + for (long k = 0; k < HowMany; k++) + { + pAfterFFT[ix + k * Nx2] = -pAfterFFT[ix + k * Nx2]; + pAfterFFT[ix + k * Nx2 + 1] = -pAfterFFT[ix + k * Nx2 + 1]; + } + } +} + +template __global__ void RotateDataAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2, long Nx) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //HalfNx range + + if (ix < Nx) + { + for (long k = 0; k < HowMany; k++) + { + T t1_0 = pAfterFFT[ix + Nx2 * k]; + T t1_1 = pAfterFFT[ix + Nx2 * k + 1]; + + pAfterFFT[ix + Nx2 * k] = pAfterFFT[ix + Nx + Nx2 * k]; + pAfterFFT[ix + Nx2 * k + 1] = pAfterFFT[ix + Nx + Nx2 * k + 1]; + pAfterFFT[ix + Nx + Nx2 * k] = t1_0; + pAfterFFT[ix + Nx + Nx2 * k + 1] = t1_1; + } + } +} + +template __global__ void RepairAndRotateAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx, float Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + + long HalfNx = Nx / 2; + long Nx2 = Nx * 2; + if (ix < HalfNx) + { + float sx0 = 1 - 2 * (ix % 2); + float sx1 = 1 - 2 * ((HalfNx + ix) % 2); + + float s1 = sx0 * Mult; + float s2 = sx1 * Mult; + + int idx = ix * 2; + for (long i = 0; i < HowMany; i++){ + T* t1 = pAfterFFT + i * Nx2, *t2 = pAfterFFT + (HalfNx) * 2 + i * Nx2; + + T buf_r = t1[idx] * s1; + T buf_im = t1[idx + 1] * s1; + + t1[idx] = t2[idx] * s2; + t1[idx + 1] = t2[idx + 1] * s2; + + t2[idx] = buf_r; + t2[idx + 1] = buf_im; + } + } +} + +template __global__ void NormalizeDataAfter1DFFT_Kernel(T* pAfterFFT, long HowMany, long Nx2, T Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2) + { + for (long i = 0; i < HowMany; i++) { + pAfterFFT[ix + i * Nx2] *= Mult; + pAfterFFT[ix + i * Nx2 + 1] *= Mult; + } + } +} + +template __global__ void FillArrayShift_Kernel(double t0, double tStep, long N, T* arShiftX) +{ 
+ int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + + double t0TwoPi = t0 * 2 * CUDART_PI; + double q = tStep * ix; + + if (ix < N) + { + if (ix == 0) { + arShiftX[N] = 1.0; + arShiftX[N + 1] = 0.0; + } + + ix *= 2; + if (ix < N - 2) + { + sincos(q * t0TwoPi, &arShiftX[N + 2 + 1 + ix], &arShiftX[N + 2 + ix]); + arShiftX[N - 2 - ix] = arShiftX[N + 2 + ix]; + arShiftX[N - 1 - ix] = -arShiftX[N + 2 + 1 + ix]; + } + + if (ix == N - 2) + { + sincos(-q * t0TwoPi, &arShiftX[1], &arShiftX[0]); + } + } +} + +template __global__ void TreatShift_Kernel(T* pData, long HowMany, long Nx2, T* tShiftX) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + + if (ix < Nx2) + { + T MultX_Re = tShiftX[ix]; + T MultX_Im = tShiftX[ix + 1]; + + for (long k = 0; k < HowMany; k++) + { + T buf_r = pData[ix + k * Nx2]; + T buf_im = pData[ix + k * Nx2 + 1]; + + T NewRe = buf_r * MultX_Re - buf_im * MultX_Im; + T NewIm = buf_r * MultX_Im + buf_im * MultX_Re; + pData[ix + k * Nx2] = NewRe; + pData[ix + k * Nx2 + 1] = NewIm; + } + } +} + +void CGenMathFFT1D::RepairSignAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::RotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Nx); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::RepairAndRotateDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, 
long Nx, float Mult) +{ + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif + + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::NormalizeDataAfter1DFFT_GPU(float* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, (float)Mult); //OC06092023 + //NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::FillArrayShift_GPU(double t0, double tStep, long Nx, float* tShiftX) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + FillArrayShift_Kernel << > > (t0, tStep, Nx, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::TreatShift_GPU(float* pData, long HowMany, long Nx, float* tShiftX) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + TreatShift_Kernel << > > (pData, HowMany, Nx * 2, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::RepairSignAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +{ + + dim3 
blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::RotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Nx); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::RepairAndRotateDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + (((Nx / 2) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, (float)Mult); //OC06092023 (check why it's not ..T Mult..) 
+ //RepairAndRotateAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::NormalizeDataAfter1DFFT_GPU(double* pAfterFFT, long HowMany, long Nx, double Mult) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter1DFFT_Kernel << > > (pAfterFFT, HowMany, Nx * 2, Mult); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::FillArrayShift_GPU(double t0, double tStep, long Nx, double* tShiftX) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx & (2 * GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + FillArrayShift_Kernel << > > (t0, tStep, Nx, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + +void CGenMathFFT1D::TreatShift_GPU(double* pData, long HowMany, long Nx, double* tShiftX) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0)); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + TreatShift_Kernel << > > (pData, HowMany, Nx * 2, tShiftX); + +//#ifdef _DEBUG +// cudaStreamSynchronize(0); +// auto err = cudaGetLastError(); +// printf("%s\r\n", cudaGetErrorString(err)); +//#endif +} + + +template __global__ void RepairSignAfter2DFFT_Kernel(T* pAfterFFT, long Nx, long Ny) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //Nx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range + + float sx0 = 1 - 2 * (ix % 2); + float sy0 = 1 - 2 * (iy % 2); + float s = sx0 * sy0; + + if (ix < Nx && iy < Ny) + { + pAfterFFT[(ix + iy * Nx) * 2] *= s; + pAfterFFT[(ix + iy * Nx) * 2 + 1] *= s; + } +} + +template __global__ void RotateDataAfter2DFFT_Kernel(T* 
pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range + + if (ix < HalfNx && iy < HalfNy) + { + int idx = (ix + iy * Nx) * 2; + long long HalfNyNx = ((long long)HalfNy) * ((long long)Nx); + T* t1 = pAfterFFT, *t2 = pAfterFFT + (HalfNyNx + HalfNx) * 2; + T* t3 = pAfterFFT + HalfNx * 2, *t4 = pAfterFFT + HalfNyNx * 2; + + T buf_r = t1[idx]; + T buf_im = t1[idx + 1]; + t1[idx] = t2[idx]; + t1[idx + 1] = t2[idx + 1]; + + t2[idx] = buf_r; + t2[idx + 1] = buf_im; + + buf_r = t3[idx]; + buf_im = t3[idx + 1]; + t3[idx] = t4[idx]; + t3[idx + 1] = t4[idx + 1]; + + t4[idx] = buf_r; + t4[idx + 1] = buf_im; + } +} + +template __global__ void RepairSignAndRotateDataAfter2DFFT_Kernel(T* pAfterFFT, long HalfNx, long Nx, long HalfNy, long Ny, T2 Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x); //HalfNx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //HalfNy range + + if (ix < HalfNx) + { + float sx0 = 1.f - 2.f * (ix % 2); + float sy0 = 1.f - 2.f * (iy % 2); + float sx1 = 1.f - 2.f * ((HalfNx + ix) % 2); + float sy1 = 1.f - 2.f * ((HalfNy + iy) % 2); + + float s1 = sx0 * sy0 * Mult; + float s2 = sx1 * sy1 * Mult; + float s3 = sx1 * sy0 * Mult; + float s4 = sx0 * sy1 * Mult; + + int idx = (ix + iy * Nx); + + long long HalfNyNx = ((long long)HalfNy) * ((long long)Nx); + T* t1 = pAfterFFT, *t2 = pAfterFFT + (HalfNyNx + HalfNx); + T* t3 = pAfterFFT + HalfNx, *t4 = pAfterFFT + HalfNyNx; + + T buf1 = t1[idx]; + buf1.x *= s1; + buf1.y *= s1; + + T buf2 = t2[idx]; + buf2.x *= s2; + buf2.y *= s2; + + t1[idx] = buf2; + t2[idx] = buf1; + + buf1 = t3[idx]; + buf1.x *= s3; + buf1.y *= s3; + + buf2 = t4[idx]; + buf2.x *= s4; + buf2.y *= s4; + + t3[idx] = buf2; + t4[idx] = buf1; + } +} + +template __global__ void NormalizeDataAfter2DFFT_Kernel(T* pAfterFFT, long Nx2Ny2, long n, T Mult) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) 
* 2; //Nx range + + if (ix < Nx2Ny2) + { + pAfterFFT[ix] *= Mult; + pAfterFFT[ix + 1] *= Mult; + } +} + +template __global__ void TreatShift2D_Kernel(T* pData, long Nx2, long Ny, T* tShiftX, T* tShiftY) +{ + int ix = (blockIdx.x * blockDim.x + threadIdx.x) * 2; //Nx range + int iy = (blockIdx.y * blockDim.y + threadIdx.y); //Ny range + + if (ix < Nx2) + { + T MultRe = 1; + T MultIm = 0; + + T MultX_Re = 1; + T MultX_Im = 0; + + T MultY_Re = 1; + T MultY_Im = 0; + + if (NeedsShiftY) + { + MultY_Re = tShiftY[iy * 2]; + MultY_Im = tShiftY[iy * 2 + 1]; + } + if (NeedsShiftX) + { + MultX_Re = tShiftX[ix]; + MultX_Im = tShiftX[ix + 1]; + + if (NeedsShiftY) + { + MultRe = MultX_Re * MultY_Re - MultX_Im * MultY_Im; + MultIm = MultX_Re * MultY_Im + MultX_Im * MultY_Re; + } + else + { + MultRe = MultX_Re; + MultIm = MultX_Im; + } + } + else + { + MultRe = MultY_Re; + MultIm = MultY_Im; + } + + long offset = iy * Nx2 + ix; + T buf_r = pData[offset]; + T buf_im = pData[offset + 1]; + T NewRe = buf_r * MultRe - buf_im * MultIm; + T NewIm = buf_r * MultIm + buf_im * MultRe; + pData[offset] = NewRe; + pData[offset + 1] = NewIm; + } +} + +void CGenMathFFT2D::RepairSignAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny); +} + +void CGenMathFFT2D::RotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny); +} + +void CGenMathFFT2D::RepairSignAndRotateDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, float Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > 
((float2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Mult); +} + +void CGenMathFFT2D::NormalizeDataAfter2DFFT_GPU(float* pAfterFFT, long Nx, long Ny, double Mult) +{ + + dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, 1, (float)Mult); //OC06092023 + //NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2, howMany,1, Mult); +} + +void CGenMathFFT2D::TreatShifts2D_GPU(float* pData, long Nx, long Ny, bool NeedsShiftX, bool NeedsShiftY, float* m_ArrayShiftX, float* m_ArrayShiftY) +{ + + dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); +} + +void CGenMathFFT2D::RepairSignAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny) +{ + + dim3 blocks(Nx / GMFFT_BLOCK_SIZE + ((Nx & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAfter2DFFT_Kernel << > > (pAfterFFT, Nx, Ny); +} + +void CGenMathFFT2D::RotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RotateDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx / 2, Nx, Ny / 2, Ny); +} + +void CGenMathFFT2D::RepairSignAndRotateDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, double Mult) +{ + + dim3 blocks(Nx / (2 * GMFFT_BLOCK_SIZE) + ((Nx / 2 & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny/2); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + RepairSignAndRotateDataAfter2DFFT_Kernel << > > ((double2*)pAfterFFT, Nx / 2, Nx, Ny / 2, Ny, Mult); +} + +void 
CGenMathFFT2D::NormalizeDataAfter2DFFT_GPU(double* pAfterFFT, long Nx, long Ny, double Mult) +{ + + dim3 blocks((Nx * Ny) / GMFFT_BLOCK_SIZE + (((Nx * Ny) & (GMFFT_BLOCK_SIZE - 1)) != 0), 1); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + NormalizeDataAfter2DFFT_Kernel << > > (pAfterFFT, Nx * Ny * 2,1, Mult); +} + +void CGenMathFFT2D::TreatShifts2D_GPU(double* pData, long Nx, long Ny, bool NeedsShiftX, bool NeedsShiftY, double* m_ArrayShiftX, double* m_ArrayShiftY) +{ + + dim3 blocks((Nx) / GMFFT_BLOCK_SIZE + (((Nx) & (GMFFT_BLOCK_SIZE - 1)) != 0), Ny); + dim3 threads(GMFFT_BLOCK_SIZE, 1); + + if (NeedsShiftX && NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftX) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); + else if (NeedsShiftY) TreatShift2D_Kernel << > > (pData, Nx * 2, Ny, m_ArrayShiftX, m_ArrayShiftY); +} +#endif \ No newline at end of file diff --git a/cpp/src/ext/genmath/gmmeth.h b/cpp/src/ext/genmath/gmmeth.h index 6388ae26..619a7d01 100644 --- a/cpp/src/ext/genmath/gmmeth.h +++ b/cpp/src/ext/genmath/gmmeth.h @@ -18,6 +18,10 @@ #include "gmobj.h" #endif +#ifdef _OFFLOAD_GPU //HG04122023 +#include "auxgpu.h" +#endif + #include "gmvect.h" #include #include @@ -163,7 +167,11 @@ class CGenMathMeth //static double Integ1D_FuncDefByArray(double* FuncArr, long Np, double Step); //static double Integ1D_FuncDefByArray(float* FuncArr, long Np, double Step); //template static double Integ1D_FuncDefByArray(T* FuncArr, long Np, double Step) +#ifdef _OFFLOAD_GPU //HG04122023 + template GPU_PORTABLE static double Integ1D_FuncDefByArray(T* FuncArr, long long Np, double Step) +#else template static double Integ1D_FuncDefByArray(T* FuncArr, long long Np, double Step) +#endif { if((FuncArr == 0) || (Np < 2) || (Step == 0)) return 0; //if(Np == 2) return (double)(0.5*(FuncArr[0] + FuncArr[1])); diff --git a/cpp/src/ext/utils/utidev.cpp b/cpp/src/ext/utils/utidev.cpp deleted file mode 100644 
index 3a2057f1..00000000 --- a/cpp/src/ext/utils/utidev.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/************************************************************************//** - * File: utidev.cpp - * Description: Auxiliary utilities to support GPU management - * - * @author H.Goel - * @version 0.1 - ***************************************************************************/ - -#include -#include -#include - -#ifdef _OFFLOAD_GPU -#include -#endif - -#include "utidev.h" - -static bool isGPUAvailable = false; -static bool isGPUEnabled = false; -static bool GPUAvailabilityTested = false; -static bool deviceOffloadInitialized = false; - -static void CheckGPUAvailability() -{ -#ifdef _OFFLOAD_GPU - if (!GPUAvailabilityTested) - { - isGPUAvailable = false; - GPUAvailabilityTested = true; - int deviceCount = 0; - if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) - return; - - if (deviceCount < 1) - return; - - isGPUAvailable = true; - } -#else - isGPUAvailable = false; - isGPUEnabled = false; - GPUAvailabilityTested = true; -#endif -} - -bool UtiDev::GPUAvailable() -{ - CheckGPUAvailability(); - return isGPUAvailable; -} - -bool UtiDev::GPUEnabled(gpuUsageArg_t *arg) -{ -#ifdef _OFFLOAD_GPU - if (arg == NULL) - return false; - if (*arg > 0) { - //if (cudaSetDevice(*arg - 1) != cudaSuccess) return false; - return GPUAvailable(); - } -#endif - return false; -} - -void UtiDev::SetGPUStatus(bool enabled) -{ - isGPUEnabled = enabled && GPUAvailable(); -} - -int UtiDev::GetDevice(gpuUsageArg_t* arg) -{ -#ifdef _OFFLOAD_GPU - if (arg == NULL) - return cudaCpuDeviceId; - - int curDevice = 0; - cudaGetDevice(&curDevice); - return curDevice; -#else - return 0; -#endif -} - -void UtiDev::Init() { - deviceOffloadInitialized = true; -#ifdef _OFFLOAD_GPU - cudaDeviceSynchronize(); -#endif -} - -void UtiDev::Fini() { -#ifdef _OFFLOAD_GPU - cudaDeviceSynchronize(); -#endif - //deviceOffloadInitialized = false; -} \ No newline at end of file diff --git a/cpp/src/ext/utils/utidev.h 
b/cpp/src/ext/utils/utidev.h deleted file mode 100644 index 2df059fd..00000000 --- a/cpp/src/ext/utils/utidev.h +++ /dev/null @@ -1,71 +0,0 @@ -/************************************************************************//** - * File: utidev.h - * Description: GPU offloading detection and control - * Project: Synchrotron Radiation Workshop (and possibly others) - * First release: 2022 - * - * @author H. Goel - * @version 0.1 - ***************************************************************************/ - -#ifndef __UTIGPU_H -#define __UTIGPU_H - -#include -#include - -#ifdef _OFFLOAD_GPU -#include -#endif - -typedef int gpuUsageArg_t; - -#define ALLOC_ARRAY(type, size) (type *)UtiDev::malloc(sizeof(type)*(size)) -#define FREE_ARRAY(x) UtiDev::free(x); x=NULL -#define ALLOC_STRUCT(type) (type *)UtiDev::malloc(sizeof(type)) -#define FREE_STRUCT(x) UtiDev::free(x); x=NULL - -#ifdef _OFFLOAD_GPU -#define GPU_ENABLED(arg) UtiDev::GPUEnabled(arg) -#define GPU_COND(arg, code) if (GPU_ENABLED(arg)) { code } -#define GPU_PORTABLE __device__ __host__ -#else -#define GPU_COND(arg, code) if(0) { } -#define GPU_ENABLED(arg) 0 -#define GPU_PORTABLE -#endif - - //************************************************************************* -class UtiDev -{ -public: - static void Init(); - static void Fini(); - static bool GPUAvailable(); //CheckGPUAvailable etc - static bool GPUEnabled(gpuUsageArg_t *arg); - static void SetGPUStatus(bool enabled); - static int GetDevice(gpuUsageArg_t* arg); - - static inline void* malloc(size_t sz) { -#ifdef _OFFLOAD_GPU - void *ptr; - auto err = cudaMallocManaged(&ptr, sz); - if (err != cudaSuccess) - printf("Allocation Failure\r\n"); - return ptr; -#else - return std::malloc(sz); -#endif - } - - static inline void free(void* ptr) { -#ifdef _OFFLOAD_GPU - cudaFree(ptr); -#else - std::free(ptr); -#endif - } -}; - -//************************************************************************* -#endif \ No newline at end of file diff --git 
a/cpp/src/lib/auxgpu.cpp b/cpp/src/lib/auxgpu.cpp new file mode 100644 index 00000000..d65db5e0 --- /dev/null +++ b/cpp/src/lib/auxgpu.cpp @@ -0,0 +1,370 @@ +/************************************************************************//** + * File: auxgpu.cpp + * Description: Auxiliary utilities to manage GPU usage + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#include +#include +#include + +#ifdef _OFFLOAD_GPU +#include +#endif + +#include "auxgpu.h" + +static bool isGPUAvailable = false; +static bool isGPUEnabled = false; +static bool GPUAvailabilityTested = false; +static bool deviceOffloadInitialized = false; +static int deviceCount = 0; + +#ifdef _OFFLOAD_GPU +typedef struct +{ + void *devicePtr; + void *hostPtr; + size_t size; + bool HostToDevUpdated; + bool DevToHostUpdated; + cudaEvent_t h2d_event; + cudaEvent_t d2h_event; +} memAllocInfo_t; +static std::map gpuMap; +static cudaStream_t memcpy_stream; +static bool memcpy_stream_initialized = false; +static int current_device = -1; +#endif + +static void CheckGPUAvailability() +{ +#ifdef _OFFLOAD_GPU + if (!GPUAvailabilityTested) + { + isGPUAvailable = false; + GPUAvailabilityTested = true; + int deviceCount = 0; + if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) + return; + + if (deviceCount < 1) + return; + + isGPUAvailable = true; + } +#else + isGPUAvailable = false; + isGPUEnabled = false; + GPUAvailabilityTested = true; +#endif +} + +bool CAuxGPU::GPUAvailable() +{ + CheckGPUAvailability(); + return isGPUAvailable; +} + +bool CAuxGPU::GPUEnabled(TGPUUsageArg *arg) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return false; + if (arg->deviceIndex > 0) { + if (arg->deviceIndex <= deviceCount) + { + if (memcpy_stream_initialized && current_device != arg->deviceIndex) + { + 
cudaStreamDestroy(memcpy_stream); + memcpy_stream_initialized = false; + } + cudaSetDevice(arg->deviceIndex - 1); + if (!memcpy_stream_initialized) + cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking); + current_device = arg->deviceIndex; + memcpy_stream_initialized = true; + } + //TODO: Add warning that GPU isn't available + return GPUAvailable(); + } +#endif + return false; +} + +void CAuxGPU::SetGPUStatus(bool enabled) +{ + isGPUEnabled = enabled && GPUAvailable(); +} + +int CAuxGPU::GetDevice(TGPUUsageArg* arg) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return cudaCpuDeviceId; + + int curDevice = 0; + cudaGetDevice(&curDevice); + return curDevice; +#else + return 0; +#endif +} + +void* CAuxGPU::ToDevice(TGPUUsageArg* arg, void* hostPtr, size_t size, bool dontCopy) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return hostPtr; + if (arg->deviceIndex == 0) + return hostPtr; + if (hostPtr == NULL) + return hostPtr; + if (size == 0) + return hostPtr; + if (!GPUEnabled(arg)) + return hostPtr; + if (gpuMap.find(hostPtr) != gpuMap.end()){ + memAllocInfo_t info = gpuMap[hostPtr]; + void* devPtr = info.devicePtr; + hostPtr = info.hostPtr; + if (gpuMap[devPtr].HostToDevUpdated && !dontCopy){ + cudaMemcpyAsync(devPtr, hostPtr, size, cudaMemcpyHostToDevice, memcpy_stream); + cudaEventRecord(gpuMap[devPtr].h2d_event, memcpy_stream); + } +//#if _DEBUG +// printf("ToDevice: %p -> %p, %d, D2H: %d, H2D: %d\n", hostPtr, devPtr, size, gpuMap[devPtr].DevToHostUpdated, gpuMap[devPtr].HostToDevUpdated); //HG28072023 +//#endif + gpuMap[devPtr].HostToDevUpdated = false; + return devPtr; + } + + void *devicePtr = NULL; + cudaError_t err = cudaMalloc(&devicePtr, size); + if (err != cudaSuccess) // Try again after freeing up some memory HG24072023 + { + cudaStreamSynchronize(0); + err = cudaMalloc(&devicePtr, size); + } + if (err != cudaSuccess) + return NULL; +//#if _DEBUG +// printf("ToDevice: %p -> %p, %d\n", hostPtr, devicePtr, size); //HG28072023 +//#endif + 
memAllocInfo_t info; + info.devicePtr = devicePtr; + info.hostPtr = hostPtr; + info.DevToHostUpdated = false; + info.HostToDevUpdated = false; + cudaEventCreateWithFlags(&info.h2d_event, cudaEventDisableTiming); + cudaEventCreateWithFlags(&info.d2h_event, cudaEventDisableTiming); + if (!dontCopy){ + cudaMemcpyAsync(devicePtr, hostPtr, size, cudaMemcpyHostToDevice, memcpy_stream); + cudaEventRecord(info.h2d_event, memcpy_stream); + } + info.size = size; + gpuMap[hostPtr] = info; + gpuMap[devicePtr] = info; + return devicePtr; +#else + return hostPtr; +#endif +} + +void CAuxGPU::EnsureDeviceMemoryReady(TGPUUsageArg* arg, void* hostPtr) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return; + if (arg->deviceIndex == 0) + return; + if (hostPtr == NULL) + return; + if (!GPUEnabled(arg)) + return; + if (gpuMap.find(hostPtr) != gpuMap.end()){ + void* devPtr = gpuMap[hostPtr].devicePtr; + if (gpuMap[devPtr].HostToDevUpdated){ + cudaStreamWaitEvent(0, gpuMap[devPtr].h2d_event); + } +//#if _DEBUG +// printf("EnsureDeviceMemoryReady: %p -> %p, %d, D2H: %d, H2D: %d\n", hostPtr, devPtr, gpuMap[devPtr].size, gpuMap[devPtr].DevToHostUpdated, gpuMap[devPtr].HostToDevUpdated); //HG28072023 +//#endif + } +#endif +} + +void* CAuxGPU::GetHostPtr(TGPUUsageArg* arg, void* devicePtr) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return devicePtr; + if (arg->deviceIndex == 0) + return devicePtr; + if (devicePtr == NULL) + return devicePtr; + if (!GPUEnabled(arg)) + return devicePtr; + memAllocInfo_t info; + if (gpuMap.find(devicePtr) == gpuMap.end()) + return devicePtr; + info = gpuMap[devicePtr]; +//#if _DEBUG +// printf("GetHostPtr: %p -> %p\n", devicePtr, info.hostPtr); //HG28072023 +//#endif + return info.hostPtr; +#else + return devicePtr; +#endif +} + +void* CAuxGPU::ToHostAndFree(TGPUUsageArg* arg, void* devicePtr, size_t size, bool dontCopy) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return devicePtr; + if (arg->deviceIndex == 0) + return devicePtr; + if (devicePtr == NULL) + 
return devicePtr; + if (size == 0) + return devicePtr; + if (!GPUEnabled(arg)) + return devicePtr; + memAllocInfo_t info; + if (gpuMap.find(devicePtr) == gpuMap.end()) + return devicePtr; + info = gpuMap[devicePtr]; + devicePtr = info.devicePtr; + void *hostPtr = info.hostPtr; + if (!dontCopy && info.DevToHostUpdated) + { + cudaStreamWaitEvent(memcpy_stream, info.d2h_event, 0); + cudaMemcpyAsync(hostPtr, devicePtr, size, cudaMemcpyDeviceToHost, memcpy_stream); + cudaEventRecord(info.d2h_event); + cudaEventSynchronize(info.d2h_event); // we can't treat host memory as valid until the copy is complete + } +//#if _DEBUG +// printf("ToHostAndFree: %p -> %p, %d\n", devicePtr, hostPtr, size); //HG28072023 +//#endif + cudaStreamWaitEvent(0, info.h2d_event); + cudaStreamWaitEvent(0, info.d2h_event); + cudaFreeAsync(devicePtr, 0); + cudaEventDestroy(info.h2d_event); + cudaEventDestroy(info.d2h_event); + gpuMap.erase(devicePtr); + gpuMap.erase(hostPtr); + return hostPtr; +#else + return devicePtr; +#endif +} + +void CAuxGPU::FreeHost(void* ptr) +{ +#ifdef _OFFLOAD_GPU + if (ptr == NULL) + return; + if (gpuMap.find(ptr) == gpuMap.end()) + return; + memAllocInfo_t info = gpuMap[ptr]; + void *hostPtr = info.hostPtr; + void *devicePtr = info.devicePtr; +//#if _DEBUG +// printf("FreeHost: %p, %p\n", devicePtr, hostPtr); +//#endif + //cudaStreamWaitEvent(0, info.h2d_event); + //cudaStreamWaitEvent(0, info.d2h_event); + cudaFreeAsync(devicePtr, 0); + //cudaEventDestroy(info.h2d_event); + //cudaEventDestroy(info.d2h_event); + std::free(hostPtr); //OC02082023 + //CAuxGPU::free(hostPtr); + gpuMap.erase(devicePtr); + gpuMap.erase(hostPtr); +#endif + return; +} + +void CAuxGPU::MarkUpdated(TGPUUsageArg* arg, void* ptr, bool devToHost, bool hostToDev) +{ +#ifdef _OFFLOAD_GPU + if (arg == NULL) + return; + if (arg->deviceIndex == 0) + return; + if (ptr == NULL) + return; + if (!GPUEnabled(arg)) + return; + if (gpuMap.find(ptr) == gpuMap.end()) + return; + void* devPtr = 
gpuMap[ptr].devicePtr; + void* hostPtr = gpuMap[ptr].hostPtr; + gpuMap[devPtr].DevToHostUpdated = devToHost; + gpuMap[devPtr].HostToDevUpdated = hostToDev; + gpuMap[hostPtr].DevToHostUpdated = devToHost; + gpuMap[hostPtr].HostToDevUpdated = hostToDev; + if (devToHost) + cudaEventRecord(gpuMap[devPtr].d2h_event, 0); +//#if _DEBUG +// printf("MarkUpdated: %p -> %p, D2H: %d, H2D: %d\n", ptr, devPtr, devToHost, hostToDev); +//#endif +#endif +} + +void CAuxGPU::Init() { + deviceOffloadInitialized = true; +#ifdef _OFFLOAD_GPU + cudaGetDeviceCount(&deviceCount); + cudaDeviceSynchronize(); +#endif +} + +void CAuxGPU::Fini() { +#ifdef _OFFLOAD_GPU + SetGPUStatus(false); //HG30112023 Disable GPU + + // Copy back all updated data + bool updated = false; + bool freed = false; + for (std::map::const_iterator it = gpuMap.cbegin(); it != gpuMap.cend(); it++) + { + if (it->second.DevToHostUpdated){ + cudaStreamWaitEvent(memcpy_stream, it->second.d2h_event, 0); + cudaMemcpyAsync(it->second.hostPtr, it->second.devicePtr, it->second.size, cudaMemcpyDeviceToHost, memcpy_stream); +//#if _DEBUG +// printf("Fini: %p -> %p, %d\n", it->second.devicePtr, it->second.hostPtr, it->second.size); +//#endif + updated = true; + gpuMap[it->second.hostPtr].DevToHostUpdated = false; + gpuMap[it->second.devicePtr].DevToHostUpdated = false; + } + } + for (std::map::const_iterator it = gpuMap.cbegin(); it != gpuMap.cend(); it++) + { + if (it->first == it->second.devicePtr) + { + cudaStreamWaitEvent(0, it->second.h2d_event); + cudaStreamWaitEvent(0, it->second.d2h_event); + cudaFreeAsync(it->second.devicePtr, 0); + freed = true; + cudaEventDestroy(it->second.h2d_event); + cudaEventDestroy(it->second.d2h_event); + } + } + if (updated | freed) + cudaStreamSynchronize(0); + gpuMap.clear(); +//#if _DEBUG +// printf("Fini: %d\n", gpuMap.size()); +//#endif +#endif +} \ No newline at end of file diff --git a/cpp/src/lib/auxgpu.h b/cpp/src/lib/auxgpu.h new file mode 100644 index 00000000..9d64d450 --- /dev/null 
+++ b/cpp/src/lib/auxgpu.h @@ -0,0 +1,62 @@ +/************************************************************************//** + * File: auxgpu.h + * Description: Auxiliary utilities to manage GPU usage + * Project: Synchrotron Radiation Workshop + * First release: 2023 + * + * Copyright (C) Brookhaven National Laboratory + * All Rights Reserved + * + * @author H.Goel + * @version 1.0 + ***************************************************************************/ + +#ifndef __UTIGPU_H +#define __UTIGPU_H + +#include +#include + +#ifdef _OFFLOAD_GPU +#include +#include +//#if CUDART_VERSION < 11020 +//#error CUDA version too low, need at least 11.2 +//#endif +#endif + +typedef struct +{ + int deviceIndex; // -1 means no device, TODO +} TGPUUsageArg; + +#ifdef _OFFLOAD_GPU +#define GPU_COND(arg, code) if (arg && CAuxGPU::GPUEnabled((TGPUUsageArg*)arg)) { code } +//#define GPU_COND(arg, code) if (arg && CAuxGPU::GPUEnabled(arg)) { code } +#define GPU_PORTABLE __device__ __host__ +#else +#define GPU_COND(arg, code) if(0) { } +#define GPU_PORTABLE +#endif + + //************************************************************************* +class CAuxGPU +{ +private: +public: + static void Init(); + static void Fini(); + static bool GPUAvailable(); //CheckGPUAvailable etc + static bool GPUEnabled(TGPUUsageArg *arg); + static void SetGPUStatus(bool enabled); + static int GetDevice(TGPUUsageArg* arg); + static void* ToDevice(TGPUUsageArg* arg, void* hostPtr, size_t size, bool dontCopy = false); + static void* GetHostPtr(TGPUUsageArg* arg, void* devicePtr); + static void* ToHostAndFree(TGPUUsageArg* arg, void* devicePtr, size_t size, bool dontCopy = false); + static void EnsureDeviceMemoryReady(TGPUUsageArg* arg, void* devicePtr); + static void FreeHost(void* ptr); + static void MarkUpdated(TGPUUsageArg* arg, void* ptr, bool devToHost, bool hostToDev); +}; + +//************************************************************************* +#endif \ No newline at end of file diff --git 
a/cpp/src/lib/srwlib.cpp b/cpp/src/lib/srwlib.cpp index c36043c7..fac92539 100644 --- a/cpp/src/lib/srwlib.cpp +++ b/cpp/src/lib/srwlib.cpp @@ -29,6 +29,9 @@ #include "srisosrc.h" #include "srmatsta.h" +#ifdef _OFFLOAD_GPU +#include "auxgpu.h" //OC27072023 +#endif //#include //Added by S.Yakubov (for profiling?) at parallelizing SRW via OpenMP //------------------------------------------------------------------------- @@ -751,7 +754,8 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr //------------------------------------------------------------------------- -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj, void* pvGPU) //OC26072023 +//EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double* pMeth, void* pFldTrj) //OC23022020 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, double *pMeth) //OC16122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y, int *pMeth) //OC13122019 //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, char intType, char depType, double e, double x, double y) @@ -796,7 +800,8 @@ EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char polar, cha //pFldTrj = pTrjData; } - radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat); //OC23022020 + radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, 
pTrjDat, pvGPU); //HG03122023 + //radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth, pTrjDat); //OC23022020 //radGenManip.ExtractRadiation((int)polar, (int)arIntTypeConv[intType], (int)depType, wfr.Pres, e, x, y, pInt, pMeth); //OC13122019 //radGenManip.ExtractRadiation((int)polar, (int)intType, (int)depType, wfr.Pres, e, x, y, pInt); @@ -994,7 +999,8 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr) //------------------------------------------------------------------------- -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI, void* pvGPU) //OC26072023 (from HG) +//EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** arID, SRWLRadMesh* arIM, char** arI) //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt) { if((pWfr == 0) || (pOpt == 0)) return SRWL_INCORRECT_PARAM_FOR_WFR_PROP; @@ -1014,7 +1020,8 @@ EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt, char** //srwlPrintTime("srwlPropagElecField: CheckRadStructForPropagation",&start); //if(locErNo = optCont.PropagateRadiationGuided(wfr)) return locErNo; - if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI)) return locErNo; //OC15082018 + //if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI)) return locErNo; //OC15082018 + if(locErNo = optCont.PropagateRadiationGuided(wfr, nInt, arID, arIM, arI, pvGPU)) return locErNo; //OC15082018 //HG03122023 //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: //srwlPrintTime("srwlPropagElecField: PropagateRadiationGuided",&start); @@ -1047,7 +1054,8 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* //------------------------------------------------------------------------- -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pvGPU) //OC26072023 +//EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir) { if((pcData == 0) || (arMesh == 0) || ((typeData != 'f') && (typeData != 'd')) || (nMesh < 3) || (dir == 0)) return SRWL_INCORRECT_PARAM_FOR_FFT; //OC31012019 @@ -1092,7 +1100,8 @@ EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, FFT1DInfo.UseGivenStartTrValue = 0; CGenMathFFT1D FFT1D; - if(locErNo = FFT1D.Make1DFFT(FFT1DInfo)) return locErNo; + //if(locErNo = FFT1D.Make1DFFT(FFT1DInfo)) return locErNo; + if(locErNo = FFT1D.Make1DFFT(FFT1DInfo, pvGPU)) return locErNo; //HG03122023 arMesh[0] = FFT1DInfo.xStartTr; arMesh[1] = FFT1DInfo.xStepTr; @@ -1122,7 +1131,8 @@ EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, FFT2DInfo.UseGivenStartTrValues = 0; CGenMathFFT2D FFT2D; - if(locErNo = FFT2D.Make2DFFT(FFT2DInfo)) return locErNo; + //if(locErNo = FFT2D.Make2DFFT(FFT2DInfo)) return locErNo; + if(locErNo = FFT2D.Make2DFFT(FFT2DInfo, 0, 0, pvGPU)) return locErNo; //HG03122023 arMesh[0] = FFT2DInfo.xStartTr; arMesh[1] = FFT2DInfo.xStepTr; @@ -1538,6 +1548,61 @@ EXP int CALL srwlPropagRadMultiE(SRWLStokes* pStokes, SRWLWfr* pWfr0, SRWLOptC* return 0; } +//------------------------------------------------------------------------- +#ifdef _OFFLOAD_GPU //OC30102023 +EXP int CALL srwlUtiGPUProc(int op, void* pvGPU) //HG04122023 +{ + if(op == 0) CAuxGPU::Fini(); + if(op == 1) CAuxGPU::Init(); + return 0; +} + +/* HG30112023 +EXP bool CALL 
srwlUtiGPUAvailable() //OC27072023 +//EXP bool CALL srwlCAuxGPUAvailable() //HG +{ + return CAuxGPU::GPUAvailable(); //OC05092023 + //return CAuxGPU::GPUAvailable(); +} + +//------------------------------------------------------------------------- + +EXP bool CALL srwlUtiGPUEnabled() //OC27072023 +//EXP bool CALL srwlCAuxGPUEnabled() //HG +{ + return CAuxGPU::GPUEnabled(nullptr); //OC05092023 + //return CAuxGPU::GPUEnabled(nullptr); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUSetStatus(bool enable) //OC27072023 +//EXP void CALL srwlCAuxGPUSetStatus(bool enable) //HG +{ + CAuxGPU::SetGPUStatus(enable); //OC05092023 + //CAuxGPU::SetGPUStatus(enable); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUInit() //OC27072023 +//EXP void CALL srwlCAuxGPUInit() //HG +{ + CAuxGPU::Init(); //OC05092023 (why void?) + //CAuxGPU::Init(); +} + +//------------------------------------------------------------------------- + +EXP void CALL srwlUtiGPUFini() //OC27072023 +//EXP void CALL srwlCAuxGPUFini() //HG +{ + CAuxGPU::Fini(); //OC05092023 (why void?) + //CAuxGPU::Fini(); +} +*/ + +#endif //------------------------------------------------------------------------- //Added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP: /* diff --git a/cpp/src/lib/srwlib.h b/cpp/src/lib/srwlib.h index aa448d31..9b73c400 100644 --- a/cpp/src/lib/srwlib.h +++ b/cpp/src/lib/srwlib.h @@ -729,10 +729,12 @@ EXP int CALL srwlCalcPowDenSR(SRWLStokes* pStokes, SRWLPartBeam* pElBeam, SRWLPr * arMeth[18]: used for mutual intensity calculaiton / update: index of first general conjugated position to start updating the mutual intensity * arMeth[19]: used for mutual intensity calculaiton / update: index of last general conjugated position to finish updating the mutual intensity * @param [in] pFldTrj auxiliary pointer to magnetic field or trajectory of central electron + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); +EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0, void* pvGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0, void* pFldTrj=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y, double* arMeth=0); //EXP int CALL srwlCalcIntFromElecField(char* pInt, SRWLWfr* pWfr, char pol, char intType, char depType, double e, double x, double y); @@ -799,10 +801,12 @@ EXP int CALL srwlSetRepresElecField(SRWLWfr* pWfr, char repr); * "Propagates" Electric Field Wavefront through Optical Elements and free spaces * @param [in, out] pWfr pointer to pre-calculated Wavefront structure * @param [in] pOpt pointer to container of optical elements the propagation should be done through + * @param [in] 
pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 +EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0, void* pvGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt, int nInt=0, char** arID=0, SRWLRadMesh* arIM=0, char** arI=0); //OC15082018 //EXP int CALL srwlPropagElecField(SRWLWfr* pWfr, SRWLOptC* pOpt); /** TEST @@ -846,10 +850,12 @@ EXP int CALL srwlCalcTransm(SRWLOptT* pOpTr, const double* pDelta, const double* * arMesh[5]: (optional) number of points of the second argument * @param [in] nMesh length of arMesh array (3 or 6 elements) * @param [in] dir direction for the FFT (>0 means forward, <0 means backward) + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) * @return integer error (>0) or warnig (<0) code * @see ... */ -EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); +EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir, void* pvGPU=0); //OC26072023 (from HG) +//EXP int CALL srwlUtiFFT(char* pcData, char typeData, double* arMesh, int nMesh, int dir); /** * Convolves real data with 1D or 2D Gaussian (depending on arguments) @@ -964,6 +970,56 @@ EXP int CALL srwlUtiUndFromMagFldTab(SRWLMagFldC* pUndCnt, SRWLMagFldC* pMagCnt, */ EXP int CALL srwlUtiUndFindMagFldInterpInds(int* arResInds, int* pnResInds, double* arGaps, double* arPhases, int nVals, double arPrecPar[5]); +#ifdef _OFFLOAD_GPU //HG30112023 +/** + * Implements GPU related operations. 
+ * @param [in] op operation to be performed: + * 0= Deinitialize GPU + * 1= Initialize GPU + * @param [in] pvGPU optional GPU utilization related parameters (TGPUUsageArg*) + * @return integer error (>0) or warnig (<0) code + * @see ... + */ +EXP int CALL srwlUtiGPUProc(int op, void* pvGpu=0); + +/** + * Checks if GPU offloading is available + * @return true if available + * @see ... + */ +//EXP bool CALL srwlUtiGPUAvailable(); //OC26072023 +//EXP bool CALL srwlCAuxGPUAvailable(); //HG + +/** + * Checks if GPU offloading is enabled + * @return true if enabled + * @see ... + */ +//EXP bool CALL srwlUtiGPUEnabled(); //OC26072023 +//EXP bool CALL srwlCAuxGPUEnabled(); //HG + +/** + * Enable/Disable GPU offloading + * @see ... + */ +//EXP void CALL srwlUtiGPUSetStatus(bool enable); +//EXP void CALL srwlCAuxGPUSetStatus(bool enable); //HG + +/** + * Initialize device offloading + * @see ... + */ +//EXP void CALL srwlUtiGPUInit(); //OC26072023 +//EXP void CALL srwlCAuxGPUInit(); //HG + +/** + * Finalize device offloading + * @see ... + */ +//EXP void CALL srwlUtiGPUFini(); //OC26072023 +//EXP void CALL srwlCAuxGPUFini(); //HG +#endif + /** * These functions were added by S.Yakubov (for profiling?) 
at parallelizing SRW via OpenMP EXP void CALL srwlPrintTime(const char* str, double* start); diff --git a/cpp/vc/SRW.sln b/cpp/vc/SRW.sln index d62533af..57eb7848 100644 --- a/cpp/vc/SRW.sln +++ b/cpp/vc/SRW.sln @@ -1,14 +1,14 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 -VisualStudioVersion = 17.0.31912.275 +VisualStudioVersion = 17.4.33110.190 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLIB", "SRWLIB.vcxproj", "{A7E707A6-D325-42AE-A0D0-3C97C38D36A6}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientPython", "SRWLClientPython.vcxproj", "{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}" ProjectSection(ProjectDependencies) = postProject {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} = {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} EndProjectSection EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLIB", "SRWLIB.vcxproj", "{A7E707A6-D325-42AE-A0D0-3C97C38D36A6}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientIgor", "SRWLClientIgor.vcxproj", "{0D473386-2B3E-4586-8516-DD4DCF6D4E1E}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SRWLClientC", "SRWLClientC.vcxproj", "{AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}" @@ -23,32 +23,32 @@ Global Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.ActiveCfg = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.Build.0 = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.ActiveCfg = Debug_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.Build.0 = Debug_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.ActiveCfg = 
Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.Build.0 = Debug_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.ActiveCfg = Release_Py3_9|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.Build.0 = Release_Py3_9|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.ActiveCfg = Release_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.Build.0 = Release_Py2x|Win32 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.ActiveCfg = Release_Py3_9|x64 + {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.Build.0 = Release_Py3_9|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.ActiveCfg = Debug_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Mixed Platforms.Build.0 = Debug_cuda|x64 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Win32.ActiveCfg = Debug|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|Win32.Build.0 = Debug|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.ActiveCfg = Debug|x64 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.Build.0 = Debug|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.ActiveCfg = Debug_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Debug|x64.Build.0 = Debug_cuda|x64 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Mixed Platforms.ActiveCfg = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Mixed Platforms.Build.0 = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Win32.ActiveCfg = Release|Win32 {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|Win32.Build.0 = Release|Win32 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.ActiveCfg = Release|x64 - {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.Build.0 = Release|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.ActiveCfg = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Mixed Platforms.Build.0 = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.ActiveCfg = Debug_Py3_3|Win32 - 
{B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|Win32.Build.0 = Debug_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.ActiveCfg = Debug_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Debug|x64.Build.0 = Debug_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.ActiveCfg = Release_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Mixed Platforms.Build.0 = Release_Py3_3|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.ActiveCfg = Release_Py2x|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|Win32.Build.0 = Release_Py2x|Win32 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.ActiveCfg = Release_Py3_11|x64 - {B04ABD04-7AC6-4516-B8A7-E2CBC18B4333}.Release|x64.Build.0 = Release_Py3_11|x64 - {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.ActiveCfg = Release_cuda|x64 + {A7E707A6-D325-42AE-A0D0-3C97C38D36A6}.Release|x64.Build.0 = Release_cuda|x64 + {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 + {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Mixed Platforms.Build.0 = Debug|x64 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Win32.ActiveCfg = Debug|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|Win32.Build.0 = Debug|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Debug|x64.ActiveCfg = Debug|x64 @@ -59,8 +59,8 @@ Global {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|Win32.Build.0 = Release|Win32 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|x64.ActiveCfg = Release|x64 {0D473386-2B3E-4586-8516-DD4DCF6D4E1E}.Release|x64.Build.0 = Release|x64 - {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 - {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 + 
{AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Mixed Platforms.Build.0 = Debug|x64 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Win32.ActiveCfg = Debug|Win32 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|Win32.Build.0 = Debug|Win32 {AA6DEE02-E699-4AF6-A8FC-655906C9CDC9}.Debug|x64.ActiveCfg = Debug|x64 diff --git a/cpp/vc/SRWLClientPython.vcxproj b/cpp/vc/SRWLClientPython.vcxproj index 88a2cc49..ed3d9959 100644 --- a/cpp/vc/SRWLClientPython.vcxproj +++ b/cpp/vc/SRWLClientPython.vcxproj @@ -9,6 +9,14 @@ Debug_Py2x x64 + + Debug_Py3_11_cuda + Win32 + + + Debug_Py3_11_cuda + x64 + Debug_Py3_11 Win32 @@ -73,6 +81,14 @@ Release_Py2x x64 + + Release_Py3_11_cuda + Win32 + + + Release_Py3_11_cuda + x64 + Release_Py3_11 Win32 @@ -190,6 +206,12 @@ Unicode true + + DynamicLibrary + v143 + Unicode + true + DynamicLibrary v143 @@ -220,6 +242,11 @@ v143 Unicode + + DynamicLibrary + v143 + Unicode + DynamicLibrary v143 @@ -283,6 +310,13 @@ true false + + DynamicLibrary + v143 + Unicode + true + false + DynamicLibrary v143 @@ -317,6 +351,12 @@ Unicode false + + DynamicLibrary + v143 + Unicode + false + DynamicLibrary v143 @@ -353,6 +393,9 @@ + + + @@ -371,6 +414,9 @@ + + + @@ -401,6 +447,9 @@ + + + @@ -419,6 +468,9 @@ + + + @@ -456,6 +508,11 @@ $(Platform)\$(Configuration)\ true + + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ @@ -503,6 +560,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + true + srwlpy + .pyd + $(ProjectDir) $(Platform)\$(Configuration)\ @@ -559,6 +623,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + false + srwlpy + .pyd + $(ProjectDir) $(Platform)\$(Configuration)\ @@ -608,6 +679,13 @@ srwlpy .pyd + + $(ProjectDir) + $(Platform)\$(Configuration)\ + false + srwlpy + .pyd + $(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ @@ -648,7 +726,7 @@ 
..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -674,7 +752,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -701,7 +779,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -728,7 +806,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -755,7 +833,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -782,7 +860,34 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + MachineX86 + false + + + + + + + + + Disabled + ..\src\lib;..\..\..\Python33\include;%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_WINDOWS;_USRDLL;SRWLIB_CLIENT;SRWLCLIENTPYTHON_EXPORTS;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + + + ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) 
true Windows @@ -809,7 +914,7 @@ ..\..\..\Python33\libs\python33.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -854,7 +959,7 @@ Default - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -874,7 +979,7 @@ ..\..\..\Python35_x64\libs\python35.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -902,7 +1007,7 @@ ..\..\..\Python36_x64\libs\python36.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -930,7 +1035,7 @@ ..\..\..\Python38_x64\libs\python38.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -958,7 +1063,7 @@ ..\..\..\Python39_x64\libs\python39.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -997,6 +1102,34 @@ + + + X64 + + + Disabled + ..\src\lib;..\..\..\Python311_x64\include;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + WIN32;_OFFLOAD_GPU;_DEBUG;_WINDOWS;_USRDLL;SRWLIB_CLIENT;SRWLCLIENTPYTHON_EXPORTS;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + NotUsing + Level3 + ProgramDatabase + + + 
..\..\..\Python311_x64\libs\python311.lib;$(CUDA_PATH)\lib\x64\cudart_static.lib;$(CUDA_PATH)\lib\x64\cudadevrt.lib;srw_x64.lib;%(AdditionalDependencies) + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd + LIBCMT;%(IgnoreSpecificDefaultLibraries) + true + Windows + MachineX64 + + + + + + X64 @@ -1014,7 +1147,7 @@ ..\..\..\Python37_x64\libs\python37.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1052,7 +1185,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1083,7 +1216,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1114,7 +1247,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1145,7 +1278,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1176,7 +1309,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1207,7 +1340,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1238,7 +1371,38 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + + + + MaxSpeed + false + ..\src\lib;..\..\..\Python36\include;%(AdditionalIncludeDirectories) + 
WIN32;NDEBUG;_WINDOWS;_USRDLL;SRWLCLIENTPYTHON_EXPORTS;SRWLIB_CLIENT;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + false + Default + true + + + ..\..\..\Python36\libs\python36.lib;srw_win32.lib;%(AdditionalDependencies) + srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + true + true + MachineX86 + false + + + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1275,7 +1439,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1313,7 +1477,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1351,7 +1515,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1389,7 +1553,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1427,7 +1591,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\clients\python\srwpy\" @@ -1465,7 +1629,7 @@ srwlpy.pgd - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1506,6 +1670,44 @@ copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + + X64 + + + MaxSpeed + false + ..\src\lib;..\..\..\Python311_x64\include;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + WIN32;_OFFLOAD_GPU;NDEBUG;_WINDOWS;_USRDLL;SRWLCLIENTPYTHON_EXPORTS;SRWLIB_CLIENT;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + MultiThreaded + false + + + Level3 + None + Speed + 
OnlyExplicitInline + true + false + true + Precise + + + ..\..\..\Python311_x64\libs\python311.lib;$(CUDA_PATH)\lib\x64\cudart_static.lib;$(CUDA_PATH)\lib\x64\cudadevrt.lib;srw_x64.lib;%(AdditionalDependencies) + srwlpy.pyd + LIBC;%(IgnoreSpecificDefaultLibraries) + true + Windows + true + true + MachineX64 + Default + srwlpy.pgd + + + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" + + Disabled @@ -1520,7 +1722,7 @@ ..\..\..\Python32\libs\python32.lib;srw_win32.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBC;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1548,7 +1750,7 @@ ..\..\..\Python27_x64\libs\python27.lib;srw_x64.lib;%(AdditionalDependencies) - $(SolutionDir)..\..\env\work\srw_python\srwlpy.pyd + $(SolutionDir)..\..\env\python\srwpy\srwlpy.pyd LIBCMT;%(IgnoreSpecificDefaultLibraries) true Windows @@ -1582,7 +1784,7 @@ false - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" @@ -1616,7 +1818,7 @@ MachineX64 - copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\work\srw_python\" + copy $(SolutionDir)srwlpy.pyd "$(SolutionDir)..\..\env\python\srwpy\" diff --git a/cpp/vc/SRWLClientPython.vcxproj.user b/cpp/vc/SRWLClientPython.vcxproj.user index 0e4ca8e2..c029e307 100644 --- a/cpp/vc/SRWLClientPython.vcxproj.user +++ b/cpp/vc/SRWLClientPython.vcxproj.user @@ -36,6 +36,12 @@ ..\..\env\work\srw_python WindowsLocalDebugger + + C:\SoftwareDevelopments\Python39_x64\python.exe + ELETTRA-CDI-Source-Test-Tandem-350-eV.py + ..\..\env\work\srw_python + WindowsLocalDebugger + ..\..\Python37_x64\python.exe SRWLIB_Example04_test_mi4d_resize_mesh.py @@ -84,6 +90,12 @@ split-delay-test-vcc.py ..\..\env\work\srw_python + + C:\SoftwareDevelopments\Python38_x64\python.exe + WindowsLocalDebugger + split-delay-test-vcc.py + ..\..\env\work\srw_python + 
C:\SoftwareDevelopments\Python27_x64\python.exe WindowsLocalDebugger @@ -114,6 +126,12 @@ ..\..\env\work\srw_python WindowsLocalDebugger + + python + test_hdf5_convert.py + ..\..\env\work\srw_python + WindowsLocalDebugger + python smf-preliminary-03-an-2d-test-01.py diff --git a/cpp/vc/SRWLIB.vcxproj b/cpp/vc/SRWLIB.vcxproj index d0a4e611..e2215a8e 100644 --- a/cpp/vc/SRWLIB.vcxproj +++ b/cpp/vc/SRWLIB.vcxproj @@ -33,6 +33,14 @@ Debug_fftw2 x64 + + Release_cuda + Win32 + + + Release_cuda + x64 + Release_omph Win32 @@ -70,7 +78,6 @@ {A7E707A6-D325-42AE-A0D0-3C97C38D36A6} SRWLIB 10.0 - $(CUDA_PATH) @@ -85,6 +92,12 @@ false MultiByte + + StaticLibrary + v143 + false + MultiByte + StaticLibrary v143 @@ -134,6 +147,13 @@ MultiByte false + + StaticLibrary + v143 + false + MultiByte + false + StaticLibrary v143 @@ -175,7 +195,7 @@ - + @@ -185,6 +205,10 @@ + + + + @@ -217,6 +241,10 @@ + + + + @@ -295,6 +323,11 @@ $(Platform)\$(Configuration)\ srw_win32 + + $(SolutionDir) + $(Platform)\$(Configuration)\ + srw_win32 + $(SolutionDir) $(Platform)\$(Configuration)\ @@ -317,6 +350,12 @@ true srw_x64 + + $(SolutionDir) + $(Platform)\$(Configuration)\ + true + srw_x64 + $(SolutionDir) $(Platform)\$(Configuration)\ @@ -579,7 +618,7 @@ Disabled - ..\src\core;..\src\lib;..\src\ext\genmath;..\src\ext\auxparse;%(AdditionalIncludeDirectories) + ..\src\core;..\src\lib;..\src\ext\genmath;..\src\ext\auxparse;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) _DEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;_FFTW3;_OFFLOAD_GPU;%(PreprocessorDefinitions) EnableFastChecks MultiThreadedDebug @@ -602,13 +641,19 @@ 0x0809 - ..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;%(AdditionalDependencies) + ..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;cudart_static.lib;cufft.lib;cudadevrt.lib;%(AdditionalDependencies) srw_x64.lib + $(CUDA_PATH)\lib\x64 + + 64 + compute_60,sm_60 + _OFFLOAD_GPU;_USE_CUDA; + @@ -729,6 +774,46 @@ 
copy $(TargetPath) "$(SolutionDir)..\..\env\work\srw_python\lib\" + + + NDEBUG;%(PreprocessorDefinitions) + true + true + Win32 + .\Release/SRWLIB.tlb + + + OnlyExplicitInline + Neither + ..\src\lib;..\src\core;..\src\ext\genmath;..\src\ext\auxparse;%(AdditionalIncludeDirectories) + NDEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;%(PreprocessorDefinitions) + true + MultiThreaded + true + true + + + + + $(IntDir) + $(IntDir)vc90.pdb + Level2 + true + Default + Default + + + NDEBUG;%(PreprocessorDefinitions) + 0x0809 + + + ..\..\ext_lib\fftw_f.lib;%(AdditionalDependencies) + srw_win32.lib + + + copy $(TargetPath) "$(SolutionDir)..\..\env\work\srw_python\lib\" + + NDEBUG;%(PreprocessorDefinitions) @@ -901,6 +986,60 @@ + + + NDEBUG;%(PreprocessorDefinitions) + true + true + X64 + .\Release/SRWLIB.tlb + + + OnlyExplicitInline + Speed + ..\src\lib;..\src\core;..\src\ext\genmath;..\src\ext\auxparse;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + NDEBUG;WIN32;_WINDOWS;_USRDLL;__VC__;SRWLIB_STATIC;_GM_WITHOUT_BASE;_CRT_SECURE_NO_WARNINGS;NON_UNIX_STDIO;_FFTW3;_OFFLOAD_GPU;%(PreprocessorDefinitions) + true + MultiThreaded + false + true + + + + + $(IntDir) + $(IntDir)vc90.pdb + Level3 + true + + + Default + true + MaxSpeed + true + Precise + + + NDEBUG;%(PreprocessorDefinitions) + 0x0809 + + + ..\..\ext_lib\fftw3_64.lib;..\..\ext_lib\fftw3f_64.lib;cudart_static.lib;cufft.lib;cudadevrt.lib;%(AdditionalDependencies) + srw_x64.lib + + + $(CUDA_PATH)\lib\x64 + + + + + + + 64 + compute_60,sm_60 + _OFFLOAD_GPU;_USE_CUDA + + NDEBUG;%(PreprocessorDefinitions) @@ -1045,6 +1184,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1097,36 +1261,13 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + + + @@ -1196,13 +1337,21 @@ - + + + + + + + + + - + \ No newline at end of file diff --git a/cpp/vc/SRWLIB.vcxproj.filters b/cpp/vc/SRWLIB.vcxproj.filters index c77ca1a2..6c69e3cc 100644 --- 
a/cpp/vc/SRWLIB.vcxproj.filters +++ b/cpp/vc/SRWLIB.vcxproj.filters @@ -393,8 +393,8 @@ f2c - - core + + lib @@ -614,8 +614,34 @@ lib - + + lib + + core + + core + + + + + core + + + core + + + core + + + core + + + core + + + core + \ No newline at end of file