From a9ebb862ebe8e457825e2c2646b7021be687b6ec Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 21 Apr 2017 12:05:00 +0800 Subject: [PATCH 001/189] fix build --- guetzli.vcxproj | 55 +++++++++++- guetzli.vcxproj.filters | 157 ++++++++++++++++++++++++++++++++- guetzli.vcxproj.user | 8 ++ guetzli/guetzli.cc | 4 +- guetzli_static.vcxproj | 55 +++++++++++- guetzli_static.vcxproj.filters | 157 ++++++++++++++++++++++++++++++++- 6 files changed, 429 insertions(+), 7 deletions(-) create mode 100644 guetzli.vcxproj.user diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 5b7ffeb9..dd49fa15 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -116,7 +116,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) Full true true @@ -150,7 +150,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) EditAndContinue Disabled @@ -188,6 +188,20 @@ + + + + + + + + + + + + + + @@ -211,6 +225,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index da2297c5..be2fe5a3 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -13,6 +13,12 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} + + {40be58d6-6dfc-45a3-8ca1-7d1b14051ddc} + + + {cb89c1ac-8399-4814-88f2-4b69576bc9f9} + @@ -93,6 +99,48 @@ third_party\butteraugli\butteraugli + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + @@ -158,5 +206,112 @@ third_party\butteraugli\butteraugli + + third_party\libpng + + 
+ third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + \ No newline at end of file diff --git a/guetzli.vcxproj.user b/guetzli.vcxproj.user new file mode 100644 index 00000000..da467b73 --- /dev/null +++ b/guetzli.vcxproj.user @@ -0,0 +1,8 @@ + + + + test.jpg out.jpg + $(OutDir) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index d4e282b8..85cd4bb7 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -164,7 +164,9 @@ std::string ReadFileOrDie(const char* filename) { off_t buffer_size = 8192; if (fseek(f, 0, SEEK_END) == 0) { - buffer_size = std::max(ftell(f), 1); +// buffer_size = std::max(ftell(f), 1); + long size = ftell(f); + buffer_size = size > 0 ? 
size : 1; if (fseek(f, 0, SEEK_SET) != 0) { perror("fseek"); exit(1); diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj index 02e6b436..5d9dd9cd 100644 --- a/guetzli_static.vcxproj +++ b/guetzli_static.vcxproj @@ -110,7 +110,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) Full true true @@ -140,7 +140,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) EditAndContinue Disabled @@ -176,6 +176,20 @@ + + + + + + + + + + + + + + @@ -198,6 +212,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters index ec134ccc..9362cd94 100644 --- a/guetzli_static.vcxproj.filters +++ b/guetzli_static.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -13,6 +13,12 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} + + {61f0e3eb-c213-49c5-883a-060bdaf927bb} + + + {ba7b6163-a7d1-4f14-b4b3-3d35f296563a} + @@ -93,6 +99,48 @@ third_party\butteraugli\butteraugli + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + @@ -155,5 +203,112 @@ third_party\butteraugli\butteraugli + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib 
+ + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + \ No newline at end of file From 9ff693c27f04484f8a55a94b37d27e422edf0f86 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 25 Apr 2017 12:53:07 +0800 Subject: [PATCH 002/189] add sample picture --- guetzli.vcxproj.user | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 guetzli.vcxproj.user diff --git a/guetzli.vcxproj.user b/guetzli.vcxproj.user deleted file mode 100644 index da467b73..00000000 --- a/guetzli.vcxproj.user +++ /dev/null @@ -1,8 +0,0 @@ - - - - test.jpg out.jpg - $(OutDir) - WindowsLocalDebugger - - \ No newline at end of file From fb1032b3edbc9b3c5213e80a4084e3f113b4bd61 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 26 Apr 2017 10:55:14 +0800 Subject: [PATCH 003/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- guetzli.vcxproj | 151 +++++++--- guetzli.vcxproj.filters | 258 +++++++++++++++++- .../butteraugli/butteraugli/butteraugli.cc | 38 ++- 3 files changed, 398 insertions(+), 49 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index dd49fa15..05a625ec 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -113,22 +113,24 @@ - - NotUsing - Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) - Full - true - true - false - true - - - Console + + NotUsing + Level3 + .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + Full + true + true + false + true + PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) + + 
+ Console true true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + __tcmalloc @@ -147,21 +149,23 @@ - - NotUsing - Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) - EditAndContinue - Disabled - - - Console - true - shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup - - - + + NotUsing + Level3 + .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + EditAndContinue + Disabled + PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) + + + Console + true + shlwapi.lib;%(AdditionalDependencies) + mainCRTStartup + __tcmalloc + + + @@ -185,9 +189,58 @@ - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -222,9 +275,41 @@ - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index be2fe5a3..b35df618 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -19,9 +19,12 @@ {cb89c1ac-8399-4814-88f2-4b69576bc9f9} - - - + + {f2b475de-6219-478e-9e5e-08f07ef25dbc} + + + + guetzli @@ -141,9 +144,156 @@ third_party\zlib - - - + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + 
third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + + guetzli @@ -296,6 +446,102 @@ third_party\zlib + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + 
third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 7bfae8b1..39af122a 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1304,6 +1304,9 @@ double MaskDcB(double delta) { // square_size square with coordinates // x - offset .. x + square_size - offset - 1, // y - offset .. y + square_size - offset - 1. + +// In practice square_size is always 4 and offset is 0, so this could be specialized with SIMD + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { @@ -1311,26 +1314,41 @@ void MinSquareVal(size_t square_size, size_t offset, // offset is not negative and smaller than square_size. assert(offset < square_size); std::vector tmp(xsize * ysize); + for (size_t y = 0; y < ysize; ++y) { const size_t minh = offset > y ? 0 : y - offset; const size_t maxh = std::min(ysize, y + square_size - offset); + + float *pTmpPoint = &tmp[y * xsize]; + float *pValuePoint = &values[minh * xsize]; + for (size_t x = 0; x < xsize; ++x) { - double min = values[x + minh * xsize]; - for (size_t j = minh + 1; j < maxh; ++j) { - min = fmin(min, values[x + j * xsize]); - } - tmp[x + y * xsize] = static_cast(min); + float *pValues = pValuePoint++; + float min = *pValues; + + for (size_t j = minh + 1; j < maxh; ++j) { + pValues += xsize; + if (*pValues < min) min = *pValues; + } + *pTmpPoint++ = min; } } for (size_t x = 0; x < xsize; ++x) { const size_t minw = offset > x ? 
0 : x - offset; const size_t maxw = std::min(xsize, x + square_size - offset); + + float *pValuePoint = &values[x]; + float *pTmpPoint = &tmp[minw]; + for (size_t y = 0; y < ysize; ++y) { - double min = tmp[minw + y * xsize]; - for (size_t j = minw + 1; j < maxw; ++j) { - min = fmin(min, tmp[j + y * xsize]); - } - values[x + y * xsize] = static_cast(min); + float * pTmp = pTmpPoint; pTmpPoint += xsize; + float min = *pTmp; + + for (size_t j = minw + 1; j < maxw; ++j) { + pTmp++; + if (*pTmp < min) min = *pTmp; + } + *pValuePoint = min; pValuePoint += xsize; } } } From e89cdcf362529b42c228f4ae4239151d5fa86a72 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 26 Apr 2017 17:40:31 +0800 Subject: [PATCH 004/189] float is enough --- third_party/butteraugli/butteraugli/butteraugli.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 39af122a..8871bcdb 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -64,25 +64,25 @@ static void Convolution(size_t xsize, size_t ysize, size_t len, size_t offset, const float* __restrict__ multipliers, const float* __restrict__ inp, - double border_ratio, + float border_ratio, float* __restrict__ result) { PROFILER_FUNC; - double weight_no_border = 0; + float weight_no_border = 0; for (size_t j = 0; j <= 2 * offset; ++j) { weight_no_border += multipliers[j]; } for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { int minx = x < offset ? 0 : x - offset; int maxx = std::min(xsize, x + len - offset) - 1; - double weight = 0.0; + float weight = 0.0; for (int j = minx; j <= maxx; ++j) { weight += multipliers[j - x + offset]; } // Interpolate linearly between the no-border scaling and border scaling. 
weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - double scale = 1.0 / weight; + float scale = 1.0 / weight; for (size_t y = 0; y < ysize; ++y) { - double sum = 0.0; + float sum = 0.0; for (int j = minx; j <= maxx; ++j) { sum += inp[y * xsize + j] * multipliers[j - x + offset]; } @@ -739,6 +739,7 @@ const double *GetOpsinAbsorbance() { return &kMix[0]; } +// mix is a [4x4] matrix that is multiplied with in[,,,1] void OpsinAbsorbance(const double in[3], double out[3]) { const double *mix = GetOpsinAbsorbance(); out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3]; From fe645a92b4e2f393a03a64914dae86718ac3af4e Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 27 Apr 2017 17:23:23 +0800 Subject: [PATCH 005/189] Add OpenCL Support --- clguetzli/clguetzli.cl | 25 + clguetzli/clguetzli.cpp | 76 +++ clguetzli/clguetzli.h | 5 + clguetzli/ocl.cpp | 470 ++++++++++++++++++ clguetzli/ocl.h | 64 +++ clguetzli/utils.cpp | 96 ++++ clguetzli/utils.h | 36 ++ guetzli.vcxproj | 33 +- guetzli.vcxproj.filters | 26 +- guetzli_static.vcxproj | 4 +- .../butteraugli/butteraugli/butteraugli.cc | 6 + 11 files changed, 826 insertions(+), 15 deletions(-) create mode 100644 clguetzli/clguetzli.cl create mode 100644 clguetzli/clguetzli.cpp create mode 100644 clguetzli/clguetzli.h create mode 100644 clguetzli/ocl.cpp create mode 100644 clguetzli/ocl.h create mode 100644 clguetzli/utils.cpp create mode 100644 clguetzli/utils.h diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl new file mode 100644 index 00000000..d71249b3 --- /dev/null +++ b/clguetzli/clguetzli.cl @@ -0,0 +1,25 @@ +__kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int width = get_global_size(0); + const int height = get_global_size(1); + + int minH = offset > y ? 0 : y - offset; + int maxH = y + square_size - offset > height ? 
height : y + square_size - offset; /* clamp the exclusive bottom edge of the window to the image height */ + + int minW = offset > x ? 0 : x - offset; + int maxW = x + square_size - offset > width ? width : x + square_size - offset; /* clamp the exclusive right edge of the window to the image width */ + + float minValue = pA[minH * width + minW]; + + for (int j = minH; j < maxH; j++) + { + for (int i = minW; i < maxW; i++) + { + float tmp = pA[j * width + i]; + if (tmp < minValue) minValue = tmp; + } + } + pC[y * width + x] = minValue; +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp new file mode 100644 index 00000000..377a468b --- /dev/null +++ b/clguetzli/clguetzli.cpp @@ -0,0 +1,76 @@ +#include "clguetzli.h" +#include "ocl.h" + +void clMinSquareVal(size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values) +{ + cl_int err = CL_SUCCESS; + + ocl_args_d_t ocl; + SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); + + cl_uint optimizedSize = ((sizeof(cl_float) * xsize * ysize - 1) / 64 + 1) * 64; + cl_float* inputA = (cl_float*)_aligned_malloc(optimizedSize, 4096); + cl_float* outputC = (cl_float*)_aligned_malloc(optimizedSize, 4096); + + memcpy(inputA, values, sizeof(cl_float) * xsize * ysize); + + ocl.srcA = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * xsize * ysize, inputA, &err); + ocl.dstMem = clCreateBuffer(ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * xsize * ysize, outputC, &err); + + char* source = nullptr; + size_t src_size = 0; + ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size); + + ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); + + delete[] source; + + err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } + ocl.kernel = clCreateKernel(ocl.program, "MinSquareVal", &err); + if (CL_SUCCESS != err) + { + LogError("Error: clCreateKernel() returned %s.\n", 
TranslateOpenCLError(err)); + } + + cl_int cloffset = offset; + cl_int clsquare_size = square_size; + + clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&clsquare_size); /* kernel parameter order is (pA, pC, square_size, offset) */ + clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&cloffset); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clEnqueueNDRangeKernel() returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clFinish() returned %s.\n", TranslateOpenCLError(err)); + } + + cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err); + if (CL_SUCCESS != err) + { + LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err)); + } + + memcpy(values, resultPtr, sizeof(cl_float) * xsize * ysize); + + _aligned_free(inputA); + _aligned_free(outputC); +} \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h new file mode 100644 index 00000000..a6cf8242 --- /dev/null +++ b/clguetzli/clguetzli.h @@ -0,0 +1,5 @@ +#pragma once + +void clMinSquareVal(size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values); diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp new file mode 100644 index 00000000..077d3464 --- /dev/null +++ b/clguetzli/ocl.cpp @@ -0,0 +1,470 @@ +#include "ocl.h" +#include + + +ocl_args_d_t::ocl_args_d_t() : + context(NULL), + device(NULL), + commandQueue(NULL), + program(NULL), + kernel(NULL), + 
platformVersion(OPENCL_VERSION_1_2), + deviceVersion(OPENCL_VERSION_1_2), + compilerVersion(OPENCL_VERSION_1_2), + srcA(NULL), + srcB(NULL), + dstMem(NULL) +{ +} + +/* +* destructor - called only once +* Release all OpenCL objects +* This is a regular sequence of calls to deallocate all created OpenCL resources in bootstrapOpenCL. +* +* You may want to call these deallocation procedures in the middle of your application execution +* (not at the end) if you don't further need OpenCL runtime. +* You may want to do that in order to free some memory, for example, +* or recreate OpenCL objects with different parameters. +* +*/ +ocl_args_d_t::~ocl_args_d_t() +{ + cl_int err = CL_SUCCESS; + + if (kernel) + { + err = clReleaseKernel(kernel); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (program) + { + err = clReleaseProgram(program); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (srcA) + { + err = clReleaseMemObject(srcA); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (srcB) + { + err = clReleaseMemObject(srcB); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (dstMem) + { + err = clReleaseMemObject(dstMem); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (commandQueue) + { + err = clReleaseCommandQueue(commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (device) + { + err = clReleaseDevice(device); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseDevice returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (context) + { + err = clReleaseContext(context); + if (CL_SUCCESS != err) 
+ { + LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err)); + } + } + + /* + * Note there is no procedure to deallocate platform + * because it was not created at the startup, + * but just queried from OpenCL runtime. + */ +} + +const char* TranslateOpenCLError(cl_int errorCode) +{ + switch (errorCode) + { + case CL_SUCCESS: return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; //-13 + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; //-14 + case CL_COMPILE_PROGRAM_FAILURE: return "CL_COMPILE_PROGRAM_FAILURE"; //-15 + case CL_LINKER_NOT_AVAILABLE: return "CL_LINKER_NOT_AVAILABLE"; //-16 + case CL_LINK_PROGRAM_FAILURE: return "CL_LINK_PROGRAM_FAILURE"; //-17 + case CL_DEVICE_PARTITION_FAILED: return "CL_DEVICE_PARTITION_FAILED"; //-18 + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; //-19 + case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; + 
case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; //-63 + case CL_INVALID_PROPERTY: return "CL_INVALID_PROPERTY"; 
//-64 + case CL_INVALID_IMAGE_DESCRIPTOR: return "CL_INVALID_IMAGE_DESCRIPTOR"; //-65 + case CL_INVALID_COMPILER_OPTIONS: return "CL_INVALID_COMPILER_OPTIONS"; //-66 + case CL_INVALID_LINKER_OPTIONS: return "CL_INVALID_LINKER_OPTIONS"; //-67 + case CL_INVALID_DEVICE_PARTITION_COUNT: return "CL_INVALID_DEVICE_PARTITION_COUNT"; //-68 + // case CL_INVALID_PIPE_SIZE: return "CL_INVALID_PIPE_SIZE"; //-69 + // case CL_INVALID_DEVICE_QUEUE: return "CL_INVALID_DEVICE_QUEUE"; //-70 + + default: + return "UNKNOWN ERROR CODE"; + } +} + + +/* +* Check whether an OpenCL platform is the required platform +* (based on the platform's name) +*/ +bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform) +{ + size_t stringLength = 0; + cl_int err = CL_SUCCESS; + bool match = false; + + // In order to read the platform's name, we first read the platform's name string length (param_value is NULL). + // The value returned in stringLength + err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err)); + return false; + } + + // Now, that we know the platform's name string length, we can allocate enough space before read it + std::vector platformName(stringLength); + + // Read the platform's name string + // The read value returned in platformName + err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err)); + return false; + } + + // Now check if the platform's name is the required one + if (strstr(&platformName[0], preferredPlatform) != 0) + { + // The checked platform is the one we're looking for + match = true; + } + + return match; +} + +/* +* Find and return the preferred OpenCL platform +* In case that preferredPlatform is NULL, 
the ID of the first discovered platform will be returned +*/ +cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType) +{ + cl_uint numPlatforms = 0; + cl_int err = CL_SUCCESS; + + // Get (in numPlatforms) the number of OpenCL platforms available + // No platform ID will be return, since platforms is NULL + err = clGetPlatformIDs(0, NULL, &numPlatforms); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(err)); + return NULL; + } + LogInfo("Number of available platforms: %u\n", numPlatforms); + + if (0 == numPlatforms) + { + LogError("Error: No platforms found!\n"); + return NULL; + } + + std::vector platforms(numPlatforms); + + // Now, obtains a list of numPlatforms OpenCL platforms available + // The list of platforms available will be returned in platforms + err = clGetPlatformIDs(numPlatforms, &platforms[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(err)); + return NULL; + } + + // Check if one of the available platform matches the preferred requirements + for (cl_uint i = 0; i < numPlatforms; i++) + { + bool match = true; + cl_uint numDevices = 0; + + // If the preferredPlatform is not NULL then check if platforms[i] is the required one + // Otherwise, continue the check with platforms[i] + if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0)) + { + // In case we're looking for a specific platform + match = CheckPreferredPlatformMatch(platforms[i], preferredPlatform); + } + + // match is true if the platform's name is the required one or don't care (NULL) + if (match) + { + // Obtains the number of deviceType devices available on platform + // When the function failed we expect numDevices to be zero. + // We ignore the function return value since a non-zero error code + // could happen if this platform doesn't support the specified device type. 
+ err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices); + if (CL_SUCCESS != err) + { + LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(err)); + } + + if (0 != numDevices) + { + // There is at list one device that answer the requirements + return platforms[i]; + } + } + } + + return NULL; +} + + +/* +* This function read the OpenCL platdorm and device versions +* (using clGetxxxInfo API) and stores it in the ocl structure. +* Later it will enable us to support both OpenCL 1.2 and 2.0 platforms and devices +* in the same program. +*/ +int GetPlatformAndDeviceVersion(cl_platform_id platformId, ocl_args_d_t *ocl) +{ + cl_int err = CL_SUCCESS; + + // Read the platform's version string length (param_value is NULL). + // The value returned in stringLength + size_t stringLength = 0; + err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the platform's version string length, we can allocate enough space before read it + std::vector platformVersion(stringLength); + + // Read the platform's version string + // The read value returned in platformVersion + err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL) + { + ocl->platformVersion = OPENCL_VERSION_2_0; + } + + // Read the device's version string length (param_value is NULL). 
+ err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the device's version string length, we can allocate enough space before read it + std::vector deviceVersion(stringLength); + + // Read the device's version string + // The read value returned in deviceVersion + err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL) + { + ocl->deviceVersion = OPENCL_VERSION_2_0; + } + + // Read the device's OpenCL C version string length (param_value is NULL). + err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the device's OpenCL C version string length, we can allocate enough space before read it + std::vector compilerVersion(stringLength); + + // Read the device's OpenCL C version string + // The read value returned in compilerVersion + err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + else if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL) + { + ocl->compilerVersion = OPENCL_VERSION_2_0; + } + + return err; +} + + +/* +* This function picks/creates necessary OpenCL objects which are needed. +* The objects are: +* OpenCL platform, device, context, and command queue. 
+* +* All these steps are needed to be performed once in a regular OpenCL application. +* This happens before actual compute kernels calls are performed. +* +* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t, +* so this function populates fields of this structure, which is passed as parameter ocl. +* Please, consider reviewing the fields before going further. +* The structure definition is right in the beginning of this file. +*/ +int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) +{ + // The following variable stores return codes for all OpenCL calls. + cl_int err = CL_SUCCESS; + + // Query for all available OpenCL platforms on the system + // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string + deviceType = CL_DEVICE_TYPE_GPU; + cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType); + if (NULL == platformId) + { + deviceType = CL_DEVICE_TYPE_CPU; + platformId = FindOpenCLPlatform("", deviceType); + } + + if (NULL == platformId) + { + LogError("Error: Failed to find OpenCL platform.\n"); + return CL_INVALID_VALUE; + } + + // Create context with device of specified type. + // Required device type is passed as function argument deviceType. + // So you may use this function to create context for any CPU or GPU OpenCL device. 
+ // The creation is synchronized (pfn_notify is NULL) and NULL user_data + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 }; + ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err); + if ((CL_SUCCESS != err) || (NULL == ocl->context)) + { + LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Query for OpenCL device which was used for context creation + err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + // Read the OpenCL platform's version and the device OpenCL and OpenCL C versions + GetPlatformAndDeviceVersion(platformId, ocl); + + // Create command queue. + // OpenCL kernels are enqueued for execution to a particular device through special objects called command queues. + // Command queue guarantees some ordering between calls and other OpenCL commands. + // Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device. 
+#ifdef CL_VERSION_2_0 + if (OPENCL_VERSION_2_0 == ocl->deviceVersion) + { + const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; + ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err); + } + else { + // default behavior: OpenCL 1.2 + cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); + } +#else + // default behavior: OpenCL 1.2 + cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); +#endif + if (CL_SUCCESS != err) + { + LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + return CL_SUCCESS; +} \ No newline at end of file diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h new file mode 100644 index 00000000..2e2cf02c --- /dev/null +++ b/clguetzli/ocl.h @@ -0,0 +1,64 @@ +#pragma once + +#include "CL\cl.h" +#include "utils.h" + +// Macros for OpenCL versions +#define OPENCL_VERSION_1_2 1.2f +#define OPENCL_VERSION_2_0 2.0f + +struct ocl_args_d_t; + +/* This function helps to create informative messages in +* case when OpenCL errors occur. It returns a string +* representation for an OpenCL error code. +* (E.g. "CL_DEVICE_NOT_FOUND" instead of just -1.) +*/ +const char* TranslateOpenCLError(cl_int errorCode); + +/* +* This function picks/creates necessary OpenCL objects which are needed. +* The objects are: +* OpenCL platform, device, context, and command queue. +* +* All these steps are needed to be performed once in a regular OpenCL application. +* This happens before actual compute kernels calls are performed. +* +* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t, +* so this function populates fields of this structure, which is passed as parameter ocl. 
+* Please, consider reviewing the fields before going further. +* The structure definition is right in the beginning of this file. +*/ +int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); + + +/* Convenient container for all OpenCL specific objects used in the sample +* +* It consists of two parts: +* - regular OpenCL objects which are used in almost each normal OpenCL applications +* - several OpenCL objects that are specific for this particular sample +* +* You collect all these objects in one structure for utility purposes +* only, there is no OpenCL specific here: just to avoid global variables +* and make passing all these arguments in functions easier. +*/ +struct ocl_args_d_t +{ + ocl_args_d_t(); + ~ocl_args_d_t(); + + // Regular OpenCL objects: + cl_context context; // hold the context handler + cl_device_id device; // hold the selected device handler + cl_command_queue commandQueue; // hold the commands-queue handler + cl_program program; // hold the program handler + cl_kernel kernel; // hold the kernel handler + float platformVersion; // hold the OpenCL platform version (default 1.2) + float deviceVersion; // hold the OpenCL device version (default. 1.2) + float compilerVersion; // hold the device OpenCL C version (default. 1.2) + + // Objects that are specific for algorithm implemented in this sample + cl_mem srcA; // hold first source buffer + cl_mem srcB; // hold second source buffer + cl_mem dstMem; // hold destination buffer +}; diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp new file mode 100644 index 00000000..24520cd8 --- /dev/null +++ b/clguetzli/utils.cpp @@ -0,0 +1,96 @@ +/***************************************************************************** + * Copyright (c) 2013-2016 Intel Corporation + * All rights reserved. 
// printf-style informational logging: formats the arguments according to
// str and writes the result to stdout. A NULL format string is ignored.
void LogInfo(const char* str, ...)
{
    if (!str)
    {
        return;
    }

    va_list argList;
    va_start(argList, str);
    vfprintf(stdout, str, argList);
    va_end(argList);
}

// printf-style error logging: formats the arguments according to str and
// writes the result to stderr. A NULL format string is ignored.
void LogError(const char* str, ...)
{
    if (!str)
    {
        return;
    }

    va_list argList;
    va_start(argList, str);
    vfprintf(stderr, str, argList);
    va_end(argList);
}
+{ + if (str) + { + va_list args; + va_start(args, str); + + vfprintf(stderr, str, args); + + va_end(args); + } +} + +// Upload the OpenCL C source code to output argument source +// The memory resource is implicitly allocated in the function +// and should be deallocated by the caller +int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize) +{ + int errorCode = CL_SUCCESS; + + FILE* fp = NULL; + fopen_s(&fp, fileName, "rb"); + if (fp == NULL) + { + LogError("Error: Couldn't find program source file '%s'.\n", fileName); + errorCode = CL_INVALID_VALUE; + } + else { + fseek(fp, 0, SEEK_END); + *sourceSize = ftell(fp); + fseek(fp, 0, SEEK_SET); + + *source = new char[*sourceSize]; + if (*source == NULL) + { + LogError("Error: Couldn't allocate %d bytes for program source from file '%s'.\n", *sourceSize, fileName); + errorCode = CL_OUT_OF_HOST_MEMORY; + } + else { + fread(*source, 1, *sourceSize, fp); + } + } + return errorCode; +} +#pragma warning( pop ) \ No newline at end of file diff --git a/clguetzli/utils.h b/clguetzli/utils.h new file mode 100644 index 00000000..294f7137 --- /dev/null +++ b/clguetzli/utils.h @@ -0,0 +1,36 @@ +/***************************************************************************** + * Copyright (c) 2013-2016 Intel Corporation + * All rights reserved. + * + * WARRANTY DISCLAIMER + * + * THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE + * MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel Corporation is the author of the Materials, and requests that all + * problem reports or change requests be submitted to it directly + *****************************************************************************/ + +#include "CL\cl.h" +#include + + +#pragma once + +// Print useful information to the default output. Same usage as with printf +void LogInfo(const char* str, ...); + +// Print error notification to the default output. Same usage as with printf +void LogError(const char* str, ...); + +// Read OpenCL source code from fileName and store it in source. 
The number of read bytes returns in sourceSize +int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 05a625ec..cf770719 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -97,7 +97,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) Full true true @@ -108,8 +108,9 @@ Console true true - shlwapi.lib;%(AdditionalDependencies) + OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + $(INTELOCLSDKROOT)lib\x64 @@ -137,15 +138,16 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled Console true - shlwapi.lib;%(AdditionalDependencies) + OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + $(INTELOCLSDKROOT)lib\x64 @@ -166,9 +168,12 @@ - - - + + + + + + @@ -255,11 +260,14 @@ - - - - - + + + + + + + + @@ -342,6 +350,7 @@ + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index b35df618..12e7d8f4 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -22,6 +22,9 @@ {f2b475de-6219-478e-9e5e-08f07ef25dbc} + + {64847a89-ca39-4556-ba0e-d6875c4d39ca} + @@ -291,6 +294,15 @@ third_party\tcmalloc_minimal + + clguetzli + + + clguetzli + + + clguetzli + @@ -542,6 +554,15 @@ third_party\tcmalloc_minimal + + clguetzli + + + clguetzli + + + clguetzli + @@ -559,5 +580,8 @@ third_party\zlib - + + clguetzli + + \ No newline at end of file diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj index 5d9dd9cd..44a911b2 100644 --- a/guetzli_static.vcxproj 
+++ b/guetzli_static.vcxproj @@ -93,7 +93,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories) Full true true @@ -127,7 +127,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 8871bcdb..834cf2f8 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -40,6 +40,8 @@ #include #include +#include "clguetzli\clguetzli.h" + // Restricted pointers speed up Convolution(); MSVC uses a different keyword. #ifdef _MSC_VER #define __restrict__ __restrict @@ -68,6 +70,7 @@ static void Convolution(size_t xsize, size_t ysize, float* __restrict__ result) { PROFILER_FUNC; float weight_no_border = 0; + for (size_t j = 0; j <= 2 * offset; ++j) { weight_no_border += multipliers[j]; } @@ -1311,6 +1314,9 @@ double MaskDcB(double delta) { void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { + +// clMinSquareVal(square_size, offset, xsize, ysize, values); + PROFILER_FUNC; // offset is not negative and smaller than square_size. 
assert(offset < square_size); From c72cece021ae550987696a59e2d252eb2c8de5e3 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 27 Apr 2017 19:38:16 +0800 Subject: [PATCH 006/189] MinSquareVal with OpenCL --- clguetzli/clguetzli.cl | 43 +++++++- clguetzli/clguetzli.cpp | 101 +++++++++++++++--- clguetzli/clguetzli.h | 8 ++ clguetzli/ocl.cpp | 69 +++++++++++- clguetzli/ocl.h | 13 +++ .../butteraugli/butteraugli/butteraugli.cc | 3 +- 6 files changed, 216 insertions(+), 21 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index d71249b3..8d0aabff 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -6,10 +6,10 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si const int height = get_global_size(1); int minH = offset > y ? 0 : y - offset; - int maxH = y + square_size - offset > height ? y + square_size - offset : height; + int maxH = min(y + square_size - offset, height);// < height ? y + square_size - offset : height; int minW = offset > x ? 0 : x - offset; - int maxW = x + square_size - offset > width ? x + square_size - offset : width; + int maxW = min(x + square_size - offset, width);// < width ? x + square_size - offset : width; float minValue = pA[minH * width + minW]; @@ -21,5 +21,44 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si if (tmp < minValue) minValue = tmp; } } + pC[y * width + x] = minValue; } + +__kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result, + int xstep, int len, int offset, float border_ratio) +{ + const int ox = get_global_id(0); + const int y = get_global_id(1); + const int oxsize = get_global_size(0); + const int ysize = get_global_size(1); + + const int x = ox * xstep; + const int xsize = oxsize * xstep; + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 
0 : x - offset; + int maxx = min(xsize, x + len - offset) - 1; + + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + + result[y * oxsize + ox] = sum * scale; +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 377a468b..94a15040 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1,23 +1,21 @@ #include "clguetzli.h" #include "ocl.h" -void clMinSquareVal(size_t square_size, size_t offset, - size_t xsize, size_t ysize, - float *values) +ocl_args_d_t& getOcl(void) { - cl_int err = CL_SUCCESS; + static bool bInit = false; + static ocl_args_d_t ocl; - ocl_args_d_t ocl; - SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); - - cl_uint optimizedSize = ((sizeof(cl_float) * xsize * ysize - 1) / 64 + 1) * 64; - cl_float* inputA = (cl_float*)_aligned_malloc(optimizedSize, 4096); - cl_float* outputC = (cl_float*)_aligned_malloc(optimizedSize, 4096); + if (bInit == true) return ocl; - memcpy(inputA, values, sizeof(cl_float) * xsize * ysize); + bInit = true; + SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); - ocl.srcA = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * xsize * ysize, inputA, &err); - ocl.dstMem = clCreateBuffer(ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * xsize * ysize, outputC, &err); + cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); + if (CL_SUCCESS != err) + { + LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } char* source = nullptr; size_t src_size = 0; @@ -38,13 +36,28 @@ void clMinSquareVal(size_t square_size, size_t offset, LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); } + return ocl; 
+} + +void clMinSquareVal(size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + ocl.allocA(sizeof(cl_float) * xsize * ysize); + ocl.allocC(sizeof(cl_float) * xsize * ysize); + + memcpy(ocl.inputA, values, sizeof(cl_float) * xsize * ysize); + cl_int cloffset = offset; cl_int clsquare_size = square_size; clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&clsquare_size); + clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&clsquare_size); + clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&cloffset); size_t globalWorkSize[2] = { xsize, ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -70,7 +83,61 @@ void clMinSquareVal(size_t square_size, size_t offset, } memcpy(values, resultPtr, sizeof(cl_float) * xsize * ysize); +} + +void clConvolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* result) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + ocl.allocA(sizeof(cl_float) * len); + ocl.allocB(sizeof(cl_float) * xsize * ysize); + ocl.allocC(sizeof(cl_float) * xsize * ysize / xstep); + + memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len); + memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize); + + cl_int clxstep = xstep; + cl_int cllen = len; + cl_int cloffset = offset; + cl_float clborder_ratio = border_ratio; + + clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB); + clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&clxstep); 
+ clSetKernelArg(ocl.kernel, 4, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(ocl.kernel, 5, sizeof(cl_int), (void*)&cloffset); + clSetKernelArg(ocl.kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); + + size_t globalWorkSize[2] = { xsize / xstep, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } + + cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err); + if (CL_SUCCESS != err) + { + LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err)); + } - _aligned_free(inputA); - _aligned_free(outputC); + memcpy(result, resultPtr, sizeof(cl_float) * xsize * ysize / xstep); } \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index a6cf8242..31c2e7ba 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -3,3 +3,11 @@ void clMinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + +void clConvolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multiplier, + const float* inp, + float border_ratio, + float* result); \ No newline at end of file diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 077d3464..be1e9071 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -102,6 +102,73 @@ ocl_args_d_t::~ocl_args_d_t() * because it was not created at the startup, * but just queried from OpenCL runtime. 
*/ + + if (inputA) _aligned_free(inputA); + if (inputB) _aligned_free(inputB); + if (outputC) _aligned_free(outputC); +} + +void* ocl_args_d_t::allocA(size_t s) +{ + if (s < lenA) return inputA; + lenA = 0; + _aligned_free(inputA); + clReleaseMemObject(srcA); + + cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; + inputA = _aligned_malloc(optimizedSize, 4096); + lenA = s; + + cl_int err = 0; + srcA = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputA, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocA() for buffer returned %s.\n", TranslateOpenCLError(err)); + } + + return inputA; +} + +void* ocl_args_d_t::allocB(size_t s) +{ + if (s < lenB) return inputB; + lenB = 0; + _aligned_free(inputB); + clReleaseMemObject(srcB); + + cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; + inputB = _aligned_malloc(optimizedSize, 4096); + lenB = s; + + cl_int err = 0; + srcB = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputB, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err)); + } + + return inputB; +} + +void* ocl_args_d_t::allocC(size_t s) +{ + if (s < lenC) return outputC; + lenC = 0; + _aligned_free(outputC); + clReleaseMemObject(dstMem); + + cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; + outputC = _aligned_malloc(optimizedSize, 4096); + lenC = s; + + cl_int err = 0; + dstMem = clCreateBuffer(this->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, s, outputC, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err)); + } + + return outputC; } const char* TranslateOpenCLError(cl_int errorCode) @@ -404,7 +471,7 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) // Query for all available OpenCL platforms on the system // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string deviceType = CL_DEVICE_TYPE_GPU; - cl_platform_id 
platformId = FindOpenCLPlatform("Intel", deviceType); + cl_platform_id platformId = FindOpenCLPlatform("", deviceType); if (NULL == platformId) { deviceType = CL_DEVICE_TYPE_CPU; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 2e2cf02c..0a9e50b2 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -47,6 +47,10 @@ struct ocl_args_d_t ocl_args_d_t(); ~ocl_args_d_t(); + void* allocA(size_t s); + void* allocB(size_t s); + void* allocC(size_t s); + // Regular OpenCL objects: cl_context context; // hold the context handler cl_device_id device; // hold the selected device handler @@ -61,4 +65,13 @@ struct ocl_args_d_t cl_mem srcA; // hold first source buffer cl_mem srcB; // hold second source buffer cl_mem dstMem; // hold destination buffer + + void* inputA; + size_t lenA; + + void* inputB; + size_t lenB; + + void* outputC; + size_t lenC; }; diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 834cf2f8..0f84aa48 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1315,7 +1315,8 @@ void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { -// clMinSquareVal(square_size, offset, xsize, ysize, values); + clMinSquareVal(square_size, offset, xsize, ysize, values); + return; PROFILER_FUNC; // offset is not negative and smaller than square_size. 
From c354348db7b3eab528e0e346463db7449a52df81 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 28 Apr 2017 00:43:31 +0800 Subject: [PATCH 007/189] =?UTF-8?q?OpenCL=20=E4=BC=98=E5=8C=96=E5=8D=B7?= =?UTF-8?q?=E7=A7=AF=20=E7=94=B1=E4=BA=8E=E6=9C=89=E5=A4=A7=E9=87=8F8x8?= =?UTF-8?q?=E5=B0=8F=E5=9B=BE=E5=83=8F=E5=9D=97=E7=9A=84=E5=8D=B7=E7=A7=AF?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=EF=BC=8C=E6=AD=A4=E5=A4=84GPU=E6=80=A7?= =?UTF-8?q?=E8=83=BD=E6=B2=A1=E6=9C=89=E5=BE=97=E5=88=B0=E6=9C=80=E5=A4=A7?= =?UTF-8?q?=E5=8F=91=E6=8C=A5=EF=BC=8C=E5=8F=AA=E9=92=88=E5=AF=B9=E5=A4=A7?= =?UTF-8?q?=E5=9B=BE=E5=83=8F=E5=9D=97=E9=87=87=E7=94=A8OpenCL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 8 ++-- clguetzli/clguetzli.cpp | 47 ++++++++++++------- clguetzli/ocl.cpp | 30 +++++++++--- clguetzli/ocl.h | 8 +++- .../butteraugli/butteraugli/butteraugli.cc | 33 ++++++++++--- 5 files changed, 91 insertions(+), 35 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 8d0aabff..be73ceeb 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -26,15 +26,15 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si } __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result, - int xstep, int len, int offset, float border_ratio) + int xsize, int xstep, int len, int offset, float border_ratio) { const int ox = get_global_id(0); const int y = get_global_id(1); + const int oxsize = get_global_size(0); const int ysize = get_global_size(1); const int x = ox * xstep; - const int xsize = oxsize * xstep; float weight_no_border = 0; for (int j = 0; j <= 2 * offset; j++) @@ -43,7 +43,7 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl } int minx = x < offset ? 
0 : x - offset; - int maxx = min(xsize, x + len - offset) - 1; + int maxx = min(xsize, x + len - offset); float weight = 0.0; for (int j = minx; j < maxx; j++) @@ -60,5 +60,5 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl sum += inp[y * xsize + j] * multipliers[j - x + offset]; } - result[y * oxsize + ox] = sum * scale; + result[ox * ysize + y] = sum * scale; } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 94a15040..2b598416 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -30,10 +30,15 @@ ocl_args_d_t& getOcl(void) { LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); } - ocl.kernel = clCreateKernel(ocl.program, "MinSquareVal", &err); + ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err); if (CL_SUCCESS != err) { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clCreateKernel(MinSquareVal) for source program returned %s.\n", TranslateOpenCLError(err)); + } + ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err); + if (CL_SUCCESS != err) + { + LogError("Error: clCreateKernel(Convolution) for source program returned %s.\n", TranslateOpenCLError(err)); } return ocl; @@ -54,13 +59,14 @@ void clMinSquareVal(size_t square_size, size_t offset, cl_int cloffset = offset; cl_int clsquare_size = square_size; - clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); - clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&clsquare_size); - clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&cloffset); + cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size); 
+ clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); @@ -96,28 +102,33 @@ void clConvolution(size_t xsize, size_t ysize, cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); + size_t oxsize = xsize / xstep; + ocl.allocA(sizeof(cl_float) * len); ocl.allocB(sizeof(cl_float) * xsize * ysize); - ocl.allocC(sizeof(cl_float) * xsize * ysize / xstep); + ocl.allocC(sizeof(cl_float) * oxsize * ysize); memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len); memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize); + cl_int clxsize = xsize; cl_int clxstep = xstep; cl_int cllen = len; cl_int cloffset = offset; cl_float clborder_ratio = border_ratio; - clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); - clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB); - clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(ocl.kernel, 4, sizeof(cl_int), (void*)&cllen); - clSetKernelArg(ocl.kernel, 5, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(ocl.kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); + clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 6, sizeof(cl_int), 
(void*)&cloffset); + clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); size_t globalWorkSize[2] = { xsize / xstep, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); @@ -128,7 +139,7 @@ void clConvolution(size_t xsize, size_t ysize, LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); } - cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err); + cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * oxsize * ysize, 0, NULL, NULL, &err); if (CL_SUCCESS != err) { LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err)); @@ -139,5 +150,5 @@ void clConvolution(size_t xsize, size_t ysize, LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err)); } - memcpy(result, resultPtr, sizeof(cl_float) * xsize * ysize / xstep); + memcpy(result, resultPtr, sizeof(cl_float) * oxsize * ysize); } \ No newline at end of file diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index be1e9071..50d3ad6c 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -7,14 +7,23 @@ ocl_args_d_t::ocl_args_d_t() : device(NULL), commandQueue(NULL), program(NULL), - kernel(NULL), platformVersion(OPENCL_VERSION_1_2), deviceVersion(OPENCL_VERSION_1_2), compilerVersion(OPENCL_VERSION_1_2), srcA(NULL), srcB(NULL), - dstMem(NULL) + dstMem(NULL), + inputA(NULL), + lenA(0), + inputB(NULL), + lenB(0), + outputC(NULL), + lenC(0) { + for (int i = 0; i < KERNEL_COUNT; i++) + { + kernel[i] = NULL; + } } /* @@ -31,7 +40,15 @@ ocl_args_d_t::ocl_args_d_t() : 
ocl_args_d_t::~ocl_args_d_t() { cl_int err = CL_SUCCESS; - + for (int i = 0; i < KERNEL_COUNT; i++) + { + err = clReleaseKernel(kernel[i]); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); + } + } +/* if (kernel) { err = clReleaseKernel(kernel); @@ -40,6 +57,7 @@ ocl_args_d_t::~ocl_args_d_t() LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); } } +*/ if (program) { err = clReleaseProgram(program); @@ -110,7 +128,7 @@ ocl_args_d_t::~ocl_args_d_t() void* ocl_args_d_t::allocA(size_t s) { - if (s < lenA) return inputA; + if (s <= lenA) return inputA; lenA = 0; _aligned_free(inputA); clReleaseMemObject(srcA); @@ -131,7 +149,7 @@ void* ocl_args_d_t::allocA(size_t s) void* ocl_args_d_t::allocB(size_t s) { - if (s < lenB) return inputB; + if (s <= lenB) return inputB; lenB = 0; _aligned_free(inputB); clReleaseMemObject(srcB); @@ -152,7 +170,7 @@ void* ocl_args_d_t::allocB(size_t s) void* ocl_args_d_t::allocC(size_t s) { - if (s < lenC) return outputC; + if (s <= lenC) return outputC; lenC = 0; _aligned_free(outputC); clReleaseMemObject(dstMem); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 0a9e50b2..5f21a0e3 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -42,6 +42,12 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); * only, there is no OpenCL specific here: just to avoid global variables * and make passing all these arguments in functions easier. 
*/ + +#define KERNEL_MINSQUAREVAL 0 +#define KERNEL_CONVOLUTION 1 + +#define KERNEL_COUNT 2 + struct ocl_args_d_t { ocl_args_d_t(); @@ -56,7 +62,7 @@ struct ocl_args_d_t cl_device_id device; // hold the selected device handler cl_command_queue commandQueue; // hold the commands-queue handler cl_program program; // hold the program handler - cl_kernel kernel; // hold the kernel handler + cl_kernel kernel[KERNEL_COUNT]; // hold the kernel handler float platformVersion; // hold the OpenCL platform version (default 1.2) float deviceVersion; // hold the OpenCL device version (default. 1.2) float compilerVersion; // hold the device OpenCL C version (default. 1.2) diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 0f84aa48..26cbca23 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -62,12 +62,19 @@ inline double DotProduct(const float u[3], const double v[3]) { // Computes a horizontal convolution and transposes the result. 
static void Convolution(size_t xsize, size_t ysize, - size_t xstep, - size_t len, size_t offset, - const float* __restrict__ multipliers, - const float* __restrict__ inp, - float border_ratio, - float* __restrict__ result) { + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result) { + + if (xsize > 100 && ysize > 100) + { + clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + return; + } + PROFILER_FUNC; float weight_no_border = 0; @@ -92,6 +99,20 @@ static void Convolution(size_t xsize, size_t ysize, result[ox * ysize + y] = static_cast(sum * scale); } } + + return; + + // for verify + std::vector tmp(xsize / xstep * ysize); + clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, &tmp[0]); + + for (int i = 0; i < xsize / xstep * ysize; i++) + { + if (fabs(result[i] - tmp[i]) > 0.0001) + { + tmp[i] = result[i]; + } + } } void Blur(size_t xsize, size_t ysize, float* channel, double sigma, From 5d8ba53d419b21010b2d3462535b18c8b525115e Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 28 Apr 2017 09:46:22 +0800 Subject: [PATCH 008/189] fix setupopencl --- clguetzli/clguetzli.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 2b598416..211e3fb4 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -9,8 +9,6 @@ ocl_args_d_t& getOcl(void) if (bInit == true) return ocl; bInit = true; - SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); - cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); if (CL_SUCCESS != err) { From 82265a603e37f1b892f19dd417238197355a4760 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 2 May 2017 11:55:37 +0800 Subject: [PATCH 009/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli # Conflicts: # third_party/butteraugli/butteraugli/butteraugli.cc --- clguetzli/clguetzli.cl | 4 +- 
clguetzli/clguetzli.cpp | 4 +- clguetzli/clguetzli.h | 2 + guetzli.vcxproj | 64 +++++++++++++------ guetzli.vcxproj.filters | 6 +- guetzli/guetzli.cc | 10 ++- .../butteraugli/butteraugli/butteraugli.cc | 37 +++++++++-- 7 files changed, 95 insertions(+), 32 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index be73ceeb..6159832d 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -6,10 +6,10 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si const int height = get_global_size(1); int minH = offset > y ? 0 : y - offset; - int maxH = min(y + square_size - offset, height);// < height ? y + square_size - offset : height; + int maxH = min(y + square_size - offset, height); int minW = offset > x ? 0 : x - offset; - int maxW = min(x + square_size - offset, width);// < width ? x + square_size - offset : width; + int maxW = min(x + square_size - offset, width); float minValue = pA[minH * width + minW]; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 211e3fb4..5db62cc2 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1,6 +1,8 @@ #include "clguetzli.h" #include "ocl.h" +extern bool g_useOpenCL = false; + ocl_args_d_t& getOcl(void) { static bool bInit = false; @@ -17,7 +19,7 @@ ocl_args_d_t& getOcl(void) char* source = nullptr; size_t src_size = 0; - ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size); + ReadSourceFromFile("clguetzli.cl", &source, &src_size); ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 31c2e7ba..df3dbc1d 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,5 +1,7 @@ #pragma once +extern bool g_useOpenCL; + void clMinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index cf770719..fb32ae0f 100644 --- 
a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -51,6 +51,7 @@ + @@ -78,6 +79,8 @@ obj\x86\Release\guetzli\ guetzli .exe + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) true @@ -92,6 +95,8 @@ obj\x86\Debug\guetzli\ guetzli .exe + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) @@ -109,30 +114,43 @@ true true OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup + mainCRTStartup $(INTELOCLSDKROOT)lib\x64 - - - + + + "$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current -bo=" " + + + OpenCL Code Builder + + + false + + + + copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl + + + NotUsing Level3 .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - Full + Disabled true - true + false false true - PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) + PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) Console - true - true - shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup + true + true + shlwapi.lib;OpenCL.lib;%(AdditionalDependencies) + mainCRTStartup __tcmalloc - + @@ -146,23 +164,26 @@ Console true OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup + mainCRTStartup $(INTELOCLSDKROOT)lib\x64 - - - + + + copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl + + + NotUsing Level3 .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled - 
PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) + PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) Console true - shlwapi.lib;%(AdditionalDependencies) + shlwapi.lib;OpenCL.lib;%(AdditionalDependencies) mainCRTStartup __tcmalloc @@ -350,7 +371,11 @@ - + + Document + + + @@ -359,5 +384,6 @@ + \ No newline at end of file diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 12e7d8f4..308cad47 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -580,8 +580,10 @@ third_party\zlib - + + + clguetzli - + \ No newline at end of file diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 85cd4bb7..3355265e 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -28,6 +28,7 @@ #include "guetzli/processor.h" #include "guetzli/quality.h" #include "guetzli/stats.h" +#include "clguetzli\clguetzli.h" namespace { @@ -225,7 +226,8 @@ void Usage() { " Default value is %d.\n" " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. 
Default limit is %d MB.\n" - " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); + " --nomemlimit - Do not limit memory usage.\n" + " --opencl - Use OpenCL\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } @@ -256,7 +258,11 @@ int main(int argc, char** argv) { memlimit_mb = atoi(argv[opt_idx]); } else if (!strcmp(argv[opt_idx], "--nomemlimit")) { memlimit_mb = -1; - } else if (!strcmp(argv[opt_idx], "--")) { + } + else if (!strcmp(argv[opt_idx], "--opencl")) { + g_useOpenCL = true; + } + else if (!strcmp(argv[opt_idx], "--")) { opt_idx++; break; } else { diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 26cbca23..4fb7eb21 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -69,11 +69,14 @@ static void Convolution(size_t xsize, size_t ysize, float border_ratio, float* __restrict__ result) { - if (xsize > 100 && ysize > 100) +#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) + if (g_useOpenCL && xsize > 100 && ysize > 100) { clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); return; } +#endif // ENABLE_OPENCL + PROFILER_FUNC; float weight_no_border = 0; @@ -100,8 +103,8 @@ static void Convolution(size_t xsize, size_t ysize, } } - return; +#ifdef ENABLE_OPENCL_CHECK // for verify std::vector tmp(xsize / xstep * ysize); clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, &tmp[0]); @@ -110,9 +113,10 @@ static void Convolution(size_t xsize, size_t ysize, { if (fabs(result[i] - tmp[i]) > 0.0001) { - tmp[i] = result[i]; + assert(false); } } +#endif // ENABLE_OPENCL_CHECK } void Blur(size_t xsize, size_t ysize, float* channel, double sigma, @@ -1335,11 +1339,21 @@ double MaskDcB(double delta) { void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { - - 
clMinSquareVal(square_size, offset, xsize, ysize, values); - return; +#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) + if (g_useOpenCL) + { + clMinSquareVal(square_size, offset, xsize, ysize, values); + return; + } +#endif // ENABLE_OPENCL PROFILER_FUNC; + +#ifdef ENABLE_OPENCL_CHECK + std::vector backup(xsize * ysize); + memcpy(&backup[0], values, xsize * ysize); +#endif + // offset is not negative and smaller than square_size. assert(offset < square_size); std::vector tmp(xsize * ysize); @@ -1380,6 +1394,17 @@ void MinSquareVal(size_t square_size, size_t offset, *pValuePoint = min; pValuePoint += xsize; } } + +#ifdef ENABLE_OPENCL_CHECK + clMinSquareVal(square_size, offset, xsize, ysize, backup.data()); + for (int i = 0; i < xsize * ysize; i++) + { + if (fabs(backup[i] - values[i]) > 0.0001) + { + assert(false); + } + } +#endif } // ===== Functions used by Mask only ===== From 775c63c56b6b5e02cca4f666b1be68232f8cf987 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Wed, 3 May 2017 00:37:52 +0800 Subject: [PATCH 010/189] Add comment for understanding. 
--- guetzli/processor.cc | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 9986f9ed..134dfe17 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -381,10 +381,10 @@ void Processor::ComputeBlockZeroingOrder( static const double kWeight[3] = { 1.0, 0.22, 0.20 }; #include "guetzli/order.inc" std::vector > input_order; - for (int c = 0; c < 3; ++c) { + for (int c = 0; c < 3; ++c) { // TOBEREMOVE:¼ÆËãÊäÈëblockµÄinput_order,·Ç0µÄ´ò·Ö if (!(comp_mask & (1 << c))) continue; for (int k = 1; k < kDCTBlockSize; ++k) { - int idx = c * kDCTBlockSize + k; + int idx = c * kDCTBlockSize + k; // TOBEREMOVE:ÿ¸ö·ÖÁ¿ÒÀ´Î if (block[idx] != 0) { float score; if (params_.new_zeroing_model) { @@ -412,7 +412,7 @@ void Processor::ComputeBlockZeroingOrder( coeff_t candidate_block[kBlockSize]; memcpy(candidate_block, processed_block, sizeof(candidate_block)); const int idx = input_order[i].first; - candidate_block[idx] = 0; + candidate_block[idx] = 0; // TOBEREMOVE:¶Ô±ÈblockµÄÅÅÐòµÃ·ÖǰiµÍµÄÖÃ0(i¸ù¾Ýinput_orderÊý¾Ý±ä»¯¶ø±ä»¯)£¬²¢ÏÈÉèÖûضԱÈͼÏñµÄÈý¸ö·ÖÁ¿¶ÔÓ¦blockÖÐÈ¥£¬ºóÐøÔÙ×ö¶Ô±È²ÉÓᣠfor (int c = 0; c < 3; ++c) { if (comp_mask & (1 << c)) { img->component(c).SetCoeffBlock( @@ -425,12 +425,12 @@ void Processor::ComputeBlockZeroingOrder( int block_xx = block_x * factor_x + ix; int block_yy = block_y * factor_y + iy; if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { - float err = static_cast(comparator_->CompareBlock(*img, ix, iy)); + float err = static_cast(comparator_->CompareBlock(*img, ix, iy)); // TOBEREMOVE:ºÍԭͼµÄ¶ÔÓ¦block±È½Ï£¬·µ»Ø´íÎóÖµ max_err = std::max(max_err, err); } } } - if (max_err < best_err) { + if (max_err < best_err) { // TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi best_err = max_err; best_i = i; } @@ -438,7 +438,7 @@ void Processor::ComputeBlockZeroingOrder( int idx = input_order[best_i].first; processed_block[idx] = 0; input_order.erase(input_order.begin() + best_i); - 
output_order->push_back({idx, best_err}); + output_order->push_back({idx, best_err}); // TOBEREMOVE:½«ÉÏÃæ¼ÆËã³öÀ´µÄ×îС´íÎóµÄidx£¬¶ÔÓ¦µ½¶Ô±ÈblockÖеĶÔӦλÖÃÕæÕýµÄÖÃΪ0,ÒÆ³ýinput_orderÏ¼´Ñ¡È¡µ±Ç°Öµ£¬·ÅÈëoutput_order,²¢ÕýʽµÄÉèÖõ½¶Ô±ÈͼÏñÖÐÈ¥¡£ for (int c = 0; c < 3; ++c) { if (comp_mask & (1 << c)) { img->component(c).SetCoeffBlock( @@ -446,6 +446,8 @@ void Processor::ComputeBlockZeroingOrder( } } } + + // TOBEREMOVE:×îÖÕÒÆ³ýerrÊý´óÓÚerrorÏÞÖÆµÄÏî·µ»Ø£¬²¢»¹Ô­¶Ô±ÈͼÏñµ½Ô­Ê¼Öµ¡£ // Make the block error values monotonic. float min_err = 1e10; for (int i = output_order->size() - 1; i >= 0; --i) { @@ -560,7 +562,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, candidate_coeff_errors.reserve(60 * num_blocks); std::vector block_order; block_order.reserve(3 * kDCTBlockSize); - comparator_->StartBlockComparisons(); + comparator_->StartBlockComparisons(); // TOBEREMOVE:³õʼ»¯Ò»Ð©²ÎÊý for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { coeff_t block[kBlockSize] = { 0 }; @@ -570,25 +572,25 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, assert(img->component(c).factor_x() == factor_x); assert(img->component(c).factor_y() == factor_y); img->component(c).GetCoeffBlock(block_x, block_y, - &block[c * kDCTBlockSize]); + &block[c * kDCTBlockSize]); // TOBEREMOVE:È¡³ö¶Ô±ÈͼÏñblockϵÊý const JPEGComponent& comp = jpg.components[c]; int jpg_block_ix = block_y * comp.width_in_blocks + block_x; memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], - kDCTBlockSize * sizeof(orig_block[0])); + kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:È¡³öԭʼͼÏñblockϵÊý } } block_order.clear(); ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, - factor_y, comp_mask, img, &block_order); + factor_y, comp_mask, img, &block_order); // TOBEREMOVE:´«ÈëԭʼblockºÍ¶Ô±ÈͼÏñblock¼ÆËãzeroing order·ÅÈëblock_order 
candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); - for (size_t i = 0; i < block_order.size(); ++i) { + for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE:°Ñ½á¹û¸³Öµµ½ºòѡϵÊý candidate_coeffs.push_back(block_order[i].idx); candidate_coeff_errors.push_back(block_order[i].block_err); } } } - comparator_->FinishBlockComparisons(); + comparator_->FinishBlockComparisons(); // TOBEREMOVE:Çå³ý²ÎÊý candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); std::vector ac_histograms(ncomp); From 4061ccb9cbd87d12be5e5616b7a0393e8e754dd4 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 3 May 2017 17:08:08 +0800 Subject: [PATCH 011/189] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E7=9C=8B=E4=B8=80?= =?UTF-8?q?=E4=B8=8B=E5=85=A8OpenCL=E5=8C=96Blur=E5=87=BD=E6=95=B0?= =?UTF-8?q?=EF=BC=8C=E4=B8=8D=E8=BF=87=E7=9B=AE=E5=89=8D=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E8=AF=AF=E5=B7=AE=E6=9C=89=E4=BA=9B=E5=A4=A7=EF=BC=8C=E6=98=AF?= =?UTF-8?q?=E5=90=A6=E6=9C=89Bug=EF=BC=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 142 ++++++++++++++++++ clguetzli/clguetzli.cpp | 90 +++++++++++ clguetzli/clguetzli.h | 4 +- clguetzli/ocl.cpp | 6 +- clguetzli/ocl.h | 5 +- guetzli.vcxproj | 12 +- .../butteraugli/butteraugli/butteraugli.cc | 39 ++++- 7 files changed, 284 insertions(+), 14 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 6159832d..67443e17 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -62,3 +62,145 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl result[ox * ysize + y] = sum * scale; } +/* +__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result, + int len, int offset, float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + float weight_no_border = 0; + for 
(int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 0 : x - offset; + int maxx = min(xsize, x + len - offset); + + int miny = y < offset ? 0 : y - offset; + int maxy = min(ysize, y + len - offset); + + float weightX = 0.0; + for (int j = minx; j < maxx; j++) + { + weightX += multipliers[j - x + offset]; + } + + weightX = (1.0 - border_ratio) * weightX + border_ratio * weight_no_border; + + float weightY = 0.0; + for (int j = miny; j < maxy; j++) + { + weightY += multipliers[j - y + offset]; + } + + weightY = (1.0 - border_ratio) * weightY + border_ratio * weight_no_border; + + + float sum = 0.0; + for (int j = miny; j < maxy; j++) + { + float sumx = 0.0; + for (int i = minx; i < maxx; i++) + { + sumx += inp[j * xsize + i] * multipliers[i - x + offset]; + } + + sum += sumx * multipliers[j - y + offset]; + } + + result[y * xsize + x] = sum / weightY / weightX; +} +*/ + +__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result, + int len, int offset, float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 
0 : x - offset; + int maxx = min(xsize, x + len - offset); + + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + + result[x * ysize + y] = sum * scale; +} + +__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result, + int len, int offset, float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int miny = y < offset ? 0 : y - offset; + int maxy = min(ysize, y + len - offset); + + float weight = 0.0; + for (int j = miny; j < maxy; j++) + { + weight += multipliers[j - y + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = miny; j < maxy; j++) + { + sum += inp[j * xsize + x] * multipliers[j - y + offset]; + } + + result[y * xsize + x] = sum * scale; +} + +__kernel void DownSample(__global float* pA, __global float* pC, int square) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + const int oxsize = xsize / square; + + const int sample_x = x / square; + const int sample_y = y / square; + + pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 5db62cc2..bbd578ad 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1,3 +1,6 @@ +#include +#include +#include #include "clguetzli.h" #include 
"ocl.h" @@ -40,6 +43,9 @@ ocl_args_d_t& getOcl(void) { LogError("Error: clCreateKernel(Convolution) for source program returned %s.\n", TranslateOpenCLError(err)); } + ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err); + ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err); + ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err); return ocl; } @@ -151,4 +157,88 @@ void clConvolution(size_t xsize, size_t ysize, } memcpy(result, resultPtr, sizeof(cl_float) * oxsize * ysize); +} + +void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) +{ + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + ocl.allocA(sizeof(cl_float) * expn_size); + ocl.allocB(sizeof(cl_float) * xsize * ysize); + ocl.allocC(sizeof(cl_float) * xsize * ysize); + + memcpy(ocl.inputA, expn.data(), sizeof(cl_float) * expn_size); + memcpy(ocl.inputB, channel, sizeof(cl_float) * xsize * ysize); + + cl_int clxsize = xsize; + cl_int clxstep = xstep; + cl_int cllen = expn_size; + cl_int cloffset = diff; + cl_float clborder_ratio = border_ratio; + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); + clSetKernelArg(kernel, 5, 
sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); + clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); + + size_t globalWorkSize[2] = { xsize / xstep, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + globalWorkSize[0] = ysize / xstep; + globalWorkSize[1] = xsize / xstep; + clxsize = ysize; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.srcB); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); + clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); + clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); + + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + cl_int clstep = xstep; + if (clstep <= 1) + { + cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.srcB, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize); + } + else + { + kernel = ocl.kernel[KERNEL_DOWNSAMPLE]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clstep); + + globalWorkSize[0] = ysize; + globalWorkSize[1] = xsize; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize 
* ysize, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize); + } } \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index df3dbc1d..e918b0d9 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -12,4 +12,6 @@ void clConvolution(size_t xsize, size_t ysize, const float* multiplier, const float* inp, float border_ratio, - float* result); \ No newline at end of file + float* result); + +void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 50d3ad6c..3dd34e80 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -138,7 +138,7 @@ void* ocl_args_d_t::allocA(size_t s) lenA = s; cl_int err = 0; - srcA = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputA, &err); + srcA = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputA, &err); if (CL_SUCCESS != err) { LogError("Error: allocA() for buffer returned %s.\n", TranslateOpenCLError(err)); @@ -159,7 +159,7 @@ void* ocl_args_d_t::allocB(size_t s) lenB = s; cl_int err = 0; - srcB = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputB, &err); + srcB = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputB, &err); if (CL_SUCCESS != err) { LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err)); @@ -180,7 +180,7 @@ void* ocl_args_d_t::allocC(size_t s) lenC = s; cl_int err = 0; - dstMem = clCreateBuffer(this->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, s, outputC, &err); + dstMem = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, outputC, &err); if (CL_SUCCESS != err) { LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err)); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 5f21a0e3..f8a86045 100644 --- a/clguetzli/ocl.h +++ 
b/clguetzli/ocl.h @@ -45,8 +45,11 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); #define KERNEL_MINSQUAREVAL 0 #define KERNEL_CONVOLUTION 1 +#define KERNEL_CONVOLUTIONX 2 +#define KERNEL_CONVOLUTIONY 3 +#define KERNEL_DOWNSAMPLE 4 -#define KERNEL_COUNT 2 +#define KERNEL_COUNT 5 struct ocl_args_d_t { diff --git a/guetzli.vcxproj b/guetzli.vcxproj index fb32ae0f..3aa98abf 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -108,6 +108,7 @@ true false true + ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions) Console @@ -157,11 +158,12 @@ NotUsing Level3 .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - EditAndContinue - Disabled - - - Console + EditAndContinue + Disabled + ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions) + + + Console true OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 4fb7eb21..dbcac422 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -68,7 +68,7 @@ static void Convolution(size_t xsize, size_t ysize, const float* __restrict__ inp, float border_ratio, float* __restrict__ result) { - +/* #if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) if (g_useOpenCL && xsize > 100 && ysize > 100) { @@ -76,7 +76,7 @@ static void Convolution(size_t xsize, size_t ysize, return; } #endif // ENABLE_OPENCL - +*/ PROFILER_FUNC; float weight_no_border = 0; @@ -103,7 +103,7 @@ static void Convolution(size_t xsize, size_t ysize, } } - + /* #ifdef ENABLE_OPENCL_CHECK // for verify std::vector tmp(xsize / xstep * ysize); @@ -117,10 +117,24 @@ static void Convolution(size_t xsize, size_t ysize, } } #endif // ENABLE_OPENCL_CHECK +*/ } void 
Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { + +#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) + if (g_useOpenCL && xsize > 100 && ysize > 100) + { + clBlur(xsize, ysize, channel, sigma, border_ratio); + return; + } +#endif // ENABLE_OPENCL +#ifdef ENABLE_OPENCL_CHECK + std::vector tmpChannel(xsize * ysize); + memcpy(tmpChannel.data(), channel, xsize * ysize * sizeof(float)); +#endif + PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); @@ -156,6 +170,23 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, } } } + +#ifdef ENABLE_OPENCL_CHECK + // for verify + { + if (xsize < 100 || ysize < 100) return; + + clBlur(xsize, ysize, tmpChannel.data(), sigma, border_ratio); + + for (int i = 0; i < xsize * ysize; i++) + { + if (fabs(channel[i] - tmpChannel[i]) > 0.0001) + { + float k = channel[i] - tmpChannel[i]; + } + } + } +#endif // ENABLE_OPENCL_CHECK } // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. @@ -1351,7 +1382,7 @@ void MinSquareVal(size_t square_size, size_t offset, #ifdef ENABLE_OPENCL_CHECK std::vector backup(xsize * ysize); - memcpy(&backup[0], values, xsize * ysize); + memcpy(&backup[0], values, xsize * ysize * sizeof(float)); #endif // offset is not negative and smaller than square_size. 
From d9a87afad06f2e6933cd8d6f5b768e16b4a7b1f9 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 4 May 2017 09:43:13 +0800 Subject: [PATCH 012/189] add opencl process line --- clguetzli/clguetzli.cl | 27 +++++- clguetzli/clguetzli.cpp | 176 +++++++++++++++++++++++++++++++++++++++- clguetzli/clguetzli.h | 6 +- clguetzli/ocl.cpp | 12 +++ clguetzli/ocl.h | 2 + 5 files changed, 218 insertions(+), 5 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 67443e17..af70a10b 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1,3 +1,11 @@ +float minfun(float a, float b) +{ + if (a < b) + return a; + else + return b; +} + __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset) { const int x = get_global_id(0); @@ -17,14 +25,25 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si { for (int i = minW; i < maxW; i++) { - float tmp = pA[j * width + i]; - if (tmp < minValue) minValue = tmp; + minValue = minfun(minValue, pA[j * width + i]); +// float tmp = pA[j * width + i]; +// if (tmp < minValue) minValue = tmp; } } pC[y * width + x] = minValue; } +float calcWeight(__global float* multipliers, int len) +{ + float weight_no_border = 0; + for (int j = 0; j < len; j++) + { + weight_no_border += multipliers[j]; + } + return weight_no_border; +} + __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result, int xsize, int xstep, int len, int offset, float border_ratio) { @@ -35,12 +54,14 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl const int ysize = get_global_size(1); const int x = ox * xstep; - +/* float weight_no_border = 0; for (int j = 0; j <= 2 * offset; j++) { weight_no_border += multipliers[j]; } +*/ + float weight_no_border = calcWeight(multipliers, len); int minx = x < offset ? 
0 : x - offset; int maxx = min(xsize, x + len - offset); diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index bbd578ad..ea8ff091 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -241,4 +241,178 @@ void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double bor err = clFinish(ocl.commandQueue); memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize); } -} \ No newline at end of file +} + +void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize, cl_mem expn, size_t expn_size, + int step, int offset, double border_ratio, cl_mem result) +{ + // Convolution +} + +void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep, cl_mem result) +{ +/* + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + // TODO: Use correct rounding. + channel[y * xsize + x] = + downsampled_output[(y / ystep) * dxsize + (x / xstep)]; + } + } +*/ +} + +void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio) +{ + double m = 2.25; // Accuracy increases when m is increased. 
+ const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + const int ystep = xstep; + int dxsize = (xsize + xstep - 1) / xstep; + int dysize = (ysize + ystep - 1) / ystep; + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size); + + clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + if (xstep > 1) + { + ocl.allocA(sizeof(cl_float) * dxsize * ysize); + ocl.allocB(sizeof(cl_float) * dxsize * dysize); + + clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); + clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB); + clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, image); + } + else + { + ocl.allocA(sizeof(cl_float) * xsize * ysize); + clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); + clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, image); + } + + clReleaseMemObject(mem_expn); +} + +void clOpsinDynamicsImageEx(cl_mem r, cl_mem g, cl_mem b, size_t size) +{ +/* + for (size_t i = 0; i < rgb[0].size(); ++i) { + double sensitivity[3]; + { + // Calculate sensitivity[3] based on the smoothed image gamma derivative. 
+ double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre_rgb, pre_mixed); + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + } + double cur_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + rgb[0][i] = static_cast(x); + rgb[1][i] = static_cast(y); + rgb[2][i] = static_cast(z); +*/ +} + +void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b) +{ + static const double kSigma = 1.1; + + cl_int channel_size = xsize * ysize * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_r = ocl.allocMem(channel_size); + cl_mem mem_g = ocl.allocMem(channel_size); + cl_mem mem_b = ocl.allocMem(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clBlurEx(mem_r, xsize, ysize, kSigma, 0.0); + clBlurEx(mem_g, xsize, ysize, kSigma, 0.0); + clBlurEx(mem_b, xsize, ysize, kSigma, 0.0); + + clOpsinDynamicsImageEx(mem_r, mem_g, mem_b, xsize * ysize); + + cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_b, true, CL_MAP_READ, 0, channel_size, 0, NULL, 
NULL, &err); + err = clFinish(ocl.commandQueue); + + memcpy(r, result_r, channel_size); + memcpy(g, result_g, channel_size); + memcpy(b, result_b, channel_size); + + clReleaseMemObject(mem_r); + clReleaseMemObject(mem_g); + clReleaseMemObject(mem_b); +} + +void clMaskHighIntensityChangeEx(cl_mem r, cl_mem g, cl_mem b, + cl_mem r2, cl_mem g2,cl_mem b2, + size_t xsize, size_t ysize) +{ + // MaskHighIntensityChange +} + +void clEdgeDetectorMap(cl_mem r, cl_mem g, cl_mem b, + cl_mem r2, cl_mem g2, cl_mem b2, + size_t xsize, size_t ysize) +{ + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + clBlurEx(r, xsize, ysize, kSigma[0], 0.0); + clBlurEx(r2, xsize, ysize, kSigma[0], 0.0); + clBlurEx(g, xsize, ysize, kSigma[1], 0.0); + clBlurEx(r2, xsize, ysize, kSigma[1], 0.0); + clBlurEx(b, xsize, ysize, kSigma[2], 0.0); + clBlurEx(b2, xsize, ysize, kSigma[2], 0.0); +} +void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, + float* r2, float* g2, float* b2, + size_t xsize, size_t ysize, + float* result) +{ + + cl_int channel_size = xsize * ysize * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_r = ocl.allocMem(channel_size); + cl_mem mem_g = ocl.allocMem(channel_size); + cl_mem mem_b = ocl.allocMem(channel_size); + cl_mem mem_r2 = ocl.allocMem(channel_size); + cl_mem mem_g2 = ocl.allocMem(channel_size); + cl_mem mem_b2 = ocl.allocMem(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_r2, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_g2, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_b2, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + err = 
clFinish(ocl.commandQueue); + + clMaskHighIntensityChangeEx(mem_r, mem_g, mem_b, mem_r2, mem_g2, mem_b2, xsize, ysize); +} diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index e918b0d9..fa489667 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,7 +1,11 @@ #pragma once - +#include "CL\cl.h" extern bool g_useOpenCL; +void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float& b); + +void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio); + void clMinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 3dd34e80..36fc4041 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -189,6 +189,18 @@ void* ocl_args_d_t::allocC(size_t s) return outputC; } +cl_mem ocl_args_d_t::allocMem(size_t s) +{ + cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; + cl_int err = 0; + cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err)); + } + return mem; +} + const char* TranslateOpenCLError(cl_int errorCode) { switch (errorCode) diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index f8a86045..04bf5b1d 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -60,6 +60,8 @@ struct ocl_args_d_t void* allocB(size_t s); void* allocC(size_t s); + cl_mem allocMem(size_t s); + // Regular OpenCL objects: cl_context context; // hold the context handler cl_device_id device; // hold the selected device handler From b6188431f0839d5e0608ecc746412bac11f6c4fb Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 4 May 2017 09:45:05 +0800 Subject: [PATCH 013/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- third_party/butteraugli/butteraugli/butteraugli.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index dbcac422..fb895b34 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1381,8 +1381,7 @@ void MinSquareVal(size_t square_size, size_t offset, PROFILER_FUNC; #ifdef ENABLE_OPENCL_CHECK - std::vector backup(xsize * ysize); - memcpy(&backup[0], values, xsize * ysize * sizeof(float)); + std::vector backup(values, values + xsize * ysize); #endif // offset is not negative and smaller than square_size. From 0ba681731a7bf7f9c02cbdd003391f8979bf3a23 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 4 May 2017 14:09:31 +0800 Subject: [PATCH 014/189] add function --- clguetzli/clguetzli.cpp | 94 +++++++++++++++++++++++++++++------------ clguetzli/ocl.cpp | 25 +++++++++++ clguetzli/ocl.h | 8 ++++ 3 files changed, 101 insertions(+), 26 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index ea8ff091..21f5cc73 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -370,25 +370,55 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* clReleaseMemObject(mem_b); } -void clMaskHighIntensityChangeEx(cl_mem r, cl_mem g, cl_mem b, - cl_mem r2, cl_mem g2,cl_mem b2, - size_t xsize, size_t ysize) +void clMaskHighIntensityChangeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize) { // MaskHighIntensityChange } -void clEdgeDetectorMap(cl_mem r, cl_mem g, cl_mem b, - cl_mem r2, cl_mem g2, cl_mem b2, - size_t xsize, size_t ysize) +void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result) { static const double kSigma[3] = { 1.5, 0.586, 0.4 }; - clBlurEx(r, xsize, ysize, kSigma[0], 0.0); - clBlurEx(r2, xsize, ysize, kSigma[0], 0.0); - clBlurEx(g, xsize, ysize, kSigma[1], 0.0); - clBlurEx(r2, xsize, ysize, kSigma[1], 0.0); - clBlurEx(b, xsize, ysize, kSigma[2], 0.0); - clBlurEx(b2, 
xsize, ysize, kSigma[2], 0.0); + clBlurEx(rgb.r, xsize, ysize, kSigma[0], 0.0); + clBlurEx(rgb2.r, xsize, ysize, kSigma[0], 0.0); + clBlurEx(rgb.g, xsize, ysize, kSigma[1], 0.0); + clBlurEx(rgb2.g, xsize, ysize, kSigma[1], 0.0); + clBlurEx(rgb.b, xsize, ysize, kSigma[2], 0.0); + clBlurEx(rgb2.b, xsize, ysize, kSigma[2], 0.0); + + // EdgeDetectorLowFreq } + +void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, + size_t xsize, size_t ysize, + cl_mem block_diff_dc, cl_mem block_diff_ac) +{ + +} + +void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, + size_t xsize, size_t ysize, + cl_mem block_diff_ac) +{ + +} + +void clMaskEx(ocl_channels rgb, ocl_channels rgb2, + size_t xsize, size_t ysize, + ocl_channels mask, ocl_channels mask_dc) +{ + +} + +void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, cl_mem result) +{ + +} + +void clCalculateDiffmap(cl_mem result, size_t xsize, size_t ysize, int step) +{ + +} + void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, float* r2, float* g2, float* b2, size_t xsize, size_t ysize, @@ -399,20 +429,32 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_r = ocl.allocMem(channel_size); - cl_mem mem_g = ocl.allocMem(channel_size); - cl_mem mem_b = ocl.allocMem(channel_size); - cl_mem mem_r2 = ocl.allocMem(channel_size); - cl_mem mem_g2 = ocl.allocMem(channel_size); - cl_mem mem_b2 = ocl.allocMem(channel_size); - - clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_r2, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_g2, CL_FALSE, 0, 
channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_b2, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + ocl_channels xyb = ocl.allocMemChannels(channel_size); + ocl_channels xyb2 = ocl.allocMemChannels(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, xyb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - clMaskHighIntensityChangeEx(mem_r, mem_g, mem_b, mem_r2, mem_g2, mem_b2, xsize, ysize); + clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize); + + cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize); + cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize); + cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize); + + ocl_channels mask; + ocl_channels mask_dc; + + cl_mem mem_result; + + clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, edge_detector_map); + clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac); + clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac); + + clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc); + clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, mem_result); } diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 36fc4041..8272ac8c 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -201,6 +201,31 @@ cl_mem ocl_args_d_t::allocMem(size_t s) return mem; } +ocl_channels ocl_args_d_t::allocMemChannels(size_t s) +{ + cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; + cl_int err = 0; + + ocl_channels img; + img.r = 
clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocMemR() for buffer returned %s.\n", TranslateOpenCLError(err)); + } + img.g = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocMemG() for buffer returned %s.\n", TranslateOpenCLError(err)); + } + img.b = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocMemB() for buffer returned %s.\n", TranslateOpenCLError(err)); + } + + return img; +} + const char* TranslateOpenCLError(cl_int errorCode) { switch (errorCode) diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 04bf5b1d..0161d1a1 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -51,6 +51,13 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); #define KERNEL_COUNT 5 +struct ocl_channels +{ + cl_mem r; + cl_mem g; + cl_mem b; +}; + struct ocl_args_d_t { ocl_args_d_t(); @@ -61,6 +68,7 @@ struct ocl_args_d_t void* allocC(size_t s); cl_mem allocMem(size_t s); + ocl_channels allocMemChannels(size_t s); // Regular OpenCL objects: cl_context context; // hold the context handler From d4c9ed96b2ccb96dc8e960ddc47a475f3e55940c Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 4 May 2017 17:01:08 +0800 Subject: [PATCH 015/189] =?UTF-8?q?=E6=90=AD=E5=BB=BA=20clDiffmapOpsinDyna?= =?UTF-8?q?micsImage=20=E7=9A=84=E8=AE=A1=E7=AE=97=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 230 ++++++++++++++++++++++++++++++++++------ clguetzli/clguetzli.h | 4 +- clguetzli/ocl.cpp | 31 +++--- clguetzli/ocl.h | 16 ++- 4 files changed, 223 insertions(+), 58 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 21f5cc73..2ff242c4 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -242,14 +242,20 @@ void clBlur(size_t xsize, size_t 
ysize, float* channel, double sigma, double bor memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize); } } - -void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize, cl_mem expn, size_t expn_size, - int step, int offset, double border_ratio, cl_mem result) +//========================================================= +// ian todo +void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize, + cl_mem expn, size_t expn_size, + int step, int offset, double border_ratio, + cl_mem result/*out*/) { // Convolution } -void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep, cl_mem result) +// ian todo +void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep, + cl_mem result/*out*/) { /* for (size_t y = 0; y < ysize; y++) { @@ -262,7 +268,9 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t */ } -void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio) +void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, + double sigma, double border_ratio, + cl_mem result/*out, opt*/) { double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); @@ -293,19 +301,20 @@ void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double bor clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB); - clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, image); + clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, result ? 
result : image); } else { ocl.allocA(sizeof(cl_float) * xsize * ysize); clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); - clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, image); + clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, result ? result : image); } clReleaseMemObject(mem_expn); } -void clOpsinDynamicsImageEx(cl_mem r, cl_mem g, cl_mem b, size_t size) +// ian todo +void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t size) { /* for (size_t i = 0; i < rgb[0].size(); ++i) { @@ -333,6 +342,7 @@ void clOpsinDynamicsImageEx(cl_mem r, cl_mem g, cl_mem b, size_t size) */ } +// strong todo void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b) { static const double kSigma = 1.1; @@ -341,41 +351,41 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_r = ocl.allocMem(channel_size); - cl_mem mem_g = ocl.allocMem(channel_size); - cl_mem mem_b = ocl.allocMem(channel_size); + ocl_channels rgb = ocl.allocMemChannels(channel_size); - clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - clBlurEx(mem_r, xsize, ysize, kSigma, 0.0); - clBlurEx(mem_g, xsize, ysize, kSigma, 0.0); - clBlurEx(mem_b, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb.g, 
xsize, ysize, kSigma, 0.0); + clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0); - clOpsinDynamicsImageEx(mem_r, mem_g, mem_b, xsize * ysize); + clOpsinDynamicsImageEx(rgb, xsize * ysize); - cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); memcpy(r, result_r, channel_size); memcpy(g, result_g, channel_size); memcpy(b, result_b, channel_size); - clReleaseMemObject(mem_r); - clReleaseMemObject(mem_g); - clReleaseMemObject(mem_b); + ocl.releaseMemChannels(rgb); } -void clMaskHighIntensityChangeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize) +// ian todo +void clMaskHighIntensityChangeEx(ocl_channels rgb/*in,out*/, + ocl_channels rgb2/*in,out*/, + size_t xsize, size_t ysize) { // MaskHighIntensityChange } -void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result) +// strong todo +void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result/*out*/) { static const double kSigma[3] = { 1.5, 0.586, 0.4 }; clBlurEx(rgb.r, xsize, ysize, kSigma[0], 0.0); @@ -388,37 +398,184 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size // EdgeDetectorLowFreq } +// 
strong todo void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, - cl_mem block_diff_dc, cl_mem block_diff_ac) + cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/) { } +// strong todo void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, - cl_mem block_diff_ac) + cl_mem block_diff_ac/*out*/) +{ + static const double kSigma = 14; + static const double kMul = 10; + + clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb2.r, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb2.g, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb2.b, xsize, ysize, kSigma, 0.0); +} + +// ian todo +void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, ocl_channels mask/*out*/) +{ + +} + +// ian todo +void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/) +{ + +} + +// ian todo +void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) +{ + static const float w = 0.679144890667f; + static const float scale = 1.0f / (5.0f + 4 * w); + + cl_mem tmp0; + cl_mem tmp1; + clScaleImageEx(img, xsize * ysize, w, tmp0); + clScaleImageEx(img, xsize * ysize, 1, tmp1); + // average5x5 calc + + clScaleImageEx(img, xsize * ysize, scale, img); +} + +// +void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset) { } +static const double kInternalGoodQualityThreshold = 14.921561160295326; +static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +// ian todo void clMaskEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, - ocl_channels mask, ocl_channels mask_dc) + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/) { - + clDiffPrecomputeEx(rgb, rgb2, xsize, ysize, mask); + for (int i = 0; i < 3; i++) + { + clAverage5x5Ex(mask.ch[i], xsize, ysize); + clMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0); + 
+ static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + + clBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); + } +/* + static const double w00 = 232.206464018; + static const double w11 = 22.9455222245; + static const double w22 = 503.962310606; + + mask_dc->resize(3); + for (int i = 0; i < 3; ++i) { + (*mask_dc)[i].resize(xsize * ysize); + } + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + const size_t idx = y * xsize + x; + const double s0 = (*mask)[0][idx]; + const double s1 = (*mask)[1][idx]; + const double s2 = (*mask)[2][idx]; + const double p0 = w00 * s0; + const double p1 = w11 * s1; + const double p2 = w22 * s2; + + (*mask)[0][idx] = static_cast(MaskX(p0)); + (*mask)[1][idx] = static_cast(MaskY(p1)); + (*mask)[2][idx] = static_cast(MaskB(p2)); + (*mask_dc)[0][idx] = static_cast(MaskDcX(p0)); + (*mask_dc)[1][idx] = static_cast(MaskDcY(p1)); + (*mask_dc)[2][idx] = static_cast(MaskDcB(p2)); + } + } +*/ + for (int i = 0; i < 3; i++) + { + clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask.ch[i]); + clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask_dc.ch[i]); + } } -void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, cl_mem result) +// ian todo +void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, size_t step, cl_mem result/*out*/) { } -void clCalculateDiffmap(cl_mem result, size_t xsize, size_t ysize, int step) +// strong todo +void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, int step) { +/* + int s2 = (8 - step) / 2; + { + // Upsample and take square root. 
+ std::vector diffmap_out(xsize * ysize); + const size_t res_xsize = (xsize + step - 1) / step; + for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) { + for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) { + size_t res_ix = (res_y * res_xsize + res_x) / step; + float orig_val = (*diffmap)[res_ix]; + constexpr float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? kInitialSlope * orig_val + : std::sqrt(orig_val); + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(res_y + off_y + s2) * xsize + + res_x + off_x + s2] = val; + } + } + } + } + *diffmap = diffmap_out; + } +*/ + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + const int s = 8 - step; + const int s2 = (8 - step) / 2; + cl_mem blurred; +/* + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2]; + } + } +*/ + static const double border_ratio = 0.03027655136; + clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); +/* + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + (*diffmap)[(y + s2) * xsize + x + s2] + += static_cast(mul1) * blurred[y * (xsize - s) + x]; + } + } +*/ + clScaleImageEx(result, xsize * ysize, scale, result); } +// strong todo void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, float* r2, float* g2, float* b2, size_t xsize, size_t ysize, @@ -440,8 +597,6 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize); - cl_mem edge_detector_map = 
ocl.allocMem(3 * xsize * ysize); cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize); cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize); @@ -451,10 +606,15 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, cl_mem mem_result; + clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize); + clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, edge_detector_map); clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac); clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac); + int step = 4; clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc); - clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, mem_result); + clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, step, mem_result); + + clCalculateDiffmapEx(mem_result, xsize, ysize, step); } diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index fa489667..edaa0688 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -2,9 +2,7 @@ #include "CL\cl.h" extern bool g_useOpenCL; -void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float& b); - -void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio); +void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr); void clMinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 8272ac8c..26c68f5c 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -207,25 +207,26 @@ ocl_channels ocl_args_d_t::allocMemChannels(size_t s) cl_int err = 0; ocl_channels img; - img.r = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); - if (CL_SUCCESS != err) - { - LogError("Error: allocMemR() for buffer returned %s.\n", TranslateOpenCLError(err)); - } - img.g = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); - if (CL_SUCCESS != err) - { - LogError("Error: 
allocMemG() for buffer returned %s.\n", TranslateOpenCLError(err)); - } - img.b = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); - if (CL_SUCCESS != err) - { - LogError("Error: allocMemB() for buffer returned %s.\n", TranslateOpenCLError(err)); - } + for (int i = 0; i < 3; i++) + { + img.ch[i] = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); + if (CL_SUCCESS != err) + { + LogError("Error: allocMemChannel(%d) for buffer returned %s.\n", i, TranslateOpenCLError(err)); + } + } return img; } +void ocl_args_d_t::releaseMemChannels(ocl_channels rgb) +{ + for (int i = 0; i < 3; i++) + { + clReleaseMemObject(rgb.ch[i]); + } +} + const char* TranslateOpenCLError(cl_int errorCode) { switch (errorCode) diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 0161d1a1..5eb19560 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -51,12 +51,17 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); #define KERNEL_COUNT 5 -struct ocl_channels +typedef union ocl_channels_t { - cl_mem r; - cl_mem g; - cl_mem b; -}; + struct + { + cl_mem r; + cl_mem g; + cl_mem b; + }; + + cl_mem ch[3]; +}ocl_channels; struct ocl_args_d_t { @@ -69,6 +74,7 @@ struct ocl_args_d_t cl_mem allocMem(size_t s); ocl_channels allocMemChannels(size_t s); + void releaseMemChannels(ocl_channels rgb); // Regular OpenCL objects: cl_context context; // hold the context handler From dba4c851828cdb34cba2fe9240b6778559636d67 Mon Sep 17 00:00:00 2001 From: ianuming Date: Thu, 4 May 2017 19:50:38 +0800 Subject: [PATCH 016/189] Convert OpsinDynamicsImage to opencl --- clguetzli/clguetzli.cl | 101 ++++++++++++++++++ clguetzli/clguetzli.cpp | 59 +++++----- clguetzli/ocl.cpp | 3 +- clguetzli/ocl.h | 16 +-- .../butteraugli/butteraugli/butteraugli.cc | 78 ++++++++++++++ 5 files changed, 221 insertions(+), 36 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index af70a10b..68f7eff0 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ 
-224,4 +224,105 @@ __kernel void DownSample(__global float* pA, __global float* pC, int square) const int sample_y = y / square; pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; +} + +void OpsinAbsorbance(const double in[3], double out[3]) +{ + const float mix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, + }; + + out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3]; + out[1] = mix[4] * in[0] + mix[5] * in[1] + mix[6] * in[2] + mix[7]; + out[2] = mix[8] * in[0] + mix[9] * in[1] + mix[10] * in[2] + mix[11]; +} + +double EvaluatePolynomial(const double x, const double *coefficients, int n) +{ + double b1 = 0.0; + double b2 = 0.0; + + for (int i = n - 1; i >= 0; i--) + { + if (i == 0) { + const double x_b1 = x * b1; + b1 = x_b1 - b2 + coefficients[0]; + break; + } + const double x_b1 = x * b1; + const double t = (x_b1 + x_b1) - b2 + coefficients[i]; + b2 = b1; + b1 = t; + } + + return b1; +} + +float Gamma(double v) +{ + double min_value = 0.770000000000000; + double max_value = 274.579999999999984; + + static const double p[5 + 1] = { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, + }; + static const double q[5 + 1] = { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, + }; + + const double x01 = (v - min_value) / (max_value - min_value); + const double xc = 2.0 * x01 - 1.0; + + const double yp = EvaluatePolynomial(xc, p, 6); + const double yq = EvaluatePolynomial(xc, q, 6); + if (yq == 0.0) return 0.0; + return (float)(yp / yq); +} + +void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) +{ + static const double a0 = 1.01611726948; + static const double a1 = 0.982482243696; + static const 
double a2 = 1.43571362627; + static const double a3 = 0.896039849412; + *valx = a0 * r - a1 * g; + *valy = a2 * r + a3 * g; + *valz = b; +} + +__kernel void OpsinDynamicsImage(__global float *r, __global float *g, __global float *b, __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, int size) +{ + const int i = get_global_id(0); + double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre, pre_mixed); + double sensitivity[3]; + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + + double cur_rgb[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + r[i] = x; + g[i] = y; + b[i] = z; } \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 2ff242c4..6b7b2036 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void) char* source = nullptr; size_t src_size = 0; - ReadSourceFromFile("clguetzli.cl", &source, &src_size); + ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size); ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); @@ -46,6 +46,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err); ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err); ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err); + ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err); return ocl; } @@ -314,32 +315,30 @@ void clBlurEx(cl_mem image/*out, opt*/, 
size_t xsize, size_t ysize, } // ian todo -void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t size) +void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred, size_t size) { -/* - for (size_t i = 0; i < rgb[0].size(); ++i) { - double sensitivity[3]; - { - // Calculate sensitivity[3] based on the smoothed image gamma derivative. - double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] }; - double pre_mixed[3]; - OpsinAbsorbance(pre_rgb, pre_mixed); - sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; - sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; - sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; - } - double cur_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] }; - double cur_mixed[3]; - OpsinAbsorbance(cur_rgb, cur_mixed); - cur_mixed[0] *= sensitivity[0]; - cur_mixed[1] *= sensitivity[1]; - cur_mixed[2] *= sensitivity[2]; - double x, y, z; - RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); - rgb[0][i] = static_cast(x); - rgb[1][i] = static_cast(y); - rgb[2][i] = static_cast(z); -*/ + ocl_args_d_t &ocl = getOcl(); + cl_int clSize = size; + cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&rgb.b); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&rgb_blurred.r); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&rgb_blurred.g); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b); + clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize); + + size_t globalWorkSize[1] = { clSize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: 
clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } } // strong todo @@ -352,21 +351,26 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* cl_int err = 0; ocl_args_d_t &ocl = getOcl(); ocl_channels rgb = ocl.allocMemChannels(channel_size); + ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size); clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, rgb.r, rgb_blurred.r, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, rgb.g, rgb_blurred.g, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, rgb.b, rgb_blurred.b, 0, 0, channel_size, 0, NULL, NULL); err = clFinish(ocl.commandQueue); clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0); clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0); clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0); - clOpsinDynamicsImageEx(rgb, xsize * ysize); + clOpsinDynamicsImageEx(rgb, rgb_blurred, xsize * ysize); cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); memcpy(r, result_r, channel_size); @@ -374,6 +378,7 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* memcpy(b, result_b, channel_size); ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb_blurred); } // ian todo diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 26c68f5c..5387a454 100644 --- 
a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -526,8 +526,7 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) // Query for all available OpenCL platforms on the system // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string - deviceType = CL_DEVICE_TYPE_GPU; - cl_platform_id platformId = FindOpenCLPlatform("", deviceType); + cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType); if (NULL == platformId) { deviceType = CL_DEVICE_TYPE_CPU; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 5eb19560..e5cf3d7c 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -43,13 +43,15 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); * and make passing all these arguments in functions easier. */ -#define KERNEL_MINSQUAREVAL 0 -#define KERNEL_CONVOLUTION 1 -#define KERNEL_CONVOLUTIONX 2 -#define KERNEL_CONVOLUTIONY 3 -#define KERNEL_DOWNSAMPLE 4 - -#define KERNEL_COUNT 5 +enum KernelName { + KERNEL_MINSQUAREVAL, + KERNEL_CONVOLUTION, + KERNEL_CONVOLUTIONX, + KERNEL_CONVOLUTIONY, + KERNEL_DOWNSAMPLE, + KERNEL_OPSINDYNAMICSIMAGE, + KERNEL_COUNT, +}; typedef union ocl_channels_t { diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index fb895b34..98451cca 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -928,6 +928,24 @@ inline void ClenshawRecursion<0>(const double x, const double *coefficients, *b1 = x_b1 - (*b2) + coefficients[0]; } +void ClenshawRecursion_fun(const double x, const double *coefficients, + double *b1, double *b2, int n) +{ + if (n == 0) { + const double x_b1 = x * (*b1); + // The final iteration differs - no 2 * x_b1 here. 
+ *b1 = x_b1 - (*b2) + coefficients[0]; + return; + } + + const double x_b1 = x * (*b1); + const double t = (x_b1 + x_b1) - (*b2) + coefficients[n]; + *b2 = *b1; + *b1 = t; + + ClenshawRecursion_fun(x, coefficients, b1, b2, n - 1); +} + // Rational polynomial := dividing two polynomial evaluations. These are easier // to find than minimax polynomials. struct RationalPolynomial { @@ -936,10 +954,34 @@ struct RationalPolynomial { const double (&coefficients)[N]) { double b1 = 0.0; double b2 = 0.0; + ClenshawRecursion(x, coefficients, &b1, &b2); + return b1; } +#ifdef ENABLE_OPENCL_CHECK + static double EvaluatePolynomialNonRecursion(const double x, const double *coefficients, int n) { + double b1 = 0.0; + double b2 = 0.0; + + for (int i = n - 1; i >= 0; i--) + { + if (i == 0) { + const double x_b1 = x * b1; + b1 = x_b1 - b2 + coefficients[0]; + break; + } + const double x_b1 = x * b1; + const double t = (x_b1 + x_b1) - b2 + coefficients[i]; + b2 = b1; + b1 = t; + } + + return b1; + } +#endif // ENABLE_OPENCL_CHECK + // Evaluates the polynomial at x (in [min_value, max_value]). inline double operator()(const float x) const { // First normalize to [0, 1]. @@ -978,6 +1020,32 @@ static inline float GammaPolynomial(float value) { return static_cast(r(value)); } +#ifdef ENABLE_OPENCL_CHECK +static double GammaNonRecursion(double v) { + double min_value = 0.770000000000000; + double max_value = 274.579999999999984; + + double p[5 + 1] = { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, + }; + double q[5 + 1] = { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, + }; + + // First normalize to [0, 1]. + const double x01 = (v - min_value) / (max_value - min_value); + // And then to [-1, 1] domain of Chebyshev polynomials. 
+ const double xc = 2.0 * x01 - 1.0; + + const double yp = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, p, 6); + const double yq = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, q, 6); + if (yq == 0.0) return 0.0; + return static_cast(yp / yq); +} +#endif // ENABLE_OPENCL_CHECK + static inline double Gamma(double v) { // return SimpleGamma(v); return GammaPolynomial(static_cast(v)); @@ -1001,6 +1069,16 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + +#ifdef ENABLE_OPENCL_CHECK + double sensitivity_new[3]; + sensitivity_new[0] = GammaNonRecursion(pre_mixed[0]) / pre_mixed[0]; + assert(fabs(sensitivity[0] - sensitivity_new[0]) < 0.01); + sensitivity_new[1] = GammaNonRecursion(pre_mixed[1]) / pre_mixed[1]; + assert(fabs(sensitivity[1] - sensitivity_new[1]) < 0.01); + sensitivity_new[2] = GammaNonRecursion(pre_mixed[2]) / pre_mixed[2]; + assert(fabs(sensitivity[2] - sensitivity_new[2]) < 0.01); +#endif // ENABLE_OPENCL_CHECK } double cur_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] }; double cur_mixed[3]; From ac4254e6f1692adedff283051b164b355c6f735a Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 4 May 2017 20:25:12 +0800 Subject: [PATCH 017/189] fix opencl compile error --- clguetzli/clguetzli.cl | 25 ++++++++----------------- clguetzli/clguetzli.cpp | 11 ++++------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 68f7eff0..bcf367b7 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1,11 +1,3 @@ -float minfun(float a, float b) -{ - if (a < b) - return a; - else - return b; -} - __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset) { const int x = get_global_id(0); @@ -25,9 +17,8 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int 
square_si { for (int i = minW; i < maxW; i++) { - minValue = minfun(minValue, pA[j * width + i]); -// float tmp = pA[j * width + i]; -// if (tmp < minValue) minValue = tmp; + float tmp = pA[j * width + i]; + if (tmp < minValue) minValue = tmp; } } @@ -274,11 +265,11 @@ float Gamma(double v) double min_value = 0.770000000000000; double max_value = 274.579999999999984; - static const double p[5 + 1] = { + /*static*/ const double p[5 + 1] = { 881.979476556478289, 1496.058452015812463, 908.662212739659481, 373.566100223287378, 85.840860336314364, 6.683258861509244, }; - static const double q[5 + 1] = { + /*static*/ const double q[5 + 1] = { 12.262350348616792, 20.557285797683576, 12.161463238367844, 4.711532733641639, 0.899112889751053, 0.035662329617191, }; @@ -294,10 +285,10 @@ float Gamma(double v) void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) { - static const double a0 = 1.01611726948; - static const double a1 = 0.982482243696; - static const double a2 = 1.43571362627; - static const double a3 = 0.896039849412; + /*static*/ const double a0 = 1.01611726948; + /*static*/ const double a1 = 0.982482243696; + /*static*/ const double a2 = 1.43571362627; + /*static*/ const double a3 = 0.896039849412; *valx = a0 * r - a1 * g; *valy = a2 * r + a3 * g; *valz = b; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 6b7b2036..f3c9afce 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -356,14 +356,11 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueCopyBuffer(ocl.commandQueue, rgb.r, rgb_blurred.r, 0, 0, channel_size, 0, NULL, NULL); - clEnqueueCopyBuffer(ocl.commandQueue, rgb.g, 
rgb_blurred.g, 0, 0, channel_size, 0, NULL, NULL); - clEnqueueCopyBuffer(ocl.commandQueue, rgb.b, rgb_blurred.b, 0, 0, channel_size, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0); - clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0); - clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0); + clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); clOpsinDynamicsImageEx(rgb, rgb_blurred, xsize * ysize); @@ -454,7 +451,7 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) clScaleImageEx(img, xsize * ysize, scale, img); } -// +// ian todo void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset) { From 0afb0a361d06d1d54dedf5e8abe0668ff1e4b8f6 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Thu, 4 May 2017 21:05:17 +0800 Subject: [PATCH 018/189] open cl compiler error fix --- clguetzli/clguetzli.cl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index bcf367b7..bf3b22ef 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -260,16 +260,16 @@ double EvaluatePolynomial(const double x, const double *coefficients, int n) return b1; } -float Gamma(double v) +double Gamma(double v) { double min_value = 0.770000000000000; double max_value = 274.579999999999984; - /*static*/ const double p[5 + 1] = { + const double p[5 + 1] = { 881.979476556478289, 1496.058452015812463, 908.662212739659481, 373.566100223287378, 85.840860336314364, 6.683258861509244, }; - /*static*/ const double q[5 + 1] = { + const double q[5 + 1] = { 12.262350348616792, 20.557285797683576, 12.161463238367844, 4.711532733641639, 0.899112889751053, 0.035662329617191, }; @@ -285,10 +285,10 @@ float Gamma(double v) void RgbToXyb(double r, double g, double b, double *valx, double 
*valy, double *valz) { - /*static*/ const double a0 = 1.01611726948; - /*static*/ const double a1 = 0.982482243696; - /*static*/ const double a2 = 1.43571362627; - /*static*/ const double a3 = 0.896039849412; + const double a0 = 1.01611726948; + const double a1 = 0.982482243696; + const double a2 = 1.43571362627; + const double a3 = 0.896039849412; *valx = a0 * r - a1 * g; *valy = a2 * r + a3 * g; *valz = b; From 2cbc518b60a38830be6770a8ebc9f7a04a93f37c Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Thu, 4 May 2017 22:48:50 +0800 Subject: [PATCH 019/189] Implement clConvolutionEx --- clguetzli/clguetzli.cpp | 49 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index f3c9afce..bf92f5a4 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -243,14 +243,51 @@ void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double bor memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize); } } -//========================================================= -// ian todo -void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize, - cl_mem expn, size_t expn_size, - int step, int offset, double border_ratio, + +void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, + cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio, cl_mem result/*out*/) { - // Convolution + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + size_t oxsize = xsize / xstep; + + ocl.allocA(sizeof(cl_float) * len); + ocl.allocB(sizeof(cl_float) * xsize * ysize); + ocl.allocC(sizeof(cl_float) * oxsize * ysize); + + memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len); + memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize); + + cl_int clxsize = xsize; + cl_int clxstep = xstep; + cl_int cllen = len; + cl_int cloffset = offset; + cl_float clborder_ratio = border_ratio; + + cl_kernel kernel = 
ocl.kernel[KERNEL_CONVOLUTION]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); + clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); + clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); + + size_t globalWorkSize[2] = { oxsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + } } // ian todo From fad11fc21f50a4bcfbab72e08e3a9ff79b5cada7 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Thu, 4 May 2017 22:50:55 +0800 Subject: [PATCH 020/189] Remove useless code --- clguetzli/clguetzli.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index bf92f5a4..92a8558a 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -254,13 +254,6 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, size_t oxsize = xsize / xstep; - ocl.allocA(sizeof(cl_float) * len); - ocl.allocB(sizeof(cl_float) * xsize * ysize); - ocl.allocC(sizeof(cl_float) * oxsize * ysize); - - memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len); - memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize); - cl_int clxsize = xsize; cl_int clxstep = xstep; cl_int cllen = len; From b5013933a421e5bf520783aee1cf445ffeb33309 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Thu, 4 May 2017 
23:16:41 +0800 Subject: [PATCH 021/189] Implement clUpsampleEx --- clguetzli/clguetzli.cl | 8 ++++---- clguetzli/clguetzli.cpp | 39 ++++++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index bf3b22ef..1f1ff8e2 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -201,7 +201,7 @@ __kernel void ConvolutionY(__global float* multipliers, __global float* inp, __g result[y * xsize + x] = sum * scale; } -__kernel void DownSample(__global float* pA, __global float* pC, int square) +__kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -209,10 +209,10 @@ __kernel void DownSample(__global float* pA, __global float* pC, int square) const int xsize = get_global_size(0); const int ysize = get_global_size(1); - const int oxsize = xsize / square; + const int oxsize = xsize / xstep; - const int sample_x = x / square; - const int sample_y = y / square; + const int sample_x = x / xstep; + const int sample_y = y / ystep; pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 92a8558a..9446886f 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -232,6 +232,7 @@ void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double bor clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clstep); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clstep); globalWorkSize[0] = ysize; globalWorkSize[1] = xsize; @@ -274,29 +275,41 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { - LogError("Error: clBuildProgram() for source 
program returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); } err = clFinish(ocl.commandQueue); if (CL_SUCCESS != err) { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } } -// ian todo void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep, cl_mem result/*out*/) { -/* - for (size_t y = 0; y < ysize; y++) { - for (size_t x = 0; x < xsize; x++) { - // TODO: Use correct rounding. - channel[y * xsize + x] = - downsampled_output[(y / ystep) * dxsize + (x / xstep)]; - } + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int clxstep = xstep; + cl_int clystep = ystep; + cl_kernel kernel = ocl.kernel[KERNEL_DOWNSAMPLE]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep); + + size_t globalWorkSize[2] = { ysize, xsize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clUpsampleEx clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clUpsampleEx clFinish returned %s.\n", TranslateOpenCLError(err)); } -*/ } void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, @@ -362,12 +375,12 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { - LogError("Error: clBuildProgram() for source program returned %s.\n", 
TranslateOpenCLError(err)); + LogError("Error: clOpsinDynamicsImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); } err = clFinish(ocl.commandQueue); if (CL_SUCCESS != err) { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clOpsinDynamicsImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } } From 8909cdacb2fd6ab6690fe80ad734558f0fc4f68c Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Thu, 4 May 2017 23:52:23 +0800 Subject: [PATCH 022/189] Implement clMinSquareValEx --- clguetzli/clguetzli.cpp | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 9446886f..a0fd943b 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -357,7 +357,6 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, clReleaseMemObject(mem_expn); } -// ian todo void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred, size_t size) { ocl_args_d_t &ocl = getOcl(); @@ -494,10 +493,38 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) clScaleImageEx(img, xsize * ysize, scale, img); } -// ian todo void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset) { + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + cl_int cloffset = offset; + cl_int clsquare_size = square_size; + ocl.allocA(sizeof(cl_float) * xsize * ysize); + + cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, 
globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clMinSquareValEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + + err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.srcA, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clMinSquareValEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clMinSquareValEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } } static const double kInternalGoodQualityThreshold = 14.921561160295326; From 5ea138c058b6008ef209da8ed7de374d825f55af Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Fri, 5 May 2017 00:48:29 +0800 Subject: [PATCH 023/189] Implement clMaskEx --- clguetzli/clguetzli.cl | 125 +++++++++++++++++++++++++++++++++++++++- clguetzli/clguetzli.cpp | 66 +++++++++++---------- clguetzli/ocl.h | 1 + 3 files changed, 161 insertions(+), 31 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 1f1ff8e2..73e06b62 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -316,4 +316,127 @@ __kernel void OpsinDynamicsImage(__global float *r, __global float *g, __global r[i] = x; g[i] = y; b[i] = z; -} \ No newline at end of file +} + + +double InterpolateClampNegative(const double *array, + int size, double sx) { + if (sx < 0) { + sx = 0; + } + double ix = fabs(sx); + int baseix = (int)(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + return res; +} + +void MakeMask(double extmul, double extoff, + double mul, double offset, + double scaler, double *result) +{ + for (size_t i = 0; i < 512; ++i) { + const double c = mul / ((0.01 * scaler * i) + offset); + result[i] = 1.0 + extmul * (c + 
extoff); + result[i] *= result[i]; + } +} + +double MaskX(double delta) { + const double extmul = 0.975741017749; + const double extoff = -4.25328244168; + const double offset = 0.454909521427; + const double scaler = 0.0738288224836; + const double mul = 20.8029176447; + double lut[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut); + return InterpolateClampNegative(lut, 512, delta); +} + +double MaskY(double delta) { + const double extmul = 0.373995618954; + const double extoff = 1.5307267433; + const double offset = 0.911952641929; + const double scaler = 1.1731667845; + const double mul = 16.2447033988; + double lut[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut); + return InterpolateClampNegative(lut, 512, delta); +} + +double MaskB(double delta) { + const double extmul = 0.61582234137; + const double extoff = -4.25376118646; + const double offset = 1.05105070921; + const double scaler = 0.47434643535; + const double mul = 31.1444967089; + double lut[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut); + return InterpolateClampNegative(lut, 512, delta); +} + +double MaskDcX(double delta) { + const double extmul = 1.79116943438; + const double extoff = -3.86797479189; + const double offset = 0.670960225853; + const double scaler = 0.486575865525; + const double mul = 20.4563479139; + double lut[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut); + return InterpolateClampNegative(lut, 512, delta); +} + +double MaskDcY(double delta) { + const double extmul = 0.212223514236; + const double extoff = -3.65647120524; + const double offset = 1.73396799447; + const double scaler = 0.170392660501; + const double mul = 21.6566724788; + double lut[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut); + return InterpolateClampNegative(lut, 512, delta); +} + +double MaskDcB(double delta) { + const double extmul = 0.349376011816; + const double extoff = -0.894711072781; + const double offset = 0.901647926679; + const double scaler = 
0.380086095024; + const double mul = 18.0373825149; + double lut[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut); + return InterpolateClampNegative(lut, 512, delta); +} + +__kernel void DoMask(__global float *mask_x, __global float *mask_y, __global float *mask_b, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, int xsize, int ysize) +{ + const double w00 = 232.206464018; + const double w11 = 22.9455222245; + const double w22 = 503.962310606; + + const int x = get_global_id(0); + const int y = get_global_id(1); + + const size_t idx = y * xsize + x; + const double s0 = mask_x[idx]; + const double s1 = mask_y[idx]; + const double s2 = mask_b[idx]; + const double p0 = w00 * s0; + const double p1 = w11 * s1; + const double p2 = w22 * s2; + + mask_x[idx] = (float)(MaskX(p0)); + mask_y[idx] = (float)(MaskY(p1)); + mask_b[idx] = (float)(MaskB(p2)); + mask_dc_x[idx] = (float)(MaskDcX(p0)); + mask_dc_y[idx] = (float)(MaskDcY(p1)); + mask_dc_b[idx] = (float)(MaskDcB(p2)); + +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index a0fd943b..7bd75e8e 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -47,6 +47,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err); ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err); ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err); + ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err); return ocl; } @@ -530,7 +531,37 @@ void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t s static const double kInternalGoodQualityThreshold = 14.921561160295326; static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; -// ian todo +void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = 
getOcl(); + + cl_int clxsize = xsize; + cl_int clysize = ysize; + + cl_kernel kernel = ocl.kernel[KERNEL_DOMASK]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b); + clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clysize); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clDoMask() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clDoMask() clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + void clMaskEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/) @@ -549,34 +580,9 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2, clBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); } -/* - static const double w00 = 232.206464018; - static const double w11 = 22.9455222245; - static const double w22 = 503.962310606; - mask_dc->resize(3); - for (int i = 0; i < 3; ++i) { - (*mask_dc)[i].resize(xsize * ysize); - } - for (size_t y = 0; y < ysize; ++y) { - for (size_t x = 0; x < xsize; ++x) { - const size_t idx = y * xsize + x; - const double s0 = (*mask)[0][idx]; - const double s1 = (*mask)[1][idx]; - const double s2 = (*mask)[2][idx]; - const double p0 = w00 * s0; - const double p1 = w11 * s1; - const double p2 = w22 * s2; - - (*mask)[0][idx] = static_cast(MaskX(p0)); - (*mask)[1][idx] = static_cast(MaskY(p1)); - (*mask)[2][idx] = static_cast(MaskB(p2)); - 
(*mask_dc)[0][idx] = static_cast(MaskDcX(p0)); - (*mask_dc)[1][idx] = static_cast(MaskDcY(p1)); - (*mask_dc)[2][idx] = static_cast(MaskDcB(p2)); - } - } -*/ + clDoMask(mask, mask_dc, xsize, ysize); + for (int i = 0; i < 3; i++) { clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask.ch[i]); @@ -673,8 +679,8 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize); cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize); - ocl_channels mask; - ocl_channels mask_dc; + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); cl_mem mem_result; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index e5cf3d7c..6cb0a916 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -50,6 +50,7 @@ enum KernelName { KERNEL_CONVOLUTIONY, KERNEL_DOWNSAMPLE, KERNEL_OPSINDYNAMICSIMAGE, + KERNEL_DOMASK, KERNEL_COUNT, }; From 437fa09e6b58fafaf244b94f34742cadc41d0e9b Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Fri, 5 May 2017 01:03:47 +0800 Subject: [PATCH 024/189] Implement clScaleImageEx --- clguetzli/clguetzli.cl | 6 ++++++ clguetzli/clguetzli.cpp | 29 ++++++++++++++++++++++++++++- clguetzli/ocl.h | 1 + 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 73e06b62..4c70b380 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -440,3 +440,9 @@ __kernel void DoMask(__global float *mask_x, __global float *mask_y, __global fl mask_dc_b[idx] = (float)(MaskDcB(p2)); } + +__kernel void ScaleImage(double scale, __global float *result) +{ + const int i = get_global_id(0); + result[i] *= (float)(scale); +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 7bd75e8e..67fc7503 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -48,6 +48,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_DOWNSAMPLE] = 
clCreateKernel(ocl.program, "DownSample", &err); ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err); ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err); + ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err); return ocl; } @@ -473,10 +474,36 @@ void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_ } -// ian todo void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/) { + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int clsize = size; + cl_float clscale = w; + + + err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, clsize, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clScaleImageEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); + } + cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; + clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&clscale); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result); + + size_t globalWorkSize[1] = { clsize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } } // ian todo diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 6cb0a916..53fac1c8 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -51,6 +51,7 @@ enum KernelName { KERNEL_DOWNSAMPLE, KERNEL_OPSINDYNAMICSIMAGE, KERNEL_DOMASK, + KERNEL_SCALEIMAGE, KERNEL_COUNT, }; From a31adf1d993beb5c3dce93eecd7b7766c77c1876 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 5 May 2017 01:40:35 +0800 Subject: [PATCH 025/189] =?UTF-8?q?=E9=AA=8C=E8=AF=81clOpinDynamicImage?= =?UTF-8?q?=E7=9A=84=E6=95=88=E6=9E=9C?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 2 +- clguetzli/clguetzli.h | 2 ++ .../butteraugli/butteraugli/butteraugli.cc | 22 ++++++++++++++++--- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 67fc7503..24802e59 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -363,7 +363,7 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred { ocl_args_d_t &ocl = getOcl(); cl_int clSize = size; - cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&rgb.b); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index edaa0688..ca8b0b32 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -17,3 +17,5 @@ void clConvolution(size_t xsize, size_t ysize, float* result); void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); + +void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); \ No newline at end of file diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 98451cca..d0c59129 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -123,6 +123,7 @@ static void Convolution(size_t xsize, size_t ysize, void Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { +/* #if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) if (g_useOpenCL && xsize > 100 && ysize > 100) { @@ -134,7 +135,7 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, std::vector tmpChannel(xsize * ysize); memcpy(tmpChannel.data(), channel, xsize * 
ysize * sizeof(float)); #endif - +*/ PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); @@ -171,6 +172,7 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, } } + /* #ifdef ENABLE_OPENCL_CHECK // for verify { @@ -187,6 +189,7 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, } } #endif // ENABLE_OPENCL_CHECK +*/ } // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. @@ -1053,6 +1056,17 @@ static inline double Gamma(double v) { void OpsinDynamicsImage(size_t xsize, size_t ysize, std::vector > &rgb) { + + if (g_useOpenCL && xsize > 100 && ysize > 100) + { + float * r = rgb[0].data(); + float * g = rgb[1].data(); + float * b = rgb[2].data(); + + clOpsinDynamicsImage(xsize, ysize, r, g, b); + return; + } + PROFILER_FUNC; std::vector > blurred = rgb; static const double kSigma = 1.1; @@ -1448,6 +1462,7 @@ double MaskDcB(double delta) { void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { +/* #if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) if (g_useOpenCL) { @@ -1461,7 +1476,7 @@ void MinSquareVal(size_t square_size, size_t offset, #ifdef ENABLE_OPENCL_CHECK std::vector backup(values, values + xsize * ysize); #endif - +*/ // offset is not negative and smaller than square_size. 
assert(offset < square_size); std::vector tmp(xsize * ysize); @@ -1502,7 +1517,7 @@ void MinSquareVal(size_t square_size, size_t offset, *pValuePoint = min; pValuePoint += xsize; } } - +/* #ifdef ENABLE_OPENCL_CHECK clMinSquareVal(square_size, offset, xsize, ysize, backup.data()); for (int i = 0; i < xsize * ysize; i++) @@ -1513,6 +1528,7 @@ void MinSquareVal(size_t square_size, size_t offset, } } #endif +*/ } // ===== Functions used by Mask only ===== From c30b44db45a5c3c005ea6822dc49c4c5d03cef3e Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 5 May 2017 09:23:45 +0800 Subject: [PATCH 026/189] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E5=8F=8C=E7=B2=BE?= =?UTF-8?q?=E5=BA=A6=E8=BF=90=E7=AE=97=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 12 ++++++++++-- clguetzli/clguetzli.cpp | 13 ++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 4c70b380..9deee460 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1,3 +1,11 @@ +//#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +//#elif defined(cl_amd_fp64) +//#pragma OPENCL EXTENSION cl_amd_fp64 : enable +//#else +//#error "Double precision floating point not supported by OpenCL implementation." 
+//#endif + __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset) { const int x = get_global_id(0); @@ -211,8 +219,8 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int const int oxsize = xsize / xstep; - const int sample_x = x / xstep; - const int sample_y = y / ystep; + const int sample_x = x / xstep * xstep; + const int sample_y = y / ystep * ystep; pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 24802e59..2ca1ceb5 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void) char* source = nullptr; size_t src_size = 0; - ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size); + ReadSourceFromFile("clguetzli.cl", &source, &src_size); ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); @@ -32,6 +32,17 @@ ocl_args_d_t& getOcl(void) if (CL_SUCCESS != err) { LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); + + if (err == CL_BUILD_PROGRAM_FAILURE) + { + size_t log_size = 0; + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + + std::vector build_log(log_size); + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); + + LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); + } } ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err); if (CL_SUCCESS != err) From 2e4cf390dc32269ba810877750671d8f4b4314fe Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 5 May 2017 12:00:33 +0800 Subject: [PATCH 027/189] Print More DeviceInfo --- clguetzli/ocl.cpp | 28 ++++-- .../butteraugli/butteraugli/butteraugli.cc | 94 +------------------ 2 files changed, 22 insertions(+), 100 deletions(-) diff --git a/clguetzli/ocl.cpp 
b/clguetzli/ocl.cpp index 5387a454..05f5470f 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -382,12 +382,18 @@ cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type bool match = true; cl_uint numDevices = 0; - // If the preferredPlatform is not NULL then check if platforms[i] is the required one - // Otherwise, continue the check with platforms[i] + size_t nameLen = 0; + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &nameLen); + + std::vector platformName(nameLen + 1); + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, nameLen, &platformName[0], NULL); + platformName[nameLen] = 0; + + LogError("DeviceName: %s\n", platformName.data()); + if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0)) { - // In case we're looking for a specific platform - match = CheckPreferredPlatformMatch(platforms[i], preferredPlatform); + match = (strstr(&platformName[0], preferredPlatform) != 0); } // match is true if the platform's name is the required one or don't care (NULL) @@ -400,12 +406,20 @@ cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices); if (CL_SUCCESS != err) { - LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(err)); + if (CL_DEVICE_TYPE_GPU == deviceType) + { + LogError("%s try GPU returned %s.\n", platformName.data(), TranslateOpenCLError(err)); + } + if (CL_DEVICE_TYPE_CPU == deviceType) + { + LogError("%s try CPU returned %s.\n", platformName.data(), TranslateOpenCLError(err)); + } } if (0 != numDevices) { // There is at list one device that answer the requirements + LogError("SelectDevice: %s GPU=%d\n", platformName.data(), deviceType == CL_DEVICE_TYPE_GPU ? 
1 : 0); return platforms[i]; } } @@ -526,11 +540,11 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) // Query for all available OpenCL platforms on the system // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string - cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType); + cl_platform_id platformId = FindOpenCLPlatform(nullptr, deviceType); if (NULL == platformId) { deviceType = CL_DEVICE_TYPE_CPU; - platformId = FindOpenCLPlatform("", deviceType); + platformId = FindOpenCLPlatform(nullptr, deviceType); } if (NULL == platformId) diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index d0c59129..0753d713 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -68,16 +68,6 @@ static void Convolution(size_t xsize, size_t ysize, const float* __restrict__ inp, float border_ratio, float* __restrict__ result) { -/* -#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) - if (g_useOpenCL && xsize > 100 && ysize > 100) - { - clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); - return; - } -#endif // ENABLE_OPENCL -*/ - PROFILER_FUNC; float weight_no_border = 0; @@ -102,40 +92,11 @@ static void Convolution(size_t xsize, size_t ysize, result[ox * ysize + y] = static_cast(sum * scale); } } - - /* -#ifdef ENABLE_OPENCL_CHECK - // for verify - std::vector tmp(xsize / xstep * ysize); - clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, &tmp[0]); - - for (int i = 0; i < xsize / xstep * ysize; i++) - { - if (fabs(result[i] - tmp[i]) > 0.0001) - { - assert(false); - } - } -#endif // ENABLE_OPENCL_CHECK -*/ } void Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { -/* -#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) - if (g_useOpenCL && xsize > 100 && ysize > 
100) - { - clBlur(xsize, ysize, channel, sigma, border_ratio); - return; - } -#endif // ENABLE_OPENCL -#ifdef ENABLE_OPENCL_CHECK - std::vector tmpChannel(xsize * ysize); - memcpy(tmpChannel.data(), channel, xsize * ysize * sizeof(float)); -#endif -*/ PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); @@ -171,25 +132,6 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, } } } - - /* -#ifdef ENABLE_OPENCL_CHECK - // for verify - { - if (xsize < 100 || ysize < 100) return; - - clBlur(xsize, ysize, tmpChannel.data(), sigma, border_ratio); - - for (int i = 0; i < xsize * ysize; i++) - { - if (fabs(channel[i] - tmpChannel[i]) > 0.0001) - { - float k = channel[i] - tmpChannel[i]; - } - } - } -#endif // ENABLE_OPENCL_CHECK -*/ } // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. @@ -1452,31 +1394,9 @@ double MaskDcB(double delta) { return InterpolateClampNegative(lut.data(), lut.size(), delta); } -// Replaces values[x + y * xsize] with the minimum of the values in the -// square_size square with coordinates -// x - offset .. x + square_size - offset - 1, -// y - offset .. y + square_size - offset - 1. - -// ʵ¼Ê¹ý³ÌÖÐsqure_sizeһֱΪ4£¬offsetΪ0£¬¿ÉÒÔSIMDÌØ»¯ - void MinSquareVal(size_t square_size, size_t offset, - size_t xsize, size_t ysize, + size_t xsize, size_t ysize, float *values) { -/* -#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK) - if (g_useOpenCL) - { - clMinSquareVal(square_size, offset, xsize, ysize, values); - return; - } -#endif // ENABLE_OPENCL - - PROFILER_FUNC; - -#ifdef ENABLE_OPENCL_CHECK - std::vector backup(values, values + xsize * ysize); -#endif -*/ // offset is not negative and smaller than square_size. 
assert(offset < square_size); std::vector tmp(xsize * ysize); @@ -1517,18 +1437,6 @@ void MinSquareVal(size_t square_size, size_t offset, *pValuePoint = min; pValuePoint += xsize; } } -/* -#ifdef ENABLE_OPENCL_CHECK - clMinSquareVal(square_size, offset, xsize, ysize, backup.data()); - for (int i = 0; i < xsize * ysize; i++) - { - if (fabs(backup[i] - values[i]) > 0.0001) - { - assert(false); - } - } -#endif -*/ } // ===== Functions used by Mask only ===== From fd520d3236334b73b0c0d17f8680db47beb415ab Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 5 May 2017 16:12:01 +0800 Subject: [PATCH 028/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli # Conflicts: # clguetzli/clguetzli.cl # clguetzli/clguetzli.cpp --- clguetzli/clguetzli.cl | 177 +++++++++++++++++- clguetzli/clguetzli.cpp | 127 ++++++++++--- clguetzli/clguetzli.h | 8 +- clguetzli/ocl.h | 3 +- .../butteraugli/butteraugli/butteraugli.cc | 9 + 5 files changed, 295 insertions(+), 29 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 9deee460..3f9820dc 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -302,7 +302,10 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double * *valz = b; } -__kernel void OpsinDynamicsImage(__global float *r, __global float *g, __global float *b, __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, int size) +__kernel void OpsinDynamicsImage( + __global float *r, __global float *g, __global float *b, + __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, + int size) { const int i = get_global_id(0); double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; @@ -423,7 +426,10 @@ double MaskDcB(double delta) { return InterpolateClampNegative(lut, 512, delta); } -__kernel void DoMask(__global float *mask_x, __global float *mask_y, __global float *mask_b, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, 
int xsize, int ysize) +__kernel void DoMask( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, + int xsize, int ysize) { const double w00 = 232.206464018; const double w11 = 22.9455222245; @@ -454,3 +460,170 @@ __kernel void ScaleImage(double scale, __global float *result) const int i = get_global_id(0); result[i] *= (float)(scale); } + +double DotProduct(float u[3], double v[3]) { + return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +} + +__kernel void CombineChannels( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, + __global float *block_diff_dc, + __global float *block_diff_ac, + __global float *edge_detector_map, + int xsize, int ysize, + int step, + int res_xsize, + __global float *result) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x * step >= xsize - (8 - step)) return; + if (res_y * step >= ysize - (8 - step)) return; + + double mask[3]; + double dc_mask[3]; + mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)]; + + mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; + + mask[1] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[1] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; + + size_t res_ix = (res_y * res_xsize + res_x) / step; + result[res_ix] = (float)( + DotProduct((float *)&block_diff_dc[3 * res_ix], dc_mask) + + DotProduct((float *)&block_diff_ac[3 * res_ix], mask) + + DotProduct((float *)&edge_detector_map[3 * res_ix], mask)); +} + +inline double Interpolate(const double *array, int size, double sx) { + double ix = fabs(sx); + + int baseix = static_cast(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; 
+ int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + if (sx < 0) res = -res; + return res; +} + +std::array MakeLowFreqColorDiffDy() { + std::array lut; + static const double inc = 5.2511644570349185; + lut[0] = 0.0; + for (int i = 1; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const double *GetLowFreqColorDiffDy() { + static const std::array kLut = MakeLowFreqColorDiffDy(); + return kLut.data(); +} + +void XybLowFreqToVals(double x, double y, double z, + double *valx, double *valy, double *valz) { + static const double xmul = 6.64482198135; + static const double ymul = 0.837846224276; + static const double zmul = 7.34905756986; + static const double y_to_z_mul = 0.0812519812628; + z += y_to_z_mul * y; + *valz = z * zmul; + *valx = x * xmul; + *valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul); +} + +void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, + double r1, double g1, double b1, + double factor, double res[3]) { + double valx0, valy0, valz0; + double valx1, valy1, valz1; + XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0); + if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { + PROFILER_ZONE("XybDiff r1=g1=b1=0"); + res[0] += factor * valx0 * valx0; + res[1] += factor * valy0 * valy0; + res[2] += factor * valz0 * valz0; + return; + } + XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1); + // Approximate the distance of the colors by their respective distances + // to gray. 
+ double valx = valx0 - valx1; + double valy = valy0 - valy1; + double valz = valz0 - valz1; + res[0] += factor * valx * valx; + res[1] += factor * valy * valy; + res[2] += factor * valz * valz; +} + +__kernel void edgeDetectorMap(__global float *result, __global float *r, __global float *g, __global float* b, __global float *r2, __global float* g2, __global float *b2, int xsize, int ysize, int step) +{ + const int result_x = get_global_id(0); + const int result_y = get_global_id(1); + + const int result_xsize = get_global_size(0); + const int result_ysize = get_global_size(1); + + int pos_x = result_x * step; + int pos_y = result_y * step; + + int local_count = 0; + double local_xyb[3] = { 0 }; + const double w = 0.711100840192; + + int offset[4][2] = { { 0£¬0}£¬ { 0£¬7}£¬{ 7£¬0}£¬{ 7£¬7} }; + int edgeSize = 3; + + for (int k = 0; i < 4; k++) + { + int x = pos_x + offset[k][0]; + int y = pos_y + offset[k][1]; + + if (x >= edgeSize && x + edgeSize < xsize) { + size_t ix = y * xsize + (x - edgeSize); + size_t ix2 = ix + 2 * edgeSize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] - b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + if (y >= edgeSize && y + edgeSize < ysize) { + size_t ix = (y - edgeSize) * xsize + x; + size_t ix2 = ix + 2 * edgeSize * xsize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] - b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + } + + static const double weight = 0.01617112696; + const double mul = weight * 8.0 / local_count; + + int idx = (result_y * result_xsize + result_x) * 3; + result[idx] = local_xyb[0]; + result[idx + 1] = local_xyb[1]; + result[idx + 2] = local_xyb[2]; +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 2ca1ceb5..c96fb8f1 100644 --- 
a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -60,6 +60,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err); ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err); ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err); + ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err); return ocl; } @@ -396,7 +397,6 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred } } -// strong todo void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b) { static const double kSigma = 1.1; @@ -442,17 +442,25 @@ void clMaskHighIntensityChangeEx(ocl_channels rgb/*in,out*/, } // strong todo -void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result/*out*/) +void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/) { + cl_int channel_size = xsize * ysize * sizeof(float); + + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; - clBlurEx(rgb.r, xsize, ysize, kSigma[0], 0.0); - clBlurEx(rgb2.r, xsize, ysize, kSigma[0], 0.0); - clBlurEx(rgb.g, xsize, ysize, kSigma[1], 0.0); - clBlurEx(rgb2.g, xsize, ysize, kSigma[1], 0.0); - clBlurEx(rgb.b, xsize, ysize, kSigma[2], 0.0); - clBlurEx(rgb2.b, xsize, ysize, kSigma[2], 0.0); + for (int i = 0; i < 3; i++) + { + clBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } // EdgeDetectorLowFreq + } // strong todo @@ -468,15 +476,23 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem 
block_diff_ac/*out*/) { + cl_int channel_size = xsize * ysize * sizeof(float); + static const double kSigma = 14; static const double kMul = 10; - clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0); - clBlurEx(rgb2.r, xsize, ysize, kSigma, 0.0); - clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0); - clBlurEx(rgb2.g, xsize, ysize, kSigma, 0.0); - clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0); - clBlurEx(rgb2.b, xsize, ysize, kSigma, 0.0); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + clBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } } // ian todo @@ -628,10 +644,52 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2, } } -// ian todo -void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, size_t step, cl_mem result/*out*/) +void clCombineChannelsEx( + ocl_channels mask, + ocl_channels mask_dc, + cl_mem block_diff_dc, + cl_mem block_diff_ac, + cl_mem edge_detector_map, + size_t xsize, size_t ysize, + size_t step, + size_t res_xsize, + cl_mem result/*out*/) { + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int clxsize = xsize; + cl_int clysize = ysize; + cl_int clstep = step; + cl_int clres_xsize = res_xsize; + cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b); + clSetKernelArg(kernel, 6, sizeof(cl_mem), 
(void*)&block_diff_dc); + clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&block_diff_ac); + clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&edge_detector_map); + clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize); + clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clstep); + clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_xsize); + clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result); + + size_t globalWorkSize[2] = { xsize / step, ysize /step }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clCombineChannelsEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clCombineChannelsEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } } // strong todo @@ -691,10 +749,10 @@ void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, i clScaleImageEx(result, xsize * ysize, scale, result); } -// strong todo -void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, +void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, float* r2, float* g2, float* b2, size_t xsize, size_t ysize, + size_t step, float* result) { @@ -713,24 +771,43 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b, clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize); - cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize); - cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize); + cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize * sizeof(float)); + cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize * sizeof(float)); + cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize * sizeof(float)); 
ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - cl_mem mem_result; + size_t res_xsize_; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ + size_t res_ysize_; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ + cl_mem mem_result = ocl.allocMem(channel_size); clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize); - clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, edge_detector_map); + clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, step, edge_detector_map); clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac); clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac); - int step = 4; clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc); - clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, step, mem_result); + + size_t xsize_ = 0, ysize_ = 0; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ + clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize_, ysize_, step, res_xsize_, mem_result); clCalculateDiffmapEx(mem_result, xsize, ysize, step); + + + cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + memcpy(result, result_r, channel_size); + + ocl.releaseMemChannels(xyb); + ocl.releaseMemChannels(xyb2); + + clReleaseMemObject(edge_detector_map); + clReleaseMemObject(block_diff_dc); + clReleaseMemObject(block_diff_ac); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + + clReleaseMemObject(mem_result); } diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index ca8b0b32..6f29dd35 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -18,4 +18,10 @@ void clConvolution(size_t xsize, size_t ysize, void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); -void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); \ No newline at end of file +void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* 
b); + +void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, + float* r2, float* g2, float* b2, + size_t xsize, size_t ysize, + size_t step, + float* result); \ No newline at end of file diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 53fac1c8..d0370bb3 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -44,7 +44,7 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); */ enum KernelName { - KERNEL_MINSQUAREVAL, + KERNEL_MINSQUAREVAL = 0, KERNEL_CONVOLUTION, KERNEL_CONVOLUTIONX, KERNEL_CONVOLUTIONY, @@ -52,6 +52,7 @@ enum KernelName { KERNEL_OPSINDYNAMICSIMAGE, KERNEL_DOMASK, KERNEL_SCALEIMAGE, + KERNEL_COMBINECHANNELS, KERNEL_COUNT, }; diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 0753d713..2fd045d8 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1120,6 +1120,15 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( const std::vector> &xyb0_arg, std::vector> &xyb1, std::vector &result) { +/* + if (g_useOpenCL && xsize_ > 100 && ysize_ > 100) + { + result.resize(xsize_ * ysize_); + clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, result.data()); + } +*/ + if (xsize_ < 8 || ysize_ < 8) return; auto xyb0 = xyb0_arg; { From 56ac179849f055000c43b78dc7d98283f948d0dd Mon Sep 17 00:00:00 2001 From: ianuming Date: Fri, 5 May 2017 20:01:05 +0800 Subject: [PATCH 029/189] Implement clMaskHighIntensityChangeEx --- clguetzli/clguetzli.cl | 71 ++++++++++++++++- clguetzli/clguetzli.cpp | 173 ++++++++++++++++++++++++++++++---------- clguetzli/ocl.h | 1 + 3 files changed, 200 insertions(+), 45 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 3f9820dc..91f05490 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -503,7 +503,7 @@ 
__kernel void CombineChannels( inline double Interpolate(const double *array, int size, double sx) { double ix = fabs(sx); - int baseix = static_cast(ix); + int baseix = (int)(ix); double res; if (baseix >= size - 1) { res = array[size - 1]; @@ -517,6 +517,7 @@ inline double Interpolate(const double *array, int size, double sx) { return res; } +/* std::array MakeLowFreqColorDiffDy() { std::array lut; static const double inc = 5.2511644570349185; @@ -531,6 +532,7 @@ const double *GetLowFreqColorDiffDy() { static const std::array kLut = MakeLowFreqColorDiffDy(); return kLut.data(); } +*/ void XybLowFreqToVals(double x, double y, double z, double *valx, double *valy, double *valz) { @@ -541,7 +543,7 @@ void XybLowFreqToVals(double x, double y, double z, z += y_to_z_mul * y; *valz = z * zmul; *valx = x * xmul; - *valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul); + //*valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul); } void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, @@ -551,7 +553,7 @@ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, double valx1, valy1, valz1; XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0); if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { - PROFILER_ZONE("XybDiff r1=g1=b1=0"); + //PROFILER_ZONE("XybDiff r1=g1=b1=0"); res[0] += factor * valx0 * valx0; res[1] += factor * valy0 * valy0; res[2] += factor * valz0 * valz0; @@ -583,9 +585,10 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa double local_xyb[3] = { 0 }; const double w = 0.711100840192; - int offset[4][2] = { { 0£¬0}£¬ { 0£¬7}£¬{ 7£¬0}£¬{ 7£¬7} }; + //int offset[4][2] = { { 0£¬0}£¬ { 0£¬7}£¬{ 7£¬0}£¬{ 7£¬7} }; int edgeSize = 3; + /* for (int k = 0; i < 4; k++) { int x = pos_x + offset[k][0]; @@ -618,6 +621,7 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa ++local_count; } } + */ static const double weight = 0.01617112696; const double mul = weight * 8.0 / 
local_count; @@ -627,3 +631,62 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa result[idx + 1] = local_xyb[1]; result[idx + 2] = local_xyb[2]; } + +__kernel void MaskHighIntensityChange( + __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global float *c0_x, __global float *c0_y, __global float *c0_b, + __global float *c1_x, __global float *c1_y, __global float *c1_b + ) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5, + (c0_y[ix] + c1_y[ix]) * 0.5, + (c0_b[ix] + c1_b[ix]) * 0.5, + }; + double sqr_max_diff = -1; + { + int offset[4] = + { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = + { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. 
+ xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index c96fb8f1..55d7953a 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -61,6 +61,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err); ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err); ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err); + ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err); return ocl; } @@ -433,12 +434,37 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* ocl.releaseMemChannels(rgb_blurred); } -// ian todo -void clMaskHighIntensityChangeEx(ocl_channels rgb/*in,out*/, - ocl_channels rgb2/*in,out*/, +void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/, + ocl_channels xyb1/*in,out*/, + ocl_channels c0, + ocl_channels c1, size_t xsize, size_t ysize) { - // MaskHighIntensityChange + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0_arg.r); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0_arg.g); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0_arg.b); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&c0.r); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&c0.g); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&c0.b); + clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c1.r); + 
clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c1.g); + clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c1.b); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } } // strong todo @@ -486,7 +512,7 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); - static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + //static const double kSigma[3] = { 1.5, 0.586, 0.4 }; for (int i = 0; i < 3; i++) { @@ -501,7 +527,7 @@ void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_ } -void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/) +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -509,16 +535,9 @@ void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/) cl_int clsize = size; cl_float clscale = w; - - err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, clsize, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clScaleImageEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); - } - cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&clscale); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img); size_t globalWorkSize[1] = { clsize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -536,16 +555,75 @@ 
void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/) // ian todo void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) { - static const float w = 0.679144890667f; - static const float scale = 1.0f / (5.0f + 4 * w); + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. + return; + } - cl_mem tmp0; - cl_mem tmp1; - clScaleImageEx(img, xsize * ysize, w, tmp0); - clScaleImageEx(img, xsize * ysize, 1, tmp1); - // average5x5 calc + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); - clScaleImageEx(img, xsize * ysize, scale, img); + size_t len = xsize * ysize * sizeof(float); + ocl.allocA(len); + ocl.allocB(len); + ocl.allocC(len); + cl_mem result = ocl.srcA; + cl_mem tmp0 = ocl.srcB; + cl_mem tmp1 = ocl.dstMem; + + err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, len, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp0, 0, 0, len, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp1, 0, 0, len, 0, NULL, NULL); + + static const float w = 0.679144890667f; + static const float scale = 1.0f / (5.0f + 4 * w); + + clScaleImageEx(tmp1, xsize * ysize, w); + /* TODO + for (int y = 0; y < ysize; y++) { + const int row0 = y * xsize; + result[row0 + 1] += tmp0[row0]; + result[row0 + 0] += tmp0[row0 + 1]; + result[row0 + 2] += tmp0[row0 + 1]; + for (int x = 2; x < xsize - 2; ++x) { + result[row0 + x - 1] += tmp0[row0 + x]; + result[row0 + x + 1] += tmp0[row0 + x]; + } + result[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; + if (y > 0) { + const int rowd1 = row0 - xsize; + result[rowd1 + 1] += tmp1[row0]; + result[rowd1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowd1 + x + 1] += tmp1[row0 + x]; + result[rowd1 + x + 0] += tmp0[row0 + x]; + result[rowd1 + x - 1] += tmp1[row0 + x]; + } + result[rowd1 + xsize - 1] += tmp0[row0 
+ xsize - 1]; + result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + result[rowu1 + 1] += tmp1[row0]; + result[rowu1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowu1 + x + 1] += tmp1[row0 + x]; + result[rowu1 + x + 0] += tmp0[row0 + x]; + result[rowu1 + x - 1] += tmp1[row0 + x]; + } + result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; + result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + } + */ + err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, len, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clAverage5x5Ex() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); + } + clScaleImageEx(img, xsize * ysize, scale); } void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset) @@ -639,8 +717,8 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2, for (int i = 0; i < 3; i++) { - clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask.ch[i]); - clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask_dc.ch[i]); + clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); } } @@ -746,7 +824,7 @@ void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, i } } */ - clScaleImageEx(result, xsize * ysize, scale, result); + clScaleImageEx(result, xsize * ysize, scale); } void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, @@ -760,15 +838,26 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb = ocl.allocMemChannels(channel_size); - ocl_channels xyb2 = ocl.allocMemChannels(channel_size); - - clEnqueueWriteBuffer(ocl.commandQueue, xyb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, 
xyb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + ocl_channels xyb0_arg = ocl.allocMemChannels(channel_size); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size); + + ocl_channels xyb0 = ocl.allocMemChannels(channel_size); + ocl_channels xyb1_c = ocl.allocMemChannels(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + + + err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.r, xyb0.r, 0, 0, channel_size, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.g, xyb0.g, 0, 0, channel_size, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.b, xyb0.b, 0, 0, channel_size, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, xyb1_c.r, 0, 0, channel_size, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, xyb1_c.g, 0, 0, channel_size, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, xyb1_c.b, 0, 0, channel_size, 0, NULL, NULL); err = clFinish(ocl.commandQueue); cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize * sizeof(float)); @@ -782,13 
+871,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t res_ysize_; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ cl_mem mem_result = ocl.allocMem(channel_size); - clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize); + clMaskHighIntensityChangeEx(xyb0_arg, xyb1_c, xyb0, xyb1, xsize, ysize); - clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, step, edge_detector_map); - clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac); - clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac); + //clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, edge_detector_map); + clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, block_diff_dc, block_diff_ac); + clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, block_diff_ac); - clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc); + clMaskEx(xyb0_arg, xyb1, xsize, ysize, mask, mask_dc); size_t xsize_ = 0, ysize_ = 0; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize_, ysize_, step, res_xsize_, mem_result); @@ -799,8 +888,10 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); memcpy(result, result_r, channel_size); - ocl.releaseMemChannels(xyb); - ocl.releaseMemChannels(xyb2); + ocl.releaseMemChannels(xyb0_arg); + ocl.releaseMemChannels(xyb1); + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1_c); clReleaseMemObject(edge_detector_map); clReleaseMemObject(block_diff_dc); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index d0370bb3..64f94de2 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -53,6 +53,7 @@ enum KernelName { KERNEL_DOMASK, KERNEL_SCALEIMAGE, KERNEL_COMBINECHANNELS, + KERNEL_MASKHIGHINTENSITYCHANGE, KERNEL_COUNT, }; From a024ec13719b0f6d52738e167115e15d4731b380 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Fri, 5 May 2017 
22:46:54 +0800 Subject: [PATCH 030/189] Implement clDiffPrecomputeEx --- clguetzli/clguetzli.cl | 89 +++++++++++++++++++++++++++++++++++++++++ clguetzli/clguetzli.cpp | 45 ++++++++++++++++----- clguetzli/ocl.h | 1 + 3 files changed, 126 insertions(+), 9 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 91f05490..846ef013 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -690,3 +690,92 @@ __kernel void MaskHighIntensityChange( xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); } + +void XybToVals( + double x, double y, double z, + double *valx, double *valy, double *valz) +{ + static const double xmul = 0.758304045695; + static const double ymul = 2.28148649801; + static const double zmul = 1.87816926918; + + double lut[21] = { 0.0 }; + const double off = 11.38708334481672; + const double inc = 14.550189611520716; + lut[0] = 0.0; + lut[1] = off; + for (int i = 2; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + + *valx = Interpolate(lut, 21, x * xmul); + *valy = Interpolate(lut, 21, y * ymul); + *valz = zmul * z; +} + + +__kernel void DiffPrecompute( + __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global float *mask_x, __global float *mask_y, __global float *mask_b ) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + double valsh0[3] = { 0.0 }; + double valsv0[3] = { 0.0 }; + double valsh1[3] = { 0.0 }; + double valsv1[3] = { 0.0 }; + int ix2; + + size_t ix = x + xsize * y; + if (x + 1 < xsize) { + ix2 = ix + 1; + } + else { + ix2 = ix - 1; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); + 
double x1 = (xyb1_x[ix] - xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); + } + if (y + 1 < ysize) { + ix2 = ix + xsize; + } + else { + ix2 = ix - xsize; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); + double x1 = (xyb1_x[ix] - xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); + } + + double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]); + double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]); + double m = min(sup0, sup1); + mask_x[ix] = (float)(m); + + sup0 = fabs(valsh0[1]) + fabs(valsv0[1]); + sup1 = fabs(valsh1[1]) + fabs(valsv1[1]); + m = min(sup0, sup1); + mask_y[ix] = (float)(m); + + sup0 = fabs(valsh0[2]) + fabs(valsv0[2]); + sup1 = fabs(valsh1[2]) + fabs(valsv1[2]); + m = min(sup0, sup1); + mask_b[ix] = (float)(m); +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 55d7953a..e5d90d21 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void) char* source = nullptr; size_t src_size = 0; - ReadSourceFromFile("clguetzli.cl", &source, &src_size); + ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size); ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); @@ -62,6 +62,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err); ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err); ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err); + ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", 
&err); return ocl; } @@ -447,12 +448,15 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/, clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0_arg.r); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0_arg.g); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0_arg.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&c0.r); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&c0.g); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&c0.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c1.r); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c1.g); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c1.b); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b); + clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c0.r); + clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c0.g); + clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c0.b); + clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&c1.r); + clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&c1.g); + clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&c1.b); size_t globalWorkSize[2] = { xsize, ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -521,10 +525,33 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, } } -// ian todo -void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, ocl_channels mask/*out*/) +void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/) { + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b); + 
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b); + clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.r); + clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.g); + clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mask.b); + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } } void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w) diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 64f94de2..3e20a2f3 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -54,6 +54,7 @@ enum KernelName { KERNEL_SCALEIMAGE, KERNEL_COMBINECHANNELS, KERNEL_MASKHIGHINTENSITYCHANGE, + KERNEL_DIFFPRECOMPUTE, KERNEL_COUNT, }; From 13637b276899efddd257659495efb74ea389d8e6 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Fri, 5 May 2017 22:47:21 +0800 Subject: [PATCH 031/189] Implement clDiffPrecomputeEx --- clguetzli/clguetzli.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index e5d90d21..4ec86456 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -545,12 +545,12 @@ void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { - LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clDiffPrecomputeEx() clEnqueueNDRangeKernel returned 
%s.\n", TranslateOpenCLError(err)); } err = clFinish(ocl.commandQueue); if (CL_SUCCESS != err) { - LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clDiffPrecomputeEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } } From b7b19edc7cbbc7d072bb1eda642269d113be43a9 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 6 May 2017 15:54:12 +0800 Subject: [PATCH 032/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli # Conflicts: # clguetzli/clguetzli.cl # clguetzli/clguetzli.cpp # clguetzli/ocl.h --- clguetzli/clguetzli.cl | 787 ++++++++++++++++++++++++++++++++++++---- clguetzli/clguetzli.cpp | 378 +++++++++++++------ clguetzli/ocl.h | 7 + 3 files changed, 986 insertions(+), 186 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 846ef013..86ffb63a 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -33,16 +33,6 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si pC[y * width + x] = minValue; } -float calcWeight(__global float* multipliers, int len) -{ - float weight_no_border = 0; - for (int j = 0; j < len; j++) - { - weight_no_border += multipliers[j]; - } - return weight_no_border; -} - __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result, int xsize, int xstep, int len, int offset, float border_ratio) { @@ -53,14 +43,12 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl const int ysize = get_global_size(1); const int x = ox * xstep; -/* + float weight_no_border = 0; for (int j = 0; j <= 2 * offset; j++) { weight_no_border += multipliers[j]; } -*/ - float weight_no_border = calcWeight(multipliers, len); int minx = x < offset ? 
0 : x - offset; int maxx = min(xsize, x + len - offset); @@ -473,12 +461,14 @@ __kernel void CombineChannels( __global float *edge_detector_map, int xsize, int ysize, int step, - int res_xsize, __global float *result) { const int res_x = get_global_id(0); const int res_y = get_global_id(1); + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + if (res_x * step >= xsize - (8 - step)) return; if (res_y * step >= ysize - (8 - step)) return; @@ -500,7 +490,7 @@ __kernel void CombineChannels( DotProduct((float *)&edge_detector_map[3 * res_ix], mask)); } -inline double Interpolate(const double *array, int size, double sx) { +inline double Interpolate(__constant double *array, int size, double sx) { double ix = fabs(sx); int baseix = (int)(ix); @@ -517,33 +507,41 @@ inline double Interpolate(const double *array, int size, double sx) { return res; } -/* -std::array MakeLowFreqColorDiffDy() { - std::array lut; - static const double inc = 5.2511644570349185; - lut[0] = 0.0; - for (int i = 1; i < 21; ++i) { - lut[i] = lut[i - 1] + inc; - } - return lut; -} - -const double *GetLowFreqColorDiffDy() { - static const std::array kLut = MakeLowFreqColorDiffDy(); - return kLut.data(); -} -*/ +__constant double XybLowFreqToVals_inc = 5.2511644570349185; +__constant double XybLowFreqToVals_lut[21] = { + 0, + 1 * XybLowFreqToVals_inc, + 2 * XybLowFreqToVals_inc, + 3 * XybLowFreqToVals_inc, + 4 * XybLowFreqToVals_inc, + 5 * XybLowFreqToVals_inc, + 6 * XybLowFreqToVals_inc, + 7 * XybLowFreqToVals_inc, + 8 * XybLowFreqToVals_inc, + 9 * XybLowFreqToVals_inc, + 10 * XybLowFreqToVals_inc, + 11 * XybLowFreqToVals_inc, + 12 * XybLowFreqToVals_inc, + 13 * XybLowFreqToVals_inc, + 14 * XybLowFreqToVals_inc, + 15 * XybLowFreqToVals_inc, + 16 * XybLowFreqToVals_inc, + 17 * XybLowFreqToVals_inc, + 18 * XybLowFreqToVals_inc, + 19 * XybLowFreqToVals_inc, + 20 * XybLowFreqToVals_inc, +}; void XybLowFreqToVals(double x, double y, double z, double *valx, double *valy, 
double *valz) { - static const double xmul = 6.64482198135; - static const double ymul = 0.837846224276; - static const double zmul = 7.34905756986; - static const double y_to_z_mul = 0.0812519812628; + const double xmul = 6.64482198135; + const double ymul = 0.837846224276; + const double zmul = 7.34905756986; + const double y_to_z_mul = 0.0812519812628; z += y_to_z_mul * y; *valz = z * zmul; *valx = x * xmul; - //*valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul); + *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul); } void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, @@ -570,26 +568,31 @@ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, res[2] += factor * valz * valz; } -__kernel void edgeDetectorMap(__global float *result, __global float *r, __global float *g, __global float* b, __global float *r2, __global float* g2, __global float *b2, int xsize, int ysize, int step) +__kernel void edgeDetectorMap(__global float *result, + __global float *r, __global float *g, __global float* b, + __global float *r2, __global float* g2, __global float *b2, + int xsize, int ysize, int step) { - const int result_x = get_global_id(0); - const int result_y = get_global_id(1); + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); - const int result_xsize = get_global_size(0); - const int result_ysize = get_global_size(1); + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); - int pos_x = result_x * step; - int pos_y = result_y * step; + int pos_x = res_x * step; + int pos_y = res_y * step; + + if (res_x * step >= xsize - (8 - step)) return; + if (res_y * step >= ysize - (8 - step)) return; int local_count = 0; double local_xyb[3] = { 0 }; const double w = 0.711100840192; - //int offset[4][2] = { { 0£¬0}£¬ { 0£¬7}£¬{ 7£¬0}£¬{ 7£¬7} }; + int offset[4][2] = {{0,0}, {0,7}, {7,0}, {7,7}}; int edgeSize = 3; - /* - for (int k = 0; i < 4; k++) + for (int k = 0; k < 4; k++) { 
int x = pos_x + offset[k][0]; int y = pos_y + offset[k][1]; @@ -621,17 +624,547 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa ++local_count; } } - */ - static const double weight = 0.01617112696; + const double weight = 0.01617112696; const double mul = weight * 8.0 / local_count; - int idx = (result_y * result_xsize + result_x) * 3; + int idx = (res_y * res_xsize + res_x) * 3; result[idx] = local_xyb[0]; result[idx + 1] = local_xyb[1]; result[idx + 2] = local_xyb[2]; } +__kernel void edgeDetectorLowFreq(__global float *result, + __global float *r, __global float *g, __global float* b, + __global float *r2, __global float* g2, __global float *b2, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x < 8 / step) return; + + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + int pos_x = (res_x - (8 / step)) * step; + int pos_y = res_y * step; + + if (pos_x + 8 >= xsize) return; + if (pos_y + 8 >= ysize) return; + + int ix = pos_y * xsize + pos_x; + + double diff[4][3]; + __global float* blurred0[3] = { r, g, b }; + __global float* blurred1[3] = { r2, g2, b2 }; + + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize + 6; + diff[2][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize - 6; + diff[3][i] = pos_x < 8 ? 
0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + double max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + double diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]); + } + } + + int res_ix = res_y * res_xsize + res_x; + + const double kMul = 10; + + result[res_ix * 3] = max_diff_xyb[0] * kMul; + result[res_ix * 3 + 1] = max_diff_xyb[1] * kMul; + result[res_ix * 3 + 2] = max_diff_xyb[2] * kMul; +} + +#define kBlockEdge 8 +#define kBlockSize (kBlockEdge * kBlockEdge) +#define kBlockEdgeHalf (kBlockEdge / 2) +#define kBlockHalf (kBlockEdge * kBlockEdgeHalf) + +__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, +}; + +typedef struct __Complex +{ + double real; + double imag; +}Complex; + +constant double kSqrtHalf = 0.70710678118654752440084436210484903; + +void RealFFT8(const double* in, Complex* out) { + double t1, t2, t3, t5, t6, t7, t8; + t8 = in[6]; + t5 = in[2] - t8; + t8 += in[2]; + out[2].real = t8; + out[6].imag = -t5; + out[4].imag = t5; + t8 = in[4]; + t3 = in[0] - t8; + t8 += in[0]; + out[0].real = t8; + out[4].real = t3; + out[6].real = t3; + t7 = in[5]; + t3 = in[1] - t7; + t7 += in[1]; + out[1].real = t7; + t8 = in[7]; + t5 = in[3] - t8; + t8 += in[3]; + out[3].real = t8; + t2 = -t5; + t6 = 
t3 - t5; + t8 = kSqrtHalf; + t6 *= t8; + out[5].real = out[4].real - t6; + t1 = t3 + t5; + t1 *= t8; + out[5].imag = out[4].imag - t1; + t6 += out[4].real; + out[4].real = t6; + t1 += out[4].imag; + out[4].imag = t1; + t5 = t2 - t3; + t5 *= t8; + out[7].imag = out[6].imag - t5; + t2 += t3; + t2 *= t8; + out[7].real = out[6].real - t2; + t2 += out[6].real; + out[6].real = t2; + t5 += out[6].imag; + out[6].imag = t5; + t5 = out[2].real; + t1 = out[0].real - t5; + t7 = out[3].real; + t5 += out[0].real; + t3 = out[1].real - t7; + t7 += out[1].real; + t8 = t5 + t7; + out[0].real = t8; + t5 -= t7; + out[1].real = t5; + out[2].imag = t3; + out[3].imag = -t3; + out[3].real = t1; + out[2].real = t1; + out[0].imag = 0; + out[1].imag = 0; + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. + Complex tmp = out[2]; + out[2] = out[3]; + out[3] = out[5]; + out[5] = out[7]; + out[7] = out[4]; + out[4] = out[1]; + out[1] = out[6]; + out[6] = tmp; +} + +void TransposeBlock(Complex data[kBlockSize]) { + for (int i = 0; i < kBlockEdge; i++) { + for (int j = 0; j < i; j++) { + Complex tmp = data[kBlockEdge * i + j]; + data[kBlockEdge * i + j] = data[kBlockEdge * j + i]; + data[kBlockEdge * j + i] = tmp; + } + } +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements. +inline void FFT4(Complex* a) { + double t1, t2, t3, t4, t5, t6, t7, t8; + t5 = a[2].real; + t1 = a[0].real - t5; + t7 = a[3].real; + t5 += a[0].real; + t3 = a[1].real - t7; + t7 += a[1].real; + t8 = t5 + t7; + a[0].real = t8; + t5 -= t7; + a[1].real = t5; + t6 = a[2].imag; + t2 = a[0].imag - t6; + t6 += a[0].imag; + t5 = a[3].imag; + a[2].imag = t2 + t3; + t2 -= t3; + a[3].imag = t2; + t4 = a[1].imag - t5; + a[3].real = t1 + t4; + t1 -= t4; + a[2].real = t1; + t5 += a[1].imag; + a[0].imag = t6 + t5; + t6 -= t5; + a[1].imag = t6; +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. 
+void FFT8(Complex* a) { + double t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = a[4].imag; + t4 = a[0].imag - t7; + t7 += a[0].imag; + a[0].imag = t7; + + t8 = a[6].real; + t5 = a[2].real - t8; + t8 += a[2].real; + a[2].real = t8; + + t7 = a[6].imag; + a[6].imag = t4 - t5; + t4 += t5; + a[4].imag = t4; + + t6 = a[2].imag - t7; + t7 += a[2].imag; + a[2].imag = t7; + + t8 = a[4].real; + t3 = a[0].real - t8; + t8 += a[0].real; + a[0].real = t8; + + a[4].real = t3 - t6; + t3 += t6; + a[6].real = t3; + + t7 = a[5].real; + t3 = a[1].real - t7; + t7 += a[1].real; + a[1].real = t7; + + t8 = a[7].imag; + t6 = a[3].imag - t8; + t8 += a[3].imag; + a[3].imag = t8; + t1 = t3 - t6; + t3 += t6; + + t7 = a[5].imag; + t4 = a[1].imag - t7; + t7 += a[1].imag; + a[1].imag = t7; + + t8 = a[7].real; + t5 = a[3].real - t8; + t8 += a[3].real; + a[3].real = t8; + + t2 = t4 - t5; + t4 += t5; + + t6 = t1 - t4; + t8 = kSqrtHalf; + t6 *= t8; + a[5].real = a[4].real - t6; + t1 += t4; + t1 *= t8; + a[5].imag = a[4].imag - t1; + t6 += a[4].real; + a[4].real = t6; + t1 += a[4].imag; + a[4].imag = t1; + + t5 = t2 - t3; + t5 *= t8; + a[7].imag = a[6].imag - t5; + t2 += t3; + t2 *= t8; + a[7].real = a[6].real - t2; + t2 += a[6].real; + a[6].real = t2; + t5 += a[6].imag; + a[6].imag = t5; + + FFT4(a); + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. 
+ Complex tmp = a[2]; + a[2] = a[3]; + a[3] = a[5]; + a[5] = a[7]; + a[7] = a[4]; + a[4] = a[1]; + a[1] = a[6]; + a[6] = tmp; +} + +double abssq(const Complex c) { + return c.real * c.real + c.imag * c.imag; +} + +void ButteraugliFFTSquared(double block[kBlockSize]) { + double global_mul = 0.000064; + Complex block_c[kBlockSize]; + + for (int y = 0; y < kBlockEdge; ++y) { + RealFFT8(block + y * kBlockEdge, block_c + y * kBlockEdge); + } + TransposeBlock(block_c); + double r0[kBlockEdge]; + double r1[kBlockEdge]; + for (int x = 0; x < kBlockEdge; ++x) { + r0[x] = block_c[x].real; + r1[x] = block_c[kBlockHalf + x].real; + } + RealFFT8(r0, block_c); + RealFFT8(r1, block_c + kBlockHalf); + for (int y = 1; y < kBlockEdgeHalf; ++y) { + FFT8(block_c + y * kBlockEdge); + } + for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + block[i] = abssq(block_c[i]); + block[i] *= global_mul; + } +} + +__constant double MakeHighFreqColorDiffDy_off = 1.4103373714040413; +__constant double MakeHighFreqColorDiffDy_inc = 0.7084088867024; +__constant double MakeHighFreqColorDiffDy_lut[21] ={ + 0.0, + MakeHighFreqColorDiffDy_off, + MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc, + 
MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, +}; + +double RemoveRangeAroundZero(double v, double range) { + if (v >= -range && v < range) { + return 0; + } + if (v < 0) { + return v + range; + } + else { + return v - range; + } +} + +// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared +// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average +// diff on the edges to diff_xyb_edge_dc. +void ButteraugliBlockDiff(double xyb0[3 * kBlockSize], + double xyb1[3 * kBlockSize], + double diff_xyb_dc[3], + double diff_xyb_ac[3], + double diff_xyb_edge_dc[3]) { + + double avgdiff_xyb[3] = { 0.0 }; + double avgdiff_edge[3][4] = { { 0.0 } }; + for (int i = 0; i < 3 * kBlockSize; ++i) { + const double diff_xyb = xyb0[i] - xyb1[i]; + const int c = i / kBlockSize; + avgdiff_xyb[c] += diff_xyb / kBlockSize; + const int k = i % kBlockSize; + const int kx = k % kBlockEdge; + const int ky = k / kBlockEdge; + const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1; + const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 
2 : -1; + if (h_edge_idx >= 0) { + avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge; + } + if (v_edge_idx >= 0) { + avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge; + } + } + XybDiffLowFreqSquaredAccumulate(avgdiff_xyb[0], + avgdiff_xyb[1], + avgdiff_xyb[2], + 0, 0, 0, csf8x8[0], + diff_xyb_dc); + for (int i = 0; i < 4; ++i) { + XybDiffLowFreqSquaredAccumulate(avgdiff_edge[0][i], + avgdiff_edge[1][i], + avgdiff_edge[2][i], + 0, 0, 0, csf8x8[0], + diff_xyb_edge_dc); + } + + double* xyb_avg = xyb0; + double* xyb_halfdiff = xyb1; + for (int i = 0; i < 3 * kBlockSize; ++i) { + double avg = (xyb0[i] + xyb1[i]) / 2; + double halfdiff = (xyb0[i] - xyb1[i]) / 2; + xyb_avg[i] = avg; + xyb_halfdiff[i] = halfdiff; + } + double *y_avg = &xyb_avg[kBlockSize]; + double *x_halfdiff_squared = &xyb_halfdiff[0]; + double *y_halfdiff = &xyb_halfdiff[kBlockSize]; + double *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize]; + ButteraugliFFTSquared(y_avg); + ButteraugliFFTSquared(x_halfdiff_squared); + ButteraugliFFTSquared(y_halfdiff); + ButteraugliFFTSquared(z_halfdiff_squared); + + const double xmul = 64.8; + const double ymul = 1.753123908348329; + const double ymul2 = 1.51983458269; + const double zmul = 2.4; + + for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + double d = csf8x8[i]; + diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i]; + diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i]; + + y_avg[i] = sqrt(y_avg[i]); + y_halfdiff[i] = sqrt(y_halfdiff[i]); + double y0 = y_avg[i] - y_halfdiff[i]; + double y1 = y_avg[i] + y_halfdiff[i]; + // Remove the impact of small absolute values. + // This improves the behavior with flat noise. 
+ const double ylimit = 0.04; + y0 = RemoveRangeAroundZero(y0, ylimit); + y1 = RemoveRangeAroundZero(y1, ylimit); + if (y0 != y1) { + double valy0 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y0 * ymul2); + double valy1 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y1 * ymul2); + double valy = ymul * (valy0 - valy1); + diff_xyb_ac[1] += d * valy * valy; + } + } +} + +__kernel void blockDiffMap(__global float* r, __global float* g, __global float* b, + __global float* r2, __global float* g2, __global float* b2, + __global float* block_diff_dc, __global float* block_diff_ac, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + int pos_x = res_x * step; + int pos_y = res_y * step; + + if ((pos_x + kBlockEdge - step - 1) >= ysize) return; + if ((pos_y + kBlockEdge - step - 1) >= xsize) return; + + size_t res_ix = res_y * res_xsize + res_x; + size_t offset = min(res_y * step, ysize - 8) * xsize + min(res_x * step, xsize - 8); + + double block0[3 * kBlockEdge * kBlockEdge]; + double block1[3 * kBlockEdge * kBlockEdge]; + + double *block0_r = &block0[0]; + double *block0_g = &block0[kBlockEdge * kBlockEdge]; + double *block0_b = &block0[2 * kBlockEdge * kBlockEdge]; + + double *block1_r = &block1[0]; + double *block1_g = &block1[kBlockEdge * kBlockEdge]; + double *block1_b = &block1[2 * kBlockEdge * kBlockEdge]; + + for (int y = 0; y < kBlockEdge; y++) + { + for (int x = 0; x < kBlockEdge; x++) + { + block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x]; + block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x]; + block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x]; + block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x]; + block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x]; + block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x]; + } + } + + double diff_xyb_dc[3] = { 0.0 }; + double 
diff_xyb_ac[3] = { 0.0 }; + double diff_xyb_edge_dc[3] = { 0.0 }; + + ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + + for (int i = 0; i < 3; i++) + { + block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i]; + block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i]; + } +} __kernel void MaskHighIntensityChange( __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, @@ -691,29 +1224,46 @@ __kernel void MaskHighIntensityChange( xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); } + +__constant double XybToVals_off = 11.38708334481672; +__constant double XybToVals_inc = 14.550189611520716; +__constant double XybToVals_lut[21] = { + 0, + XybToVals_off, + XybToVals_off + 1 * XybToVals_inc, + XybToVals_off + 2 * XybToVals_inc, + XybToVals_off + 3 * XybToVals_inc, + XybToVals_off + 4 * XybToVals_inc, + XybToVals_off + 5 * XybToVals_inc, + XybToVals_off + 6 * XybToVals_inc, + XybToVals_off + 7 * XybToVals_inc, + XybToVals_off + 8 * XybToVals_inc, + XybToVals_off + 9 * XybToVals_inc, + XybToVals_off + 10 * XybToVals_inc, + XybToVals_off + 11 * XybToVals_inc, + XybToVals_off + 12 * XybToVals_inc, + XybToVals_off + 13 * XybToVals_inc, + XybToVals_off + 14 * XybToVals_inc, + XybToVals_off + 15 * XybToVals_inc, + XybToVals_off + 16 * XybToVals_inc, + XybToVals_off + 17 * XybToVals_inc, + XybToVals_off + 18 * XybToVals_inc, + XybToVals_off + 19 * XybToVals_inc, +}; + void XybToVals( double x, double y, double z, double *valx, double *valy, double *valz) { - static const double xmul = 0.758304045695; - static const double ymul = 2.28148649801; - static const double zmul = 1.87816926918; + const double xmul = 0.758304045695; + const double ymul = 2.28148649801; + const double zmul = 1.87816926918; - double lut[21] = { 0.0 }; - const double off = 11.38708334481672; - const double inc = 14.550189611520716; - lut[0] = 0.0; - lut[1] = off; - for (int i = 2; i < 
21; ++i) { - lut[i] = lut[i - 1] + inc; - } - - *valx = Interpolate(lut, 21, x * xmul); - *valy = Interpolate(lut, 21, y * ymul); + *valx = Interpolate(&XybToVals_lut[0], 21, x * xmul); + *valy = Interpolate(&XybToVals_lut[0], 21, y * ymul); *valz = zmul * z; } - __kernel void DiffPrecompute( __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, @@ -778,4 +1328,111 @@ __kernel void DiffPrecompute( sup1 = fabs(valsh1[2]) + fabs(valsv1[2]); m = min(sup0, sup1); mask_b[ix] = (float)(m); -} \ No newline at end of file +} + +void UpsampleSquareRoot(float *diffmap, size_t xsize, size_t ysize, int step, float *diffmap_out) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_y + 8 - step >= ysize) return; + if (res_x + 8 - step >= xsize) return; + + int s2 = (8 - step) / 2; + // Upsample and take square root. + const size_t res_xsize = (xsize + step - 1) / step; + size_t res_ix = (res_y * res_xsize + res_x) / step; + float orig_val = diffmap[res_ix]; + const float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? 
kInitialSlope * orig_val + : sqrt(orig_val); + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(res_y + off_y + s2) * xsize + + res_x + off_x + s2] = val; + } + } +} + +void CalculateDiffmapGetBlurred(float *diffmap, int s, int s2, float *blurred) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + blurred[y * xsize + x] = diffmap[(y + s2) * xsize + s + x + s2]; +} + +void GetDiffmapFromBlurred(float *blurred, int s, int s2, float *diffmap) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + const double mul1 = 24.8235314874; + diffmap[(y + s2) * xsize + x + s2] += (float)(mul1) * blurred[y * (xsize - s) + x]; + +} + +void AverageAddImage(float *img, float *tmp0, float *tmp1) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + const int row0 = y * xsize; + if (x == 0) // excute once per y + { + img[row0 + 1] += tmp0[row0]; + img[row0 + 0] += tmp0[row0 + 1]; + img[row0 + 2] += tmp0[row0 + 1]; + + img[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; + img[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; + img[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; + + if (y > 0) { + const int rowd1 = row0 - xsize; + img[rowd1 + 1] += tmp1[row0]; + img[rowd1 + 0] += tmp0[row0]; + + img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; + img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + img[rowu1 + 1] += tmp1[row0]; + img[rowu1 + 0] += tmp0[row0]; + + img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; + img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + } + + if (x >= 2 && x < xsize - 2) + { + img[row0 + x - 1] += tmp0[row0 + x]; + img[row0 + x + 1] += 
tmp0[row0 + x]; + } + + if (x >= 1 && x < xsize - 1) { + if (y > 0) { + const int rowd1 = row0 - xsize; + img[rowd1 + x + 1] += tmp1[row0 + x]; + img[rowd1 + x + 0] += tmp0[row0 + x]; + img[rowd1 + x - 1] += tmp1[row0 + x]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + img[rowu1 + x + 1] += tmp1[row0 + x]; + img[rowu1 + x + 0] += tmp0[row0 + x]; + img[rowu1 + x - 1] += tmp1[row0 + x]; + } + } +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 4ec86456..08d29fb7 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -63,6 +63,12 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err); ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err); ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err); + ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED] = clCreateKernel(ocl.program, "CalculateDiffmapGetBlurred", &err); + ocl.kernel[KERNEL_GETDIFFMAPFROMBLURRED] = clCreateKernel(ocl.program, "GetDiffmapFromBlurred", &err); + ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err); + ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "edgeDetectorMap", &err); + ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "blockDiffMap", &err); + ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "edgeDetectorLowFreq", &err); return ocl; } @@ -471,7 +477,6 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/, } } -// strong todo void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -489,40 +494,134 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size clBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); clBlurEx(rgb2.ch[i], xsize, 
ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); } - // EdgeDetectorLowFreq + cl_int clxsize = xsize; + cl_int clysize = ysize; + cl_int clstep = step; + + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), &result); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g); + clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b); + clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize); + clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize); + clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (xsize + step - 1) / step; + + size_t globalWorkSize[2] = { res_xsize, res_ysize}; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clEdgeDetectorMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clEdgeDetectorMapEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); } -// strong todo void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, + size_t xsize, size_t ysize, size_t step, cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/) { + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int clxsize = xsize; + cl_int clysize = ysize; + cl_int clstep = step; + cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), &rgb.r); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb.g); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.b); + 
clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb2.r); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2.g); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.b); + clSetKernelArg(kernel, 6, sizeof(cl_mem), &block_diff_dc); + clSetKernelArg(kernel, 7, sizeof(cl_mem), &block_diff_ac); + clSetKernelArg(kernel, 8, sizeof(cl_int), &clxsize); + clSetKernelArg(kernel, 9, sizeof(cl_int), &clysize); + clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (xsize + step - 1) / step; + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clBlockDiffMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clBlockDiffMapEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } } -// strong todo void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, + size_t xsize, size_t ysize, size_t step, cl_mem block_diff_ac/*out*/) { cl_int channel_size = xsize * ysize * sizeof(float); static const double kSigma = 14; - static const double kMul = 10; cl_int err = 0; ocl_args_d_t &ocl = getOcl(); ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); - //static const double kSigma[3] = { 1.5, 0.586, 0.4 }; - for (int i = 0; i < 3; i++) { clBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); } + + cl_int clxsize = xsize; + cl_int clysize = ysize; + cl_int clstep = step; + + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_ac); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r); + 
clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g); + clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b); + clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize); + clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize); + clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (xsize + step - 1) / step; + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clEdgeDetectorLowFreqEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clEdgeDetectorLowFreqEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); } void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/) @@ -579,7 +678,29 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w) } } -// ian todo +void clAverageAddImage(cl_mem img, cl_mem tmp0, cl_mem tmp1, size_t xsize, size_t ysize) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_AVERAGEADDIMAGE]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp0); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&tmp1); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clAverageAddImage() clEnqueueNDRangeKernel returned %s.\n", 
TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clAverageAddImage() clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) { if (xsize < 4 || ysize < 4) { @@ -606,45 +727,8 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) static const float scale = 1.0f / (5.0f + 4 * w); clScaleImageEx(tmp1, xsize * ysize, w); - /* TODO - for (int y = 0; y < ysize; y++) { - const int row0 = y * xsize; - result[row0 + 1] += tmp0[row0]; - result[row0 + 0] += tmp0[row0 + 1]; - result[row0 + 2] += tmp0[row0 + 1]; - for (int x = 2; x < xsize - 2; ++x) { - result[row0 + x - 1] += tmp0[row0 + x]; - result[row0 + x + 1] += tmp0[row0 + x]; - } - result[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; - result[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; - result[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; - if (y > 0) { - const int rowd1 = row0 - xsize; - result[rowd1 + 1] += tmp1[row0]; - result[rowd1 + 0] += tmp0[row0]; - for (int x = 1; x < xsize - 1; ++x) { - result[rowd1 + x + 1] += tmp1[row0 + x]; - result[rowd1 + x + 0] += tmp0[row0 + x]; - result[rowd1 + x - 1] += tmp1[row0 + x]; - } - result[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; - result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; - } - if (y + 1 < ysize) { - const int rowu1 = row0 + xsize; - result[rowu1 + 1] += tmp1[row0]; - result[rowu1 + 0] += tmp0[row0]; - for (int x = 1; x < xsize - 1; ++x) { - result[rowu1 + x + 1] += tmp1[row0 + x]; - result[rowu1 + x + 0] += tmp0[row0 + x]; - result[rowu1 + x - 1] += tmp1[row0 + x]; - } - result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; - result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; - } - } - */ + clAverageAddImage(result, tmp0, tmp1, xsize, ysize); + err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, len, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -757,16 +841,17 @@ void 
clCombineChannelsEx( cl_mem edge_detector_map, size_t xsize, size_t ysize, size_t step, - size_t res_xsize, cl_mem result/*out*/) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (xsize + step - 1) / step; + cl_int clxsize = xsize; cl_int clysize = ysize; cl_int clstep = step; - cl_int clres_xsize = res_xsize; cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r); @@ -781,10 +866,9 @@ void clCombineChannelsEx( clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize); clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize); clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clstep); - clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_xsize); - clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&result); - size_t globalWorkSize[2] = { xsize / step, ysize /step }; + size_t globalWorkSize[2] = { res_xsize, res_ysize}; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -797,61 +881,115 @@ void clCombineChannelsEx( } } -// strong todo -void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, int step) +void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step) { -/* - int s2 = (8 - step) / 2; - { - // Upsample and take square root. - std::vector diffmap_out(xsize * ysize); - const size_t res_xsize = (xsize + step - 1) / step; - for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) { - for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) { - size_t res_ix = (res_y * res_xsize + res_x) / step; - float orig_val = (*diffmap)[res_ix]; - constexpr float kInitialSlope = 100; - // TODO(b/29974893): Until that is fixed do not call sqrt on very small - // numbers. 
- double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) - ? kInitialSlope * orig_val - : std::sqrt(orig_val); - for (size_t off_y = 0; off_y < step; ++off_y) { - for (size_t off_x = 0; off_x < step; ++off_x) { - diffmap_out[(res_y + off_y + s2) * xsize + - res_x + off_x + s2] = val; - } - } - } - } - *diffmap = diffmap_out; - } -*/ - static const double kSigma = 8.8510880283; - static const double mul1 = 24.8235314874; - static const double scale = 1.0 / (1.0 + mul1); - const int s = 8 - step; - const int s2 = (8 - step) / 2; - - cl_mem blurred; -/* - for (size_t y = 0; y < ysize - s; ++y) { - for (size_t x = 0; x < xsize - s; ++x) { - blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2]; - } - } -*/ - static const double border_ratio = 0.03027655136; - clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); -/* - for (size_t y = 0; y < ysize - s; ++y) { - for (size_t x = 0; x < xsize - s; ++x) { - (*diffmap)[(y + s2) * xsize + x + s2] - += static_cast(mul1) * blurred[y * (xsize - s) + x]; - } - } -*/ - clScaleImageEx(result, xsize * ysize, scale); + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int clxsize = xsize; + cl_int clysize = ysize; + cl_int clstep = step; + ocl.allocC(xsize * ysize * sizeof(float)); + + cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap); + clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.dstMem); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (xsize + step - 1) / step; + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clUpsampleSquareRootEx() 
clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.dstMem, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + +void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + + cl_int cls = s; + cl_int cls2 = s2; + cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap); + clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&blurred); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clCalculateDiffmapGetBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clCalculateDiffmapGetBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + +void clGetDiffmapFromBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + + cl_int cls = s; + cl_int cls2 = s2; + cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&blurred); + clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2); + clSetKernelArg(kernel, 3, sizeof(cl_mem), 
(void*)&diffmap); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDiffmapFromBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDiffmapFromBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step) +{ + clUpsampleSquareRootEx(diffmap, xsize, ysize, step); + + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + const int s = 8 - step; + int s2 = (8 - step) / 2; + + ocl_args_d_t &ocl = getOcl(); + ocl.allocA((xsize - s) * (ysize - s) * sizeof(float)); + cl_mem blurred = ocl.srcA; + clCalculateDiffmapGetBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred); + + static const double border_ratio = 0.03027655136; + clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); + clGetDiffmapFromBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred); + clScaleImageEx(diffmap, xsize * ysize, scale); } void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, @@ -887,31 +1025,29 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, xyb1_c.b, 0, 0, channel_size, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize * sizeof(float)); - cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize * sizeof(float)); - cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize * sizeof(float)); - + cl_mem mem_result = ocl.allocMem(channel_size); ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = 
ocl.allocMemChannels(channel_size); - size_t res_xsize_; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ - size_t res_ysize_; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ - cl_mem mem_result = ocl.allocMem(channel_size); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (xsize + step - 1) / step; + + cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); + cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); + cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); clMaskHighIntensityChangeEx(xyb0_arg, xyb1_c, xyb0, xyb1, xsize, ysize); - //clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, edge_detector_map); - clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, block_diff_dc, block_diff_ac); - clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, block_diff_ac); + clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, step, edge_detector_map); + clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); + clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_ac); clMaskEx(xyb0_arg, xyb1, xsize, ysize, mask, mask_dc); - size_t xsize_ = 0, ysize_ = 0; // ³ÉÔ±±äÁ¿£¬ÐèÒª´«µÝ - clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize_, ysize_, step, res_xsize_, mem_result); + clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, step, mem_result); clCalculateDiffmapEx(mem_result, xsize, ysize, step); - cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); memcpy(result, result_r, channel_size); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 3e20a2f3..ecd3af86 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -55,6 +55,13 @@ enum KernelName { KERNEL_COMBINECHANNELS, KERNEL_MASKHIGHINTENSITYCHANGE, KERNEL_DIFFPRECOMPUTE, + KERNEL_UPSAMPLESQUAREROOT, + KERNEL_CALCULATEDIFFMAPGETBLURRED, + 
KERNEL_GETDIFFMAPFROMBLURRED, + KERNEL_AVERAGEADDIMAGE, + KERNEL_EDGEDETECTOR, + KERNEL_BLOCKDIFFMAP, + KERNEL_EDGEDETECTORLOWFREQ, KERNEL_COUNT, }; From 4aeec41db3316788059ec1c72b0dca595d996021 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 6 May 2017 15:56:27 +0800 Subject: [PATCH 033/189] test for clDiffmapOpsinDynamicsImage --- third_party/butteraugli/butteraugli/butteraugli.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 2fd045d8..cc624bed 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1120,14 +1120,14 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( const std::vector> &xyb0_arg, std::vector> &xyb1, std::vector &result) { -/* + if (g_useOpenCL && xsize_ > 100 && ysize_ > 100) { result.resize(xsize_ * ysize_); clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, result.data()); + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data()); } -*/ + if (xsize_ < 8 || ysize_ < 8) return; auto xyb0 = xyb0_arg; From da654cb6cfdda98f3b9d59161b68e4014a51ebcd Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 6 May 2017 20:17:19 +0800 Subject: [PATCH 034/189] fix runtime bug --- clguetzli/clguetzli.cl | 23 +++++----- clguetzli/clguetzli.cpp | 44 +++++++++---------- .../butteraugli/butteraugli/butteraugli.cc | 1 + 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 86ffb63a..cb93294b 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -207,8 +207,8 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int const int oxsize = xsize / xstep; - const int sample_x = x / xstep * xstep; - const int sample_y = y / ystep * 
ystep; + const int sample_x = x / xstep; + const int sample_y = y / ystep; pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; } @@ -443,10 +443,10 @@ __kernel void DoMask( } -__kernel void ScaleImage(double scale, __global float *result) +__kernel void ScaleImage(float scale, __global float *result) { const int i = get_global_id(0); - result[i] *= (float)(scale); + result[i] *= scale; } double DotProduct(float u[3], double v[3]) { @@ -582,8 +582,11 @@ __kernel void edgeDetectorMap(__global float *result, int pos_x = res_x * step; int pos_y = res_y * step; - if (res_x * step >= xsize - (8 - step)) return; - if (res_y * step >= ysize - (8 - step)) return; + if (pos_x >= xsize - (8 - step)) return; + if (pos_y >= ysize - (8 - step)) return; + + pos_x = min(pos_x, xsize - 8); + pos_y = min(pos_y, ysize - 8); int local_count = 0; double local_xyb[3] = { 0 }; @@ -1330,7 +1333,7 @@ __kernel void DiffPrecompute( mask_b[ix] = (float)(m); } -void UpsampleSquareRoot(float *diffmap, size_t xsize, size_t ysize, int step, float *diffmap_out) +__kernel void UpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out) { const int res_x = get_global_id(0); const int res_y = get_global_id(1); @@ -1357,7 +1360,7 @@ void UpsampleSquareRoot(float *diffmap, size_t xsize, size_t ysize, int step, fl } } -void CalculateDiffmapGetBlurred(float *diffmap, int s, int s2, float *blurred) +kernel void CalculateDiffmapGetBlurred(__global float *diffmap, int s, int s2, __global float *blurred) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -1367,7 +1370,7 @@ void CalculateDiffmapGetBlurred(float *diffmap, int s, int s2, float *blurred) blurred[y * xsize + x] = diffmap[(y + s2) * xsize + s + x + s2]; } -void GetDiffmapFromBlurred(float *blurred, int s, int s2, float *diffmap) +kernel void GetDiffmapFromBlurred(__global float *blurred, int s, int s2, __global float *diffmap) { const int x = get_global_id(0); const int y = 
get_global_id(1); @@ -1379,7 +1382,7 @@ void GetDiffmapFromBlurred(float *blurred, int s, int s2, float *diffmap) } -void AverageAddImage(float *img, float *tmp0, float *tmp1) +__kernel void AverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1) { const int x = get_global_id(0); const int y = get_global_id(1); diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 08d29fb7..722e56c8 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -45,15 +45,7 @@ ocl_args_d_t& getOcl(void) } } ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err); - if (CL_SUCCESS != err) - { - LogError("Error: clCreateKernel(MinSquareVal) for source program returned %s.\n", TranslateOpenCLError(err)); - } ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err); - if (CL_SUCCESS != err) - { - LogError("Error: clCreateKernel(Convolution) for source program returned %s.\n", TranslateOpenCLError(err)); - } ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err); ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err); ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err); @@ -63,6 +55,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err); ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err); ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err); + ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "UpsampleSquareRoot", &err); ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED] = clCreateKernel(ocl.program, "CalculateDiffmapGetBlurred", &err); ocl.kernel[KERNEL_GETDIFFMAPFROMBLURRED] = clCreateKernel(ocl.program, "GetDiffmapFromBlurred", &err); ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err); @@ -316,12 
+309,12 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, cl_int clxstep = xstep; cl_int clystep = ystep; cl_kernel kernel = ocl.kernel[KERNEL_DOWNSAMPLE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep); clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep); - size_t globalWorkSize[2] = { ysize, xsize }; + size_t globalWorkSize[2] = { xsize, ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -367,7 +360,7 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB); - clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, result ? result : image); + clUpsampleEx(ocl.srcB, xsize, ysize, xstep, ystep, result ? 
result : image); } else { @@ -468,12 +461,12 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/, err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { - LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clMaskHighIntensityChangeEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); } err = clFinish(ocl.commandQueue); if (CL_SUCCESS != err) { - LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + LogError("Error: clMaskHighIntensityChangeEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } } @@ -512,7 +505,7 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep); const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize}; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -555,7 +548,7 @@ void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep); const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -606,7 +599,7 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep); const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize }; err = 
clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -658,14 +651,13 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w) cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); - cl_int clsize = size; cl_float clscale = w; cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; - clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&clscale); + clSetKernelArg(kernel, 0, sizeof(cl_float), (void*)&clscale); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img); - size_t globalWorkSize[1] = { clsize }; + size_t globalWorkSize[1] = { size }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -734,6 +726,8 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) { LogError("Error: clAverage5x5Ex() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); } + err = clFinish(ocl.commandQueue); + clScaleImageEx(img, xsize * ysize, scale); } @@ -847,7 +841,7 @@ void clCombineChannelsEx( ocl_args_d_t &ocl = getOcl(); const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; cl_int clxsize = xsize; cl_int clysize = ysize; @@ -896,10 +890,10 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize); clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step); - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&ocl.dstMem); const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, 
NULL); @@ -907,6 +901,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step { LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); } + err = clFinish(ocl.commandQueue); err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.dstMem, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -1030,7 +1025,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, ocl_channels mask_dc = ocl.allocMemChannels(channel_size); const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); @@ -1049,6 +1044,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clCalculateDiffmapEx(mem_result, xsize, ysize, step); cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); memcpy(result, result_r, channel_size); ocl.releaseMemChannels(xyb0_arg); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index cc624bed..4faa70c7 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1126,6 +1126,7 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( result.resize(xsize_ * ysize_); clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data()); + return; } From 2ceb6350ef14e9d0ce00015a64e29491e1d749ad Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 6 May 2017 20:21:34 +0800 Subject: [PATCH 
035/189] remove useless code --- clguetzli/clguetzli.cl | 126 -------------------------- clguetzli/clguetzli.cpp | 196 ---------------------------------------- clguetzli/ocl.h | 2 - 3 files changed, 324 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index cb93294b..6351aa7a 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -70,132 +70,6 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl result[ox * ysize + y] = sum * scale; } -/* -__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result, - int len, int offset, float border_ratio) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - float weight_no_border = 0; - for (int j = 0; j <= 2 * offset; j++) - { - weight_no_border += multipliers[j]; - } - - int minx = x < offset ? 0 : x - offset; - int maxx = min(xsize, x + len - offset); - - int miny = y < offset ? 
0 : y - offset; - int maxy = min(ysize, y + len - offset); - - float weightX = 0.0; - for (int j = minx; j < maxx; j++) - { - weightX += multipliers[j - x + offset]; - } - - weightX = (1.0 - border_ratio) * weightX + border_ratio * weight_no_border; - - float weightY = 0.0; - for (int j = miny; j < maxy; j++) - { - weightY += multipliers[j - y + offset]; - } - - weightY = (1.0 - border_ratio) * weightY + border_ratio * weight_no_border; - - - float sum = 0.0; - for (int j = miny; j < maxy; j++) - { - float sumx = 0.0; - for (int i = minx; i < maxx; i++) - { - sumx += inp[j * xsize + i] * multipliers[i - x + offset]; - } - - sum += sumx * multipliers[j - y + offset]; - } - - result[y * xsize + x] = sum / weightY / weightX; -} -*/ - -__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result, - int len, int offset, float border_ratio) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - float weight_no_border = 0; - for (int j = 0; j <= 2 * offset; j++) - { - weight_no_border += multipliers[j]; - } - - int minx = x < offset ? 
0 : x - offset; - int maxx = min(xsize, x + len - offset); - - float weight = 0.0; - for (int j = minx; j < maxx; j++) - { - weight += multipliers[j - x + offset]; - } - - weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - float scale = 1.0 / weight; - - float sum = 0.0; - for (int j = minx; j < maxx; j++) - { - sum += inp[y * xsize + j] * multipliers[j - x + offset]; - } - - result[x * ysize + y] = sum * scale; -} - -__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result, - int len, int offset, float border_ratio) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - float weight_no_border = 0; - for (int j = 0; j <= 2 * offset; j++) - { - weight_no_border += multipliers[j]; - } - - int miny = y < offset ? 0 : y - offset; - int maxy = min(ysize, y + len - offset); - - float weight = 0.0; - for (int j = miny; j < maxy; j++) - { - weight += multipliers[j - y + offset]; - } - - weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - float scale = 1.0 / weight; - - float sum = 0.0; - for (int j = miny; j < maxy; j++) - { - sum += inp[j * xsize + x] * multipliers[j - y + offset]; - } - - result[y * xsize + x] = sum * scale; -} __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep) { diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 722e56c8..989c5e2f 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -46,8 +46,6 @@ ocl_args_d_t& getOcl(void) } ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err); ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err); - ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err); - ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err); ocl.kernel[KERNEL_DOWNSAMPLE] = 
clCreateKernel(ocl.program, "DownSample", &err); ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err); ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err); @@ -66,200 +64,6 @@ ocl_args_d_t& getOcl(void) return ocl; } -void clMinSquareVal(size_t square_size, size_t offset, - size_t xsize, size_t ysize, - float *values) -{ - cl_int err = CL_SUCCESS; - ocl_args_d_t &ocl = getOcl(); - - ocl.allocA(sizeof(cl_float) * xsize * ysize); - ocl.allocC(sizeof(cl_float) * xsize * ysize); - - memcpy(ocl.inputA, values, sizeof(cl_float) * xsize * ysize); - - cl_int cloffset = offset; - cl_int clsquare_size = square_size; - - cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset); - - size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); - } - - cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err); - if (CL_SUCCESS != err) - { - LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err)); - } - - memcpy(values, resultPtr, sizeof(cl_float) * xsize * ysize); -} - -void clConvolution(size_t xsize, size_t ysize, - size_t 
xstep, - size_t len, size_t offset, - const float* multipliers, - const float* inp, - float border_ratio, - float* result) -{ - cl_int err = CL_SUCCESS; - ocl_args_d_t &ocl = getOcl(); - - size_t oxsize = xsize / xstep; - - ocl.allocA(sizeof(cl_float) * len); - ocl.allocB(sizeof(cl_float) * xsize * ysize); - ocl.allocC(sizeof(cl_float) * oxsize * ysize); - - memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len); - memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize); - - cl_int clxsize = xsize; - cl_int clxstep = xstep; - cl_int cllen = len; - cl_int cloffset = offset; - cl_float clborder_ratio = border_ratio; - - cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen); - clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); - - size_t globalWorkSize[2] = { xsize / xstep, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); - } - - cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * oxsize * ysize, 0, NULL, NULL, &err); - if (CL_SUCCESS != err) - { - LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - 
LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err)); - } - - memcpy(result, resultPtr, sizeof(cl_float) * oxsize * ysize); -} - -void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) -{ - double m = 2.25; // Accuracy increases when m is increased. - const double scaler = -1.0 / (2 * sigma * sigma); - // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} - const int diff = std::max(1, m * fabs(sigma)); - const int expn_size = 2 * diff + 1; - std::vector expn(expn_size); - for (int i = -diff; i <= diff; ++i) { - expn[i + diff] = static_cast(exp(scaler * i * i)); - } - - const int xstep = std::max(1, int(sigma / 3)); - - cl_int err = CL_SUCCESS; - ocl_args_d_t &ocl = getOcl(); - - ocl.allocA(sizeof(cl_float) * expn_size); - ocl.allocB(sizeof(cl_float) * xsize * ysize); - ocl.allocC(sizeof(cl_float) * xsize * ysize); - - memcpy(ocl.inputA, expn.data(), sizeof(cl_float) * expn_size); - memcpy(ocl.inputB, channel, sizeof(cl_float) * xsize * ysize); - - cl_int clxsize = xsize; - cl_int clxstep = xstep; - cl_int cllen = expn_size; - cl_int cloffset = diff; - cl_float clborder_ratio = border_ratio; - - cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen); - clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); - - size_t globalWorkSize[2] = { xsize / xstep, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - - globalWorkSize[0] = ysize / xstep; - globalWorkSize[1] = xsize / xstep; - 
clxsize = ysize; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.srcB); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen); - clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); - - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - - cl_int clstep = xstep; - if (clstep <= 1) - { - cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.srcB, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize); - } - else - { - kernel = ocl.kernel[KERNEL_DOWNSAMPLE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clstep); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clstep); - - globalWorkSize[0] = ysize; - globalWorkSize[1] = xsize; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - - cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize); - } -} - void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio, diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index ecd3af86..7babc74e 100644 --- a/clguetzli/ocl.h 
+++ b/clguetzli/ocl.h @@ -46,8 +46,6 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); enum KernelName { KERNEL_MINSQUAREVAL = 0, KERNEL_CONVOLUTION, - KERNEL_CONVOLUTIONX, - KERNEL_CONVOLUTIONY, KERNEL_DOWNSAMPLE, KERNEL_OPSINDYNAMICSIMAGE, KERNEL_DOMASK, From 6981d9f5014eeee871c633ab14cfa98c29e3656c Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 6 May 2017 22:29:57 +0800 Subject: [PATCH 036/189] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 59 ++++---- clguetzli/clguetzli.h | 22 ++- clguetzli/clguetzli_test.cpp | 135 ++++++++++++++++++ clguetzli/clguetzli_test.h | 16 +++ clguetzli/utils.h | 5 - guetzli.vcxproj | 2 + guetzli.vcxproj.filters | 6 + .../butteraugli/butteraugli/butteraugli.cc | 13 ++ 8 files changed, 211 insertions(+), 47 deletions(-) create mode 100644 clguetzli/clguetzli_test.cpp create mode 100644 clguetzli/clguetzli_test.h diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 989c5e2f..9b1ae457 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -2,7 +2,6 @@ #include #include #include "clguetzli.h" -#include "ocl.h" extern bool g_useOpenCL = false; @@ -238,19 +237,30 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* ocl.releaseMemChannels(rgb_blurred); } -void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/, - ocl_channels xyb1/*in,out*/, - ocl_channels c0, - ocl_channels c1, +void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, + ocl_channels xyb1/*in,out*/, size_t xsize, size_t ysize) { + cl_int channel_size = xsize * ysize * sizeof(float); + cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); + ocl_channels c0 = ocl.allocMemChannels(channel_size); + ocl_channels c1 = ocl.allocMemChannels(channel_size); + + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.r, c0.r, 0, 0, channel_size, 0, 
NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.g, c0.g, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.b, c0.b, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0_arg.r); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0_arg.g); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0_arg.b); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r); clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g); clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b); @@ -272,6 +282,9 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/, { LogError("Error: clMaskHighIntensityChangeEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } + + ocl.releaseMemChannels(c0); + ocl.releaseMemChannels(c1); } void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/) @@ -802,26 +815,16 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb0_arg = ocl.allocMemChannels(channel_size); - ocl_channels xyb1 = ocl.allocMemChannels(channel_size); - ocl_channels xyb0 = ocl.allocMemChannels(channel_size); - ocl_channels xyb1_c = ocl.allocMemChannels(channel_size); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.r, 
CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - - err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.r, xyb0.r, 0, 0, channel_size, 0, NULL, NULL); - err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.g, xyb0.g, 0, 0, channel_size, 0, NULL, NULL); - err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.b, xyb0.b, 0, 0, channel_size, 0, NULL, NULL); - err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, xyb1_c.r, 0, 0, channel_size, 0, NULL, NULL); - err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, xyb1_c.g, 0, 0, channel_size, 0, NULL, NULL); - err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, xyb1_c.b, 0, 0, channel_size, 0, NULL, NULL); err = clFinish(ocl.commandQueue); cl_mem mem_result = ocl.allocMem(channel_size); @@ -835,13 +838,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); - clMaskHighIntensityChangeEx(xyb0_arg, xyb1_c, xyb0, xyb1, xsize, ysize); + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, step, edge_detector_map); - 
clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); - clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_ac); + clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map); + clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); + clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); - clMaskEx(xyb0_arg, xyb1, xsize, ysize, mask, mask_dc); + clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc); clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, step, mem_result); @@ -851,10 +854,8 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, err = clFinish(ocl.commandQueue); memcpy(result, result_r, channel_size); - ocl.releaseMemChannels(xyb0_arg); ocl.releaseMemChannels(xyb1); ocl.releaseMemChannels(xyb0); - ocl.releaseMemChannels(xyb1_c); clReleaseMemObject(edge_detector_map); clReleaseMemObject(block_diff_dc); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 6f29dd35..91dd25ac 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,22 +1,18 @@ #pragma once #include "CL\cl.h" +#include "ocl.h" + extern bool g_useOpenCL; -void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr); +void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, + ocl_channels xyb1/*in,out*/, + size_t xsize, size_t ysize); -void clMinSquareVal(size_t square_size, size_t offset, +void clMaskEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, - float *values); - -void clConvolution(size_t xsize, size_t ysize, - size_t xstep, - size_t len, size_t offset, - const float* multiplier, - const float* inp, - float border_ratio, - float* result); + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/); -void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); +void clBlurEx(cl_mem image, 
size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr); void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); @@ -24,4 +20,4 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, float* r2, float* g2, float* b2, size_t xsize, size_t ysize, size_t step, - float* result); \ No newline at end of file + float* result); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp new file mode 100644 index 00000000..2ff85802 --- /dev/null +++ b/clguetzli/clguetzli_test.cpp @@ -0,0 +1,135 @@ +#include +#include +#include +#include "clguetzli_test.h" +#include "clguetzli.h" +#include "ocl.h" + +bool floatCompare(const float* a, const float* b, size_t size) +{ + for (int i = 0; i < size; i++) + { + if (fabs(a[i] - b[i]) > 0.001) + { + return false; + } + } + return true; +} + +void clMaskHighIntensityChange(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b, + const float* result_r2, const float* result_g2, const float* result_b2) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + + err 
= clFinish(ocl.commandQueue); + + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + + floatCompare(result_r, r0_r, xsize * ysize); + floatCompare(result_g, r0_g, xsize * ysize); + floatCompare(result_b, r0_b, xsize * ysize); + floatCompare(result_r2, r1_r, xsize * ysize); + floatCompare(result_g2, r1_g, xsize * ysize); + floatCompare(result_b2, r1_b, xsize * ysize); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); +} + +// strong to +void clEdgeDetectorMap(void) +{ + +} + +// strong todo +void clBlockDiffMap(void) +{ + +} + +// strong to +void clEdgeDetectorLowFreq(void) +{ + +} + +void clMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* mask_r, const float* mask_g, const float* mask_b, + const float* maskdc_r, const float* maskdc_g, const float* maskdc_b) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size); + + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels 
mask_dc = ocl.allocMemChannels(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + + err = clFinish(ocl.commandQueue); + + clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + + floatCompare(mask_r, r0_r, xsize * ysize); + floatCompare(mask_g, r0_g, xsize * ysize); + floatCompare(mask_b, r0_b, xsize * ysize); + floatCompare(maskdc_r, r1_r, xsize * ysize); + floatCompare(maskdc_g, r1_g, xsize * ysize); + floatCompare(maskdc_b, r1_b, xsize * ysize); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + +// ian todo +void clCombineChannels(void) +{ 
+ +} + +// ian todo +void clCalculateDiffmapEx(void) +{ + +} diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h new file mode 100644 index 00000000..6d3f58c2 --- /dev/null +++ b/clguetzli/clguetzli_test.h @@ -0,0 +1,16 @@ +#pragma once +#include "ocl.h" + +ocl_args_d_t& getOcl(void); + +void clMaskHighIntensityChange(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b, + const float* result_r2, const float* result_g2, const float* result_b2); + +void clMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* mask_r, const float* mask_g, const float* mask_b, + const float* maskdc_r, const float* maskdc_g, const float* maskdc_b); diff --git a/clguetzli/utils.h b/clguetzli/utils.h index 294f7137..fc68fec5 100644 --- a/clguetzli/utils.h +++ b/clguetzli/utils.h @@ -19,11 +19,6 @@ * Intel Corporation is the author of the Materials, and requests that all * problem reports or change requests be submitted to it directly *****************************************************************************/ - -#include "CL\cl.h" -#include - - #pragma once // Print useful information to the default output. 
Same usage as with printf diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 3aa98abf..e48d1682 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -192,6 +192,7 @@ + @@ -286,6 +287,7 @@ + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 308cad47..a74b94c9 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -303,6 +303,9 @@ clguetzli + + clguetzli + @@ -563,6 +566,9 @@ clguetzli + + clguetzli + diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 4faa70c7..4ac771dd 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -41,6 +41,7 @@ #include #include "clguetzli\clguetzli.h" +#include "clguetzli\clguetzli_test.h" // Restricted pointers speed up Convolution(); MSVC uses a different keyword. #ifdef _MSC_VER @@ -828,6 +829,12 @@ void MaskHighIntensityChange( } } } + + clMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), + c1[0].data(), c1[1].data(), c1[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb0[0].data(), xyb1[1].data(), xyb1[2].data()); } double SimpleGamma(double v) { @@ -1609,6 +1616,12 @@ void Mask(const std::vector > &xyb0, ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); } + + clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); } } // namespace butteraugli From 7e1ad82e99777e372fe0792efd7236852916e820 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 7 May 2017 13:25:32 +0800 Subject: [PATCH 037/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
clguetzli/clguetzli.h | 10 ++ clguetzli/clguetzli_test.cpp | 118 ++++++++++++++++-- clguetzli/clguetzli_test.h | 15 +++ .../butteraugli/butteraugli/butteraugli.cc | 14 +++ 4 files changed, 149 insertions(+), 8 deletions(-) diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 91dd25ac..a714bf44 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -12,6 +12,16 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/); +void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/); + +void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, + size_t xsize, size_t ysize, size_t step, + cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/); + +void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, + size_t xsize, size_t ysize, size_t step, + cl_mem block_diff_ac/*out*/); + void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr); void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 2ff85802..eeabe8a8 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -5,16 +5,17 @@ #include "clguetzli.h" #include "ocl.h" -bool floatCompare(const float* a, const float* b, size_t size) +int floatCompare(const float* a, const float* b, size_t size) { + int count = 0; for (int i = 0; i < size; i++) { if (fabs(a[i] - b[i]) > 0.001) { - return false; + count++; } } - return true; + return count; } void clMaskHighIntensityChange(const float* r, const float* g, const float* b, @@ -35,7 +36,6 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, 
CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); @@ -46,6 +46,7 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); floatCompare(result_r, r0_r, xsize * ysize); floatCompare(result_g, r0_g, xsize * ysize); @@ -59,21 +60,122 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, } // strong to -void clEdgeDetectorMap(void) +void clEdgeDetectorMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result) { + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t edgemap_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size); + cl_mem edge = ocl.allocMem(edgemap_size); + + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, 
r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge); + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + floatCompare(result, r_r, res_xsize * res_ysize * 3); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + clReleaseMemObject(edge); } // strong todo -void clBlockDiffMap(void) +void clBlockDiffMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc, const float* result_diff_ac) { + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size); + cl_mem block_diff_dc = ocl.allocMem(reschannel_size); + cl_mem block_diff_ac = ocl.allocMem(reschannel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + 
err = clFinish(ocl.commandQueue); + + clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); + + cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + floatCompare(r_ac, result_diff_ac, res_xsize * res_ysize * 3); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + clReleaseMemObject(block_diff_ac); + clReleaseMemObject(block_diff_dc); } // strong to -void clEdgeDetectorLowFreq(void) +void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc) { + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size); + + cl_mem block_diff_dc = ocl.allocMem(reschannel_size); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, 
NULL, NULL); + err = clFinish(ocl.commandQueue); + + clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc); + + cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + clReleaseMemObject(block_diff_dc); } void clMask(const float* r, const float* g, const float* b, @@ -97,7 +199,6 @@ void clMask(const float* r, const float* g, const float* b, clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/); @@ -108,6 +209,7 @@ void clMask(const float* r, const float* g, const float* b, cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); floatCompare(mask_r, r0_r, xsize * ysize); floatCompare(mask_g, r0_g, xsize * ysize); diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 6d3f58c2..3b62144e 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -9,6 +9,21 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* result_r, const float* result_g, const float* result_b, const float* result_r2, const float* result_g2, const 
float* result_b2); +void clEdgeDetectorMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result); + +void clBlockDiffMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc, const float* result_diff_ac); + +void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc); + void clMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 4ac771dd..138234ad 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1200,6 +1200,10 @@ void ButteraugliComparator::BlockDiffMap( } } } + clBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*block_diff_dc).data(), (*block_diff_ac).data()); } void ButteraugliComparator::EdgeDetectorMap( @@ -1232,6 +1236,11 @@ void ButteraugliComparator::EdgeDetectorMap( } } } + + clEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*edge_detector_map).data()); } void ButteraugliComparator::EdgeDetectorLowFreq( @@ -1288,6 +1297,11 @@ void ButteraugliComparator::EdgeDetectorLowFreq( } } } + + clEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*block_diff_ac).data()); } void ButteraugliComparator::CombineChannels( From 
7ef1b6dccad8fa0c68ae0e18abcf1e5565594ae8 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 7 May 2017 14:05:44 +0800 Subject: [PATCH 038/189] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E7=94=A8=E4=BE=8B=E6=A1=86=E6=9E=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli_test.cpp | 108 +++++++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 17 deletions(-) diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index eeabe8a8..6718a17e 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -5,7 +5,9 @@ #include "clguetzli.h" #include "ocl.h" -int floatCompare(const float* a, const float* b, size_t size) +#define FLOAT_COMPARE(a, b, c) floatCompare((a), (b), (c), __FUNCTION__, __LINE__ ) + +int floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line) { int count = 0; for (int i = 0; i < size; i++) @@ -15,6 +17,10 @@ int floatCompare(const float* a, const float* b, size_t size) count++; } } + if (count > 0) + { + LogError("CHK %s(%d) %d:%d\r\n", szFunc, line, count, size); + } return count; } @@ -48,12 +54,20 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - floatCompare(result_r, r0_r, xsize * ysize); - floatCompare(result_g, r0_g, xsize * ysize); - floatCompare(result_b, r0_b, xsize * ysize); - floatCompare(result_r2, r1_r, xsize * ysize); - floatCompare(result_g2, r1_g, xsize * ysize); - floatCompare(result_b2, r1_b, xsize * ysize); + FLOAT_COMPARE(result_r, r0_r, xsize * ysize); + FLOAT_COMPARE(result_g, r0_g, xsize * ysize); + FLOAT_COMPARE(result_b, r0_b, xsize * ysize); + FLOAT_COMPARE(result_r2, r1_r, xsize * ysize); + FLOAT_COMPARE(result_g2, r1_g, xsize * ysize); + FLOAT_COMPARE(result_b2, 
r1_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, channel_size, NULL, NULL); + err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); ocl.releaseMemChannels(xyb1); @@ -89,7 +103,10 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b, cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - floatCompare(result, r_r, res_xsize * res_ysize * 3); + FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, edgemap_size, NULL, NULL); + err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); ocl.releaseMemChannels(xyb1); @@ -129,8 +146,12 @@ void clBlockDiffMap(const float* r, const float* g, const float* b, cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3); - floatCompare(r_ac, result_diff_ac, res_xsize * res_ysize * 3); + FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, reschannel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, reschannel_size, NULL, NULL); + err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); ocl.releaseMemChannels(xyb1); @@ 
-170,7 +191,10 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, reschannel_size, NULL, NULL); + err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); ocl.releaseMemChannels(xyb1); @@ -211,12 +235,20 @@ void clMask(const float* r, const float* g, const float* b, cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - floatCompare(mask_r, r0_r, xsize * ysize); - floatCompare(mask_g, r0_g, xsize * ysize); - floatCompare(mask_b, r0_b, xsize * ysize); - floatCompare(maskdc_r, r1_r, xsize * ysize); - floatCompare(maskdc_g, r1_g, xsize * ysize); - floatCompare(maskdc_b, r1_b, xsize * ysize); + FLOAT_COMPARE(mask_r, r0_r, xsize * ysize); + FLOAT_COMPARE(mask_g, r0_g, xsize * ysize); + FLOAT_COMPARE(mask_b, r0_b, xsize * ysize); + FLOAT_COMPARE(maskdc_r, r1_r, xsize * ysize); + FLOAT_COMPARE(maskdc_g, r1_g, xsize * ysize); + FLOAT_COMPARE(maskdc_b, r1_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, channel_size, NULL, NULL); + err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); 
ocl.releaseMemChannels(rgb2); @@ -235,3 +267,45 @@ void clCalculateDiffmapEx(void) { } + +// +void clBlur(void) +{ + +} + +// +void clConvolution(void) +{ + +} + +// +void clUpsample(void) +{ + +} + +// +void clDiffPrecompute(void) +{ + +} + +// +void clAverage5x5(void) +{ + +} + +// +void clMinSquareVal(void) +{ + +} + +// +void clScaleImage(void) +{ + +} From 8d356925322225733afe7ae73b51e833acfb7400 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 7 May 2017 16:56:09 +0800 Subject: [PATCH 039/189] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E5=88=86=E5=B7=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli_test.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 6718a17e..b14eb15d 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -268,43 +268,43 @@ void clCalculateDiffmapEx(void) } -// +// strong todo void clBlur(void) { } -// +// strong todo void clConvolution(void) { } -// +// strong todo void clUpsample(void) { } -// +// ian todo void clDiffPrecompute(void) { } -// +// ian todo void clAverage5x5(void) { } -// +// strong todo void clMinSquareVal(void) { } -// +// ian todo void clScaleImage(void) { From 5864a11ba25c134967551bcea6d2c36050bcf107 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 7 May 2017 21:06:44 +0800 Subject: [PATCH 040/189] =?UTF-8?q?MapBuffer=E4=B9=8B=E5=90=8E=E8=A6=81?= =?UTF-8?q?=E8=BF=9B=E8=A1=8CUnmap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 9b1ae457..487d260d 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -233,6 +233,11 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* memcpy(g, 
result_g, channel_size); memcpy(b, result_b, channel_size); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, channel_size, NULL, NULL); + clFinish(ocl.commandQueue); + ocl.releaseMemChannels(rgb); ocl.releaseMemChannels(rgb_blurred); } @@ -854,6 +859,9 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, err = clFinish(ocl.commandQueue); memcpy(result, result_r, channel_size); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, channel_size, NULL, NULL); + clFinish(ocl.commandQueue); + ocl.releaseMemChannels(xyb1); ocl.releaseMemChannels(xyb0); From 8474de05eab0784518f0897b20f7e19953995214 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 7 May 2017 21:46:39 +0800 Subject: [PATCH 041/189] =?UTF-8?q?=E5=85=88=E6=8E=92=E6=9F=A5>100*100?= =?UTF-8?q?=E7=9A=84=E8=AE=A1=E7=AE=97=E7=B2=BE=E5=BA=A6=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli_test.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index b14eb15d..e7410a52 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -30,6 +30,8 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* result_r, const float* result_g, const float* result_b, const float* result_r2, const float* result_g2, const float* result_b2) { + if (xsize < 100 || ysize < 100) return; + size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -79,6 +81,8 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result) { + if (xsize < 100 || ysize < 100) return; + size_t 
channel_size = xsize * ysize * sizeof(float); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -119,6 +123,8 @@ void clBlockDiffMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc, const float* result_diff_ac) { + if (xsize < 100 || ysize < 100) return; + size_t channel_size = xsize * ysize * sizeof(float); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -166,6 +172,8 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc) { + if (xsize < 100 || ysize < 100) return; + size_t channel_size = xsize * ysize * sizeof(float); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -208,6 +216,8 @@ void clMask(const float* r, const float* g, const float* b, const float* mask_r, const float* mask_g, const float* mask_b, const float* maskdc_r, const float* maskdc_g, const float* maskdc_b) { + if (xsize < 100 || ysize < 100) return; + size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); From 1e8972fb31ccdd2b2b30abb6619fd0331e78803a Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Sun, 7 May 2017 23:55:38 +0800 Subject: [PATCH 042/189] Remove _constant for opencl 1.2 --- clguetzli/clguetzli.cl | 158 +++++++++++++++++++++-------------------- 1 file changed, 80 insertions(+), 78 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 6351aa7a..59f80050 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -364,7 +364,7 @@ __kernel void CombineChannels( DotProduct((float *)&edge_detector_map[3 * res_ix], mask)); } -inline double Interpolate(__constant double *array, int size, double sx) { +inline double Interpolate(const double *array, int size, double 
sx) { double ix = fabs(sx); int baseix = (int)(ix); @@ -381,37 +381,38 @@ inline double Interpolate(__constant double *array, int size, double sx) { return res; } -__constant double XybLowFreqToVals_inc = 5.2511644570349185; -__constant double XybLowFreqToVals_lut[21] = { - 0, - 1 * XybLowFreqToVals_inc, - 2 * XybLowFreqToVals_inc, - 3 * XybLowFreqToVals_inc, - 4 * XybLowFreqToVals_inc, - 5 * XybLowFreqToVals_inc, - 6 * XybLowFreqToVals_inc, - 7 * XybLowFreqToVals_inc, - 8 * XybLowFreqToVals_inc, - 9 * XybLowFreqToVals_inc, - 10 * XybLowFreqToVals_inc, - 11 * XybLowFreqToVals_inc, - 12 * XybLowFreqToVals_inc, - 13 * XybLowFreqToVals_inc, - 14 * XybLowFreqToVals_inc, - 15 * XybLowFreqToVals_inc, - 16 * XybLowFreqToVals_inc, - 17 * XybLowFreqToVals_inc, - 18 * XybLowFreqToVals_inc, - 19 * XybLowFreqToVals_inc, - 20 * XybLowFreqToVals_inc, -}; - void XybLowFreqToVals(double x, double y, double z, double *valx, double *valy, double *valz) { const double xmul = 6.64482198135; const double ymul = 0.837846224276; const double zmul = 7.34905756986; const double y_to_z_mul = 0.0812519812628; + + const double XybLowFreqToVals_inc = 5.2511644570349185; + const double XybLowFreqToVals_lut[21] = { + 0, + 1 * XybLowFreqToVals_inc, + 2 * XybLowFreqToVals_inc, + 3 * XybLowFreqToVals_inc, + 4 * XybLowFreqToVals_inc, + 5 * XybLowFreqToVals_inc, + 6 * XybLowFreqToVals_inc, + 7 * XybLowFreqToVals_inc, + 8 * XybLowFreqToVals_inc, + 9 * XybLowFreqToVals_inc, + 10 * XybLowFreqToVals_inc, + 11 * XybLowFreqToVals_inc, + 12 * XybLowFreqToVals_inc, + 13 * XybLowFreqToVals_inc, + 14 * XybLowFreqToVals_inc, + 15 * XybLowFreqToVals_inc, + 16 * XybLowFreqToVals_inc, + 17 * XybLowFreqToVals_inc, + 18 * XybLowFreqToVals_inc, + 19 * XybLowFreqToVals_inc, + 20 * XybLowFreqToVals_inc, + }; + z += y_to_z_mul * y; *valz = z * zmul; *valx = x * xmul; @@ -863,32 +864,6 @@ void ButteraugliFFTSquared(double block[kBlockSize]) { } } -__constant double MakeHighFreqColorDiffDy_off = 1.4103373714040413; 
-__constant double MakeHighFreqColorDiffDy_inc = 0.7084088867024; -__constant double MakeHighFreqColorDiffDy_lut[21] ={ - 0.0, - MakeHighFreqColorDiffDy_off, - MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, -}; - double RemoveRangeAroundZero(double v, double range) { if (v >= -range && v < range) { return 0; @@ -963,6 +938,33 @@ void ButteraugliBlockDiff(double xyb0[3 * kBlockSize], const double ymul2 = 1.51983458269; const double zmul = 2.4; + const double MakeHighFreqColorDiffDy_off = 1.4103373714040413; + const double MakeHighFreqColorDiffDy_inc = 0.7084088867024; + const double MakeHighFreqColorDiffDy_lut[21] = { + 0.0, + MakeHighFreqColorDiffDy_off, + MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc, + 
MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, + }; + + for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { double d = csf8x8[i]; diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i]; @@ -1102,32 +1104,6 @@ __kernel void MaskHighIntensityChange( } -__constant double XybToVals_off = 11.38708334481672; -__constant double XybToVals_inc = 14.550189611520716; -__constant double XybToVals_lut[21] = { - 0, - XybToVals_off, - XybToVals_off + 1 * XybToVals_inc, - XybToVals_off + 2 * XybToVals_inc, - XybToVals_off + 3 * XybToVals_inc, - XybToVals_off + 4 * XybToVals_inc, - XybToVals_off + 5 * XybToVals_inc, - XybToVals_off + 6 * XybToVals_inc, - XybToVals_off + 7 * XybToVals_inc, - XybToVals_off + 8 * XybToVals_inc, - XybToVals_off + 9 * XybToVals_inc, - XybToVals_off + 10 * XybToVals_inc, - XybToVals_off + 11 * XybToVals_inc, - XybToVals_off + 12 * XybToVals_inc, - XybToVals_off + 13 * XybToVals_inc, - 
XybToVals_off + 14 * XybToVals_inc, - XybToVals_off + 15 * XybToVals_inc, - XybToVals_off + 16 * XybToVals_inc, - XybToVals_off + 17 * XybToVals_inc, - XybToVals_off + 18 * XybToVals_inc, - XybToVals_off + 19 * XybToVals_inc, -}; - void XybToVals( double x, double y, double z, double *valx, double *valy, double *valz) @@ -1136,6 +1112,32 @@ void XybToVals( const double ymul = 2.28148649801; const double zmul = 1.87816926918; + const double XybToVals_off = 11.38708334481672; + const double XybToVals_inc = 14.550189611520716; + const double XybToVals_lut[21] = { + 0, + XybToVals_off, + XybToVals_off + 1 * XybToVals_inc, + XybToVals_off + 2 * XybToVals_inc, + XybToVals_off + 3 * XybToVals_inc, + XybToVals_off + 4 * XybToVals_inc, + XybToVals_off + 5 * XybToVals_inc, + XybToVals_off + 6 * XybToVals_inc, + XybToVals_off + 7 * XybToVals_inc, + XybToVals_off + 8 * XybToVals_inc, + XybToVals_off + 9 * XybToVals_inc, + XybToVals_off + 10 * XybToVals_inc, + XybToVals_off + 11 * XybToVals_inc, + XybToVals_off + 12 * XybToVals_inc, + XybToVals_off + 13 * XybToVals_inc, + XybToVals_off + 14 * XybToVals_inc, + XybToVals_off + 15 * XybToVals_inc, + XybToVals_off + 16 * XybToVals_inc, + XybToVals_off + 17 * XybToVals_inc, + XybToVals_off + 18 * XybToVals_inc, + XybToVals_off + 19 * XybToVals_inc, + }; + *valx = Interpolate(&XybToVals_lut[0], 21, x * xmul); *valy = Interpolate(&XybToVals_lut[0], 21, y * ymul); *valz = zmul * z; From 9400c2130301ca241f50d935867da37ab75dfdea Mon Sep 17 00:00:00 2001 From: ianuming Date: Mon, 8 May 2017 10:22:11 +0800 Subject: [PATCH 043/189] Remove _constant for opencl 2.0 --- clguetzli/clguetzli.cl | 84 +++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 59f80050..a8af0c0e 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -580,55 +580,15 @@ __kernel void edgeDetectorLowFreq(__global float *result, #define kBlockEdgeHalf 
(kBlockEdge / 2) #define kBlockHalf (kBlockEdge * kBlockEdgeHalf) -__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { - 5.28270670524, - 0.0, - 0.0, - 0.0, - 0.3831134973, - 0.676303603859, - 3.58927792424, - 18.6104367002, - 18.6104367002, - 3.09093131948, - 1.0, - 0.498250875965, - 0.36198671102, - 0.308982169883, - 0.1312701920435, - 2.37370549629, - 3.58927792424, - 1.0, - 2.37370549629, - 0.991205724152, - 1.05178802919, - 0.627264168628, - 0.4, - 0.1312701920435, - 0.676303603859, - 0.498250875965, - 0.991205724152, - 0.5, - 0.3831134973, - 0.349686450518, - 0.627264168628, - 0.308982169883, - 0.3831134973, - 0.36198671102, - 1.05178802919, - 0.3831134973, - 0.12, -}; - typedef struct __Complex { double real; double imag; }Complex; -constant double kSqrtHalf = 0.70710678118654752440084436210484903; void RealFFT8(const double* in, Complex* out) { + const double kSqrtHalf = 0.70710678118654752440084436210484903; double t1, t2, t3, t5, t6, t7, t8; t8 = in[6]; t5 = in[2] - t8; @@ -743,6 +703,7 @@ inline void FFT4(Complex* a) { // D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. 
void FFT8(Complex* a) { + const double kSqrtHalf = 0.70710678118654752440084436210484903; double t1, t2, t3, t4, t5, t6, t7, t8; t7 = a[4].imag; @@ -887,6 +848,47 @@ void ButteraugliBlockDiff(double xyb0[3 * kBlockSize], double avgdiff_xyb[3] = { 0.0 }; double avgdiff_edge[3][4] = { { 0.0 } }; + + const double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, + }; + for (int i = 0; i < 3 * kBlockSize; ++i) { const double diff_xyb = xyb0[i] - xyb1[i]; const int c = i / kBlockSize; From 6962f20172ae682afb4c641b92009cccaf642162 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 8 May 2017 11:04:57 +0800 Subject: [PATCH 044/189] =?UTF-8?q?=E4=BF=AE=E5=A4=8DnVidia=E6=98=BE?= =?UTF-8?q?=E5=8D=A1=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 14 +++++++------- clguetzli/clguetzli_test.cpp | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index a8af0c0e..bede6431 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -323,7 +323,7 @@ __kernel void ScaleImage(float scale, __global float *result) result[i] *= scale; } -double DotProduct(float u[3], double v[3]) { +double DotProduct(__global float u[3], double v[3]) { return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; } @@ -359,9 +359,9 @@ __kernel void CombineChannels( size_t res_ix = (res_y * 
res_xsize + res_x) / step; result[res_ix] = (float)( - DotProduct((float *)&block_diff_dc[3 * res_ix], dc_mask) + - DotProduct((float *)&block_diff_ac[3 * res_ix], mask) + - DotProduct((float *)&edge_detector_map[3 * res_ix], mask)); + DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + + DotProduct(&block_diff_ac[3 * res_ix], mask) + + DotProduct(&edge_detector_map[3 * res_ix], mask)); } inline double Interpolate(const double *array, int size, double sx) { @@ -800,7 +800,7 @@ double abssq(const Complex c) { return c.real * c.real + c.imag * c.imag; } -void ButteraugliFFTSquared(double block[kBlockSize]) { +void ButteraugliFFTSquared(__private double block[kBlockSize]) { double global_mul = 0.000064; Complex block_c[kBlockSize]; @@ -840,8 +840,8 @@ double RemoveRangeAroundZero(double v, double range) { // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared // 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average // diff on the edges to diff_xyb_edge_dc. 
-void ButteraugliBlockDiff(double xyb0[3 * kBlockSize], - double xyb1[3 * kBlockSize], +void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], + __private double xyb1[3 * kBlockSize], double diff_xyb_dc[3], double diff_xyb_ac[3], double diff_xyb_edge_dc[3]) { diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index e7410a52..971fa085 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -30,6 +30,7 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* result_r, const float* result_g, const float* result_b, const float* result_r2, const float* result_g2, const float* result_b2) { + return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -81,6 +82,7 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result) { + return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -123,6 +125,7 @@ void clBlockDiffMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc, const float* result_diff_ac) { + return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -172,6 +175,7 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc) { + return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -216,6 +220,7 @@ void clMask(const float* r, const float* g, const float* b, const float* mask_r, const float* mask_g, const float* mask_b, const float* maskdc_r, const float* maskdc_g, const float* maskdc_b) { + return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); From c1f83bbb1f7b61786f8f9edfb9e3882153f72a4b Mon Sep 17 00:00:00 2001 From: 
strongtu Date: Mon, 8 May 2017 14:56:05 +0800 Subject: [PATCH 045/189] =?UTF-8?q?fixed=20n=E5=8D=A1=20=5F=5Fconstant?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 170 ++++++++++++++++++----------------- clguetzli/clguetzli_test.cpp | 8 +- 2 files changed, 90 insertions(+), 88 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index bede6431..c742dfd3 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -87,9 +87,7 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; } -void OpsinAbsorbance(const double in[3], double out[3]) -{ - const float mix[12] = { +__constant float g_mix[12] = { 0.348036746003, 0.577814843137, 0.0544556093735, @@ -102,11 +100,13 @@ void OpsinAbsorbance(const double in[3], double out[3]) 0.158581714673, 0.712857943858, 10.6524069248, - }; +}; - out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3]; - out[1] = mix[4] * in[0] + mix[5] * in[1] + mix[6] * in[2] + mix[7]; - out[2] = mix[8] * in[0] + mix[9] * in[1] + mix[10] * in[2] + mix[11]; +void OpsinAbsorbance(const double in[3], double out[3]) +{ + out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3]; + out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7]; + out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; } double EvaluatePolynomial(const double x, const double *coefficients, int n) @@ -364,7 +364,7 @@ __kernel void CombineChannels( DotProduct(&edge_detector_map[3 * res_ix], mask)); } -inline double Interpolate(const double *array, int size, double sx) { +inline double Interpolate(__constant double *array, int size, double sx) { double ix = fabs(sx); int baseix = (int)(ix); @@ -381,6 +381,31 @@ inline double Interpolate(const double *array, int size, double sx) { return res; } 
+#define XybLowFreqToVals_inc 5.2511644570349185 +__constant double XybLowFreqToVals_lut[21] = { + 0, + 1 * XybLowFreqToVals_inc, + 2 * XybLowFreqToVals_inc, + 3 * XybLowFreqToVals_inc, + 4 * XybLowFreqToVals_inc, + 5 * XybLowFreqToVals_inc, + 6 * XybLowFreqToVals_inc, + 7 * XybLowFreqToVals_inc, + 8 * XybLowFreqToVals_inc, + 9 * XybLowFreqToVals_inc, + 10 * XybLowFreqToVals_inc, + 11 * XybLowFreqToVals_inc, + 12 * XybLowFreqToVals_inc, + 13 * XybLowFreqToVals_inc, + 14 * XybLowFreqToVals_inc, + 15 * XybLowFreqToVals_inc, + 16 * XybLowFreqToVals_inc, + 17 * XybLowFreqToVals_inc, + 18 * XybLowFreqToVals_inc, + 19 * XybLowFreqToVals_inc, + 20 * XybLowFreqToVals_inc, +}; + void XybLowFreqToVals(double x, double y, double z, double *valx, double *valy, double *valz) { const double xmul = 6.64482198135; @@ -388,31 +413,6 @@ void XybLowFreqToVals(double x, double y, double z, const double zmul = 7.34905756986; const double y_to_z_mul = 0.0812519812628; - const double XybLowFreqToVals_inc = 5.2511644570349185; - const double XybLowFreqToVals_lut[21] = { - 0, - 1 * XybLowFreqToVals_inc, - 2 * XybLowFreqToVals_inc, - 3 * XybLowFreqToVals_inc, - 4 * XybLowFreqToVals_inc, - 5 * XybLowFreqToVals_inc, - 6 * XybLowFreqToVals_inc, - 7 * XybLowFreqToVals_inc, - 8 * XybLowFreqToVals_inc, - 9 * XybLowFreqToVals_inc, - 10 * XybLowFreqToVals_inc, - 11 * XybLowFreqToVals_inc, - 12 * XybLowFreqToVals_inc, - 13 * XybLowFreqToVals_inc, - 14 * XybLowFreqToVals_inc, - 15 * XybLowFreqToVals_inc, - 16 * XybLowFreqToVals_inc, - 17 * XybLowFreqToVals_inc, - 18 * XybLowFreqToVals_inc, - 19 * XybLowFreqToVals_inc, - 20 * XybLowFreqToVals_inc, - }; - z += y_to_z_mul * y; *valz = z * zmul; *valx = x * xmul; @@ -837,6 +837,33 @@ double RemoveRangeAroundZero(double v, double range) { } } +#define MakeHighFreqColorDiffDy_off 1.4103373714040413 +#define MakeHighFreqColorDiffDy_inc 0.7084088867024 +__constant double MakeHighFreqColorDiffDy_lut[21] = { + 0.0, + MakeHighFreqColorDiffDy_off, + 
MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, +}; + + // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared // 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average // diff on the edges to diff_xyb_edge_dc. 
@@ -940,31 +967,6 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], const double ymul2 = 1.51983458269; const double zmul = 2.4; - const double MakeHighFreqColorDiffDy_off = 1.4103373714040413; - const double MakeHighFreqColorDiffDy_inc = 0.7084088867024; - const double MakeHighFreqColorDiffDy_lut[21] = { - 0.0, - MakeHighFreqColorDiffDy_off, - MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc, - MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, - }; for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { @@ -1106,6 +1108,32 @@ __kernel void MaskHighIntensityChange( } +#define XybToVals_off 11.38708334481672 +#define XybToVals_inc 14.550189611520716 +__constant double XybToVals_lut[21] = { + 0, + XybToVals_off, + XybToVals_off + 1 * XybToVals_inc, + XybToVals_off + 2 * XybToVals_inc, + XybToVals_off + 3 * 
XybToVals_inc, + XybToVals_off + 4 * XybToVals_inc, + XybToVals_off + 5 * XybToVals_inc, + XybToVals_off + 6 * XybToVals_inc, + XybToVals_off + 7 * XybToVals_inc, + XybToVals_off + 8 * XybToVals_inc, + XybToVals_off + 9 * XybToVals_inc, + XybToVals_off + 10 * XybToVals_inc, + XybToVals_off + 11 * XybToVals_inc, + XybToVals_off + 12 * XybToVals_inc, + XybToVals_off + 13 * XybToVals_inc, + XybToVals_off + 14 * XybToVals_inc, + XybToVals_off + 15 * XybToVals_inc, + XybToVals_off + 16 * XybToVals_inc, + XybToVals_off + 17 * XybToVals_inc, + XybToVals_off + 18 * XybToVals_inc, + XybToVals_off + 19 * XybToVals_inc, +}; + void XybToVals( double x, double y, double z, double *valx, double *valy, double *valz) @@ -1114,32 +1142,6 @@ void XybToVals( const double ymul = 2.28148649801; const double zmul = 1.87816926918; - const double XybToVals_off = 11.38708334481672; - const double XybToVals_inc = 14.550189611520716; - const double XybToVals_lut[21] = { - 0, - XybToVals_off, - XybToVals_off + 1 * XybToVals_inc, - XybToVals_off + 2 * XybToVals_inc, - XybToVals_off + 3 * XybToVals_inc, - XybToVals_off + 4 * XybToVals_inc, - XybToVals_off + 5 * XybToVals_inc, - XybToVals_off + 6 * XybToVals_inc, - XybToVals_off + 7 * XybToVals_inc, - XybToVals_off + 8 * XybToVals_inc, - XybToVals_off + 9 * XybToVals_inc, - XybToVals_off + 10 * XybToVals_inc, - XybToVals_off + 11 * XybToVals_inc, - XybToVals_off + 12 * XybToVals_inc, - XybToVals_off + 13 * XybToVals_inc, - XybToVals_off + 14 * XybToVals_inc, - XybToVals_off + 15 * XybToVals_inc, - XybToVals_off + 16 * XybToVals_inc, - XybToVals_off + 17 * XybToVals_inc, - XybToVals_off + 18 * XybToVals_inc, - XybToVals_off + 19 * XybToVals_inc, - }; - *valx = Interpolate(&XybToVals_lut[0], 21, x * xmul); *valy = Interpolate(&XybToVals_lut[0], 21, y * ymul); *valz = zmul * z; diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 971fa085..1bb20681 100644 --- a/clguetzli/clguetzli_test.cpp +++ 
b/clguetzli/clguetzli_test.cpp @@ -283,19 +283,19 @@ void clCalculateDiffmapEx(void) } -// strong todo +// chrisk todo void clBlur(void) { } -// strong todo +// chrisk todo void clConvolution(void) { } -// strong todo +// chirsk todo void clUpsample(void) { @@ -313,7 +313,7 @@ void clAverage5x5(void) } -// strong todo +// chrisk todo void clMinSquareVal(void) { From a8aba9b6b256b57383c0a680edbc8f9bef1ccac9 Mon Sep 17 00:00:00 2001 From: ianuming Date: Mon, 8 May 2017 15:23:44 +0800 Subject: [PATCH 046/189] Fix __constant error for nvidia device --- clguetzli/clguetzli.cl | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index c742dfd3..c5284a52 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -587,8 +587,8 @@ typedef struct __Complex }Complex; +__constant double kSqrtHalf = 0.70710678118654752440084436210484903; void RealFFT8(const double* in, Complex* out) { - const double kSqrtHalf = 0.70710678118654752440084436210484903; double t1, t2, t3, t5, t6, t7, t8; t8 = in[6]; t5 = in[2] - t8; @@ -863,6 +863,45 @@ __constant double MakeHighFreqColorDiffDy_lut[21] = { MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, }; +__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, +}; // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared // 3-dimensional 
xybdiff of the two blocks to diff_xyb_{dc,ac} and the average From d9e3808417e33d22bcf54aaa6e60418b0bc82ee3 Mon Sep 17 00:00:00 2001 From: ianuming Date: Mon, 8 May 2017 16:36:46 +0800 Subject: [PATCH 047/189] Optimize clDoMask --- clguetzli/clguetzli.cl | 99 ++++++----------------------------------- clguetzli/clguetzli.cpp | 90 ++++++++++++++++++++++++++++++++++--- clguetzli/ocl.h | 7 +++ 3 files changed, 104 insertions(+), 92 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index c5284a52..1a79c62e 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -192,7 +192,7 @@ __kernel void OpsinDynamicsImage( } -double InterpolateClampNegative(const double *array, +double InterpolateClampNegative(__global const double *array, int size, double sx) { if (sx < 0) { sx = 0; @@ -211,87 +211,11 @@ double InterpolateClampNegative(const double *array, return res; } -void MakeMask(double extmul, double extoff, - double mul, double offset, - double scaler, double *result) -{ - for (size_t i = 0; i < 512; ++i) { - const double c = mul / ((0.01 * scaler * i) + offset); - result[i] = 1.0 + extmul * (c + extoff); - result[i] *= result[i]; - } -} - -double MaskX(double delta) { - const double extmul = 0.975741017749; - const double extoff = -4.25328244168; - const double offset = 0.454909521427; - const double scaler = 0.0738288224836; - const double mul = 20.8029176447; - double lut[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut); - return InterpolateClampNegative(lut, 512, delta); -} - -double MaskY(double delta) { - const double extmul = 0.373995618954; - const double extoff = 1.5307267433; - const double offset = 0.911952641929; - const double scaler = 1.1731667845; - const double mul = 16.2447033988; - double lut[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut); - return InterpolateClampNegative(lut, 512, delta); -} - -double MaskB(double delta) { - const double extmul = 0.61582234137; - const double extoff = 
-4.25376118646; - const double offset = 1.05105070921; - const double scaler = 0.47434643535; - const double mul = 31.1444967089; - double lut[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut); - return InterpolateClampNegative(lut, 512, delta); -} - -double MaskDcX(double delta) { - const double extmul = 1.79116943438; - const double extoff = -3.86797479189; - const double offset = 0.670960225853; - const double scaler = 0.486575865525; - const double mul = 20.4563479139; - double lut[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut); - return InterpolateClampNegative(lut, 512, delta); -} - -double MaskDcY(double delta) { - const double extmul = 0.212223514236; - const double extoff = -3.65647120524; - const double offset = 1.73396799447; - const double scaler = 0.170392660501; - const double mul = 21.6566724788; - double lut[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut); - return InterpolateClampNegative(lut, 512, delta); -} - -double MaskDcB(double delta) { - const double extmul = 0.349376011816; - const double extoff = -0.894711072781; - const double offset = 0.901647926679; - const double scaler = 0.380086095024; - const double mul = 18.0373825149; - double lut[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut); - return InterpolateClampNegative(lut, 512, delta); -} - __kernel void DoMask( __global float *mask_x, __global float *mask_y, __global float *mask_b, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, - int xsize, int ysize) + __global double *lut_x, __global double *lut_y, __global double *lut_b, + __global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b) { const double w00 = 232.206464018; const double w11 = 22.9455222245; @@ -300,6 +224,9 @@ __kernel void DoMask( const int x = get_global_id(0); const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + const size_t idx = y * xsize + x; const double s0 = 
mask_x[idx]; const double s1 = mask_y[idx]; @@ -308,16 +235,16 @@ __kernel void DoMask( const double p1 = w11 * s1; const double p2 = w22 * s2; - mask_x[idx] = (float)(MaskX(p0)); - mask_y[idx] = (float)(MaskY(p1)); - mask_b[idx] = (float)(MaskB(p2)); - mask_dc_x[idx] = (float)(MaskDcX(p0)); - mask_dc_y[idx] = (float)(MaskDcY(p1)); - mask_dc_b[idx] = (float)(MaskDcB(p2)); + mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0)); + mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1)); + mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2)); + mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0)); + mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1)); + mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2)); } -__kernel void ScaleImage(float scale, __global float *result) +__kernel void ScaleImage(double scale, __global float *result) { const int i = get_global_id(0); result[i] *= scale; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 487d260d..bc6e5bb1 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -188,7 +188,7 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b); clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize); - size_t globalWorkSize[1] = { clSize }; + size_t globalWorkSize[1] = { size }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -468,15 +468,15 @@ void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size } } -void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w) +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); - cl_float clscale = w; + cl_double clscale = w; cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; - clSetKernelArg(kernel, 0, 
sizeof(cl_float), (void*)&clscale); + clSetKernelArg(kernel, 0, sizeof(cl_double), (void*)&clscale); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img); size_t globalWorkSize[1] = { size }; @@ -587,6 +587,18 @@ void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t s } } + +static void MakeMask(double extmul, double extoff, + double mul, double offset, + double scaler, double *result) +{ + for (size_t i = 0; i < 512; ++i) { + const double c = mul / ((0.01 * scaler * i) + offset); + result[i] = 1.0 + extmul * (c + extoff); + result[i] *= result[i]; + } +} + static const double kInternalGoodQualityThreshold = 14.921561160295326; static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; @@ -598,6 +610,64 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz cl_int clxsize = xsize; cl_int clysize = ysize; + double extmul = 0.975741017749; + double extoff = -4.25328244168; + double offset = 0.454909521427; + double scaler = 0.0738288224836; + double mul = 20.8029176447; + double lut_x[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + + extmul = 0.373995618954; + extoff = 1.5307267433; + offset = 0.911952641929; + scaler = 1.1731667845; + mul = 16.2447033988; + double lut_y[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + + extmul = 0.61582234137; + extoff = -4.25376118646; + offset = 1.05105070921; + scaler = 0.47434643535; + mul = 31.1444967089; + double lut_b[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + + extmul = 1.79116943438; + extoff = -3.86797479189; + offset = 0.670960225853; + scaler = 0.486575865525; + mul = 20.4563479139; + double lut_dcx[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + + extmul = 0.212223514236; + extoff = -3.65647120524; + offset = 1.73396799447; + scaler = 0.170392660501; + mul = 21.6566724788; + double lut_dcy[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + + extmul = 0.349376011816; + 
extoff = -0.894711072781; + offset = 0.901647926679; + scaler = 0.380086095024; + mul = 18.0373825149; + double lut_dcb[512]; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + + size_t channel_size = 512 * 3 * sizeof(double); + ocl_channels xyb = ocl.allocMemChannels(channel_size); + ocl_channels xyb_dc = ocl.allocMemChannels(channel_size); + clEnqueueWriteBuffer(ocl.commandQueue, xyb.x, CL_FALSE, 0, channel_size, lut_x, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb.y, CL_FALSE, 0, channel_size, lut_y, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, lut_b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.x, CL_FALSE, 0, channel_size, lut_dcx, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.y, CL_FALSE, 0, channel_size, lut_dcy, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.b, CL_FALSE, 0, channel_size, lut_dcb, 0, NULL, NULL); + cl_kernel kernel = ocl.kernel[KERNEL_DOMASK]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g); @@ -605,8 +675,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r); clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g); clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b); - clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clysize); + clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb.x); + clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb.y); + clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb.b); + clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&xyb_dc.x); + clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&xyb_dc.y); + clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&xyb_dc.b); size_t globalWorkSize[2] = { xsize, ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, 
kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -619,8 +693,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz { LogError("Error: clDoMask() clFinish returned %s.\n", TranslateOpenCLError(err)); } + + ocl.releaseMemChannels(xyb); + ocl.releaseMemChannels(xyb_dc); } + void clMaskEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/) diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 7babc74e..a210d1c1 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -72,6 +72,13 @@ typedef union ocl_channels_t cl_mem b; }; + struct + { + cl_mem x; + cl_mem y; + cl_mem b; + }; + cl_mem ch[3]; }ocl_channels; From 77314278f9e966127aba0984bf19b9da128dbfcd Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 8 May 2017 16:54:31 +0800 Subject: [PATCH 048/189] =?UTF-8?q?32=E4=BD=8D=E5=B9=B3=E5=8F=B0=E7=BC=96?= =?UTF-8?q?=E8=AF=91=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index e48d1682..d2cf62c7 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -136,7 +136,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) Disabled true false @@ -151,9 +151,10 @@ shlwapi.lib;OpenCL.lib;%(AdditionalDependencies) mainCRTStartup __tcmalloc + $(INTELOCLSDKROOT)lib\x86 - - + + NotUsing Level3 @@ -177,7 +178,7 @@ NotUsing Level3 - 
.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) @@ -188,6 +189,7 @@ shlwapi.lib;OpenCL.lib;%(AdditionalDependencies) mainCRTStartup __tcmalloc + $(INTELOCLSDKROOT)lib\x86 From 3116d6a216f2a20cf6c90ae40ff71e5ba9a11d76 Mon Sep 17 00:00:00 2001 From: ianuming Date: Mon, 8 May 2017 17:27:20 +0800 Subject: [PATCH 049/189] Move some local constant array to __constant --- clguetzli/clguetzli.cl | 80 ++++++++++------------------------------- clguetzli/clguetzli.cpp | 12 +++---- guetzli.vcxproj | 2 +- 3 files changed, 25 insertions(+), 69 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 1a79c62e..564fff1e 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -109,7 +109,7 @@ void OpsinAbsorbance(const double in[3], double out[3]) out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; } -double EvaluatePolynomial(const double x, const double *coefficients, int n) +double EvaluatePolynomial(const double x, __constant const double *coefficients, int n) { double b1 = 0.0; double b2 = 0.0; @@ -130,25 +130,25 @@ double EvaluatePolynomial(const double x, const double *coefficients, int n) return b1; } -double Gamma(double v) -{ - double min_value = 0.770000000000000; - double max_value = 274.579999999999984; - const double p[5 + 1] = { - 881.979476556478289, 1496.058452015812463, 908.662212739659481, - 373.566100223287378, 85.840860336314364, 6.683258861509244, - }; - const double q[5 + 1] = { - 12.262350348616792, 20.557285797683576, 12.161463238367844, - 4.711532733641639, 
0.899112889751053, 0.035662329617191, - }; +__constant double g_gamma_p[5 + 1] = { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, +}; +__constant double g_gamma_q[5 + 1] = { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, +}; +double Gamma(double v) +{ + const double min_value = 0.770000000000000; + const double max_value = 274.579999999999984; const double x01 = (v - min_value) / (max_value - min_value); const double xc = 2.0 * x01 - 1.0; - const double yp = EvaluatePolynomial(xc, p, 6); - const double yq = EvaluatePolynomial(xc, q, 6); + const double yp = EvaluatePolynomial(xc, g_gamma_p, 6); + const double yq = EvaluatePolynomial(xc, g_gamma_q, 6); if (yq == 0.0) return 0.0; return (float)(yp / yq); } @@ -842,46 +842,6 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], double avgdiff_xyb[3] = { 0.0 }; double avgdiff_edge[3][4] = { { 0.0 } }; - const double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { - 5.28270670524, - 0.0, - 0.0, - 0.0, - 0.3831134973, - 0.676303603859, - 3.58927792424, - 18.6104367002, - 18.6104367002, - 3.09093131948, - 1.0, - 0.498250875965, - 0.36198671102, - 0.308982169883, - 0.1312701920435, - 2.37370549629, - 3.58927792424, - 1.0, - 2.37370549629, - 0.991205724152, - 1.05178802919, - 0.627264168628, - 0.4, - 0.1312701920435, - 0.676303603859, - 0.498250875965, - 0.991205724152, - 0.5, - 0.3831134973, - 0.349686450518, - 0.627264168628, - 0.308982169883, - 0.3831134973, - 0.36198671102, - 1.05178802919, - 0.3831134973, - 0.12, - }; - for (int i = 0; i < 3 * kBlockSize; ++i) { const double diff_xyb = xyb0[i] - xyb1[i]; const int c = i / kBlockSize; @@ -933,8 +893,6 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], const double ymul2 = 1.51983458269; const double zmul = 2.4; - - for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { 
double d = csf8x8[i]; diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i]; @@ -1034,13 +992,11 @@ __kernel void MaskHighIntensityChange( }; double sqr_max_diff = -1; { - int offset[4] = - { -1, 1, -(int)(xsize), (int)(xsize) }; - int border[4] = - { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; for (int dir = 0; dir < 4; ++dir) { if (border[dir]) { - continue; + continue; } const int ix2 = ix + offset[dir]; double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index bc6e5bb1..e55453d5 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -615,7 +615,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz double offset = 0.454909521427; double scaler = 0.0738288224836; double mul = 20.8029176447; - double lut_x[512]; + static double lut_x[512]; MakeMask(extmul, extoff, mul, offset, scaler, lut_x); extmul = 0.373995618954; @@ -623,7 +623,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz offset = 0.911952641929; scaler = 1.1731667845; mul = 16.2447033988; - double lut_y[512]; + static double lut_y[512]; MakeMask(extmul, extoff, mul, offset, scaler, lut_y); extmul = 0.61582234137; @@ -631,7 +631,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz offset = 1.05105070921; scaler = 0.47434643535; mul = 31.1444967089; - double lut_b[512]; + static double lut_b[512]; MakeMask(extmul, extoff, mul, offset, scaler, lut_b); extmul = 1.79116943438; @@ -639,7 +639,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz offset = 0.670960225853; scaler = 0.486575865525; mul = 20.4563479139; - double lut_dcx[512]; + static double lut_dcx[512]; MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); extmul = 0.212223514236; @@ -647,7 +647,7 @@ void 
clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz offset = 1.73396799447; scaler = 0.170392660501; mul = 21.6566724788; - double lut_dcy[512]; + static double lut_dcy[512]; MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); extmul = 0.349376011816; @@ -655,7 +655,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz offset = 0.901647926679; scaler = 0.380086095024; mul = 18.0373825149; - double lut_dcb[512]; + static double lut_dcb[512]; MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); size_t channel_size = 512 * 3 * sizeof(double); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index d2cf62c7..3026fb04 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -137,7 +137,7 @@ NotUsing Level3 .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - Disabled + MaxSpeed true false false From 95f10c7ce8f8956f59cd72ad7c4e0c8899da4900 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 8 May 2017 20:47:51 +0800 Subject: [PATCH 050/189] for test --- guetzli/butteraugli_comparator.cc | 6 ++ guetzli/butteraugli_comparator.h | 4 ++ guetzli/guetzli.cc | 5 ++ guetzli/processor.cc | 106 ++++++++++++++++++------------ 4 files changed, 78 insertions(+), 43 deletions(-) diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index 1748b80d..9034d68e 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -22,6 +22,9 @@ #include "guetzli/gamma_correct.h" #include "guetzli/score.h" +int g_switchBlock = 0; +int g_compareBlock = 0; + namespace guetzli { ButteraugliComparator::ButteraugliComparator(const int width, const int height, @@ -94,6 +97,8 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y, ::butteraugli::OpsinDynamicsImage(8, 8, per_block_pregamma_[bx]); } } + + g_switchBlock++; } 
double ButteraugliComparator::CompareBlock(const OutputImage& img, @@ -109,6 +114,7 @@ double ButteraugliComparator::CompareBlock(const OutputImage& img, std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c); ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); + g_compareBlock++; std::vector > rgb0 = rgb0_c; std::vector > rgb1 = rgb1_c; diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index 3879a599..098341e3 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -25,10 +25,14 @@ #include "guetzli/output_image.h" #include "guetzli/stats.h" +extern int g_switchBlock; +extern int g_compareBlock; + namespace guetzli { constexpr int kButteraugliStep = 3; + class ButteraugliComparator : public Comparator { public: ButteraugliComparator(const int width, const int height, diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 3355265e..aa328cb1 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -233,6 +233,9 @@ void Usage() { } // namespace +extern int g_switchBlock; +extern int g_compareBlock; + int main(int argc, char** argv) { std::set_terminate(TerminateHandler); @@ -330,5 +333,7 @@ int main(int argc, char** argv) { } WriteFileOrDie(argv[opt_idx + 1], out_data); + + fprintf(stderr, "%d %d", g_switchBlock, g_compareBlock); return 0; } diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 134dfe17..b6057f5e 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -362,63 +362,82 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, } +void func(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], + const uint8_t comp_mask, guetzli::Params ¶ms_, std::vector > &input_order) +{ + static const uint8_t oldCsf[kDCTBlockSize] = { + 10, 10, 20, 40, 60, 70, 80, 90, + 10, 20, 30, 60, 70, 80, 90, 90, + 20, 30, 60, 70, 80, 90, 90, 90, + 40, 60, 70, 80, 90, 90, 90, 90, + 60, 70, 80, 90, 90, 90, 90, 90, + 
70, 80, 90, 90, 90, 90, 90, 90, + 80, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + }; + static const double kWeight[3] = { 1.0, 0.22, 0.20 }; +#include "guetzli/order.inc" + + for (int c = 0; c < 3; ++c) { // TOBEREMOVE:¼ÆËãÊäÈëblockµÄinput_order,·Ç0µÄ´ò·Ö + if (!(comp_mask & (1 << c))) continue; + for (int k = 1; k < kDCTBlockSize; ++k) { + int idx = c * kDCTBlockSize + k; // TOBEREMOVE:ÿ¸ö·ÖÁ¿ÒÀ´Î + if (block[idx] != 0) { + float score; + if (params_.new_zeroing_model) { + score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; + } + else { + score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]); + } + input_order.push_back(std::make_pair(idx, score)); + } + } + } + std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); + +} + // REQUIRES: block[c*64...(c*64+63)] is all zero if (comp_mask & (1<* output_order) { - static const uint8_t oldCsf[kDCTBlockSize] = { - 10, 10, 20, 40, 60, 70, 80, 90, - 10, 20, 30, 60, 70, 80, 90, 90, - 20, 30, 60, 70, 80, 90, 90, 90, - 40, 60, 70, 80, 90, 90, 90, 90, - 60, 70, 80, 90, 90, 90, 90, 90, - 70, 80, 90, 90, 90, 90, 90, 90, - 80, 90, 90, 90, 90, 90, 90, 90, - 90, 90, 90, 90, 90, 90, 90, 90, - }; - static const double kWeight[3] = { 1.0, 0.22, 0.20 }; -#include "guetzli/order.inc" - std::vector > input_order; - for (int c = 0; c < 3; ++c) { // TOBEREMOVE:¼ÆËãÊäÈëblockµÄinput_order,·Ç0µÄ´ò·Ö - if (!(comp_mask & (1 << c))) continue; - for (int k = 1; k < kDCTBlockSize; ++k) { - int idx = c * kDCTBlockSize + k; // TOBEREMOVE:ÿ¸ö·ÖÁ¿ÒÀ´Î - if (block[idx] != 0) { - float score; - if (params_.new_zeroing_model) { - score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; - } else { - score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * - kWeight[c] / oldCsf[k]); - } - input_order.push_back(std::make_pair(idx, score)); - } + + std::vector > input_order; + func(block, 
orig_block, comp_mask, params_, input_order); + if (input_order.size() > 10) + { + int i = 0; + i++; } - } - std::sort(input_order.begin(), input_order.end(), - [](const std::pair& a, const std::pair& b) { - return a.second < b.second; }); - coeff_t processed_block[kBlockSize]; - memcpy(processed_block, block, sizeof(processed_block)); - comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); + + + coeff_t processed_block[kBlockSize]; + memcpy(processed_block, block, sizeof(processed_block)); + + comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); + + while (!input_order.empty()) { float best_err = 1e17f; int best_i = 0; - for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, - input_order.size()); - ++i) { + for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, input_order.size()); ++i) + { coeff_t candidate_block[kBlockSize]; memcpy(candidate_block, processed_block, sizeof(candidate_block)); + const int idx = input_order[i].first; + candidate_block[idx] = 0; // TOBEREMOVE:¶Ô±ÈblockµÄÅÅÐòµÃ·ÖǰiµÍµÄÖÃ0(i¸ù¾Ýinput_orderÊý¾Ý±ä»¯¶ø±ä»¯)£¬²¢ÏÈÉèÖûضԱÈͼÏñµÄÈý¸ö·ÖÁ¿¶ÔÓ¦blockÖÐÈ¥£¬ºóÐøÔÙ×ö¶Ô±È²ÉÓᣠ+ for (int c = 0; c < 3; ++c) { if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &candidate_block[c * kDCTBlockSize]); + img->component(c).SetCoeffBlock(block_x, block_y, &candidate_block[c * kDCTBlockSize]); } } + float max_err = 0; for (int iy = 0; iy < factor_y; ++iy) { for (int ix = 0; ix < factor_x; ++ix) { @@ -430,19 +449,21 @@ void Processor::ComputeBlockZeroingOrder( } } } + if (max_err < best_err) { // TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi best_err = max_err; best_i = i; } } + int idx = input_order[best_i].first; processed_block[idx] = 0; input_order.erase(input_order.begin() + best_i); + output_order->push_back({idx, best_err}); // TOBEREMOVE:½«ÉÏÃæ¼ÆËã³öÀ´µÄ×îС´íÎóµÄidx£¬¶ÔÓ¦µ½¶Ô±ÈblockÖеĶÔӦλÖÃÕæÕýµÄÖÃΪ0,ÒÆ³ýinput_orderÏ¼´Ñ¡È¡µ±Ç°Öµ£¬·ÅÈëoutput_order,²¢ÕýʽµÄÉèÖõ½¶Ô±ÈͼÏñÖÐÈ¥¡£ for (int c = 0; c < 3; 
++c) { if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &processed_block[c * kDCTBlockSize]); + img->component(c).SetCoeffBlock(block_x, block_y, &processed_block[c * kDCTBlockSize]); } } } @@ -464,8 +485,7 @@ void Processor::ComputeBlockZeroingOrder( // Restore *img to the same state as it was at the start of this function. for (int c = 0; c < 3; ++c) { if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &block[c * kDCTBlockSize]); + img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); } } } From 1a8fcc2f3e93e10051025c9dbc5c9e5e8696934c Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 9 May 2017 15:10:07 +0800 Subject: [PATCH 051/189] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E5=8D=B7=E7=A7=AF?= =?UTF-8?q?=E5=87=BD=E6=95=B0=EF=BC=8C=E8=8A=82=E7=9C=81=E4=B8=80=E5=9D=97?= =?UTF-8?q?=E4=B8=AD=E9=97=B4=E7=BC=93=E5=AD=98=E7=9A=84=E4=BD=BF=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 93 ++++++++++++++++++++++++++ clguetzli/clguetzli.cpp | 143 ++++++++++++++++++++++++++++++++++++++++ clguetzli/ocl.h | 3 + 3 files changed, 239 insertions(+) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 564fff1e..4a6809be 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -33,6 +33,83 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si pC[y * width + x] = minValue; } +__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result, + int step, int len, int offset, float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x % step != 0) return; + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 
0 : x - offset; + int maxx = min(xsize, x + len - offset); + + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + + result[y * xsize + x] = sum * scale; +} + +__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result, + int step, int len, int offset, float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x % step != 0) return; + if (y % step != 0) return; + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int miny = y < offset ? 0 : y - offset; + int maxy = min(ysize, y + len - offset); + + float weight = 0.0; + for (int j = miny; j < maxy; j++) + { + weight += multipliers[j - y + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = miny; j < maxy; j++) + { + sum += inp[j * xsize + x] * multipliers[j - y + offset]; + } + + result[y * xsize + x] = sum * scale; +} + __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result, int xsize, int xstep, int len, int offset, float border_ratio) { @@ -71,6 +148,22 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl result[ox * ysize + y] = sum * scale; } +__kernel void SquareSample(__global float* pA, __global float* pC, int xstep, int ystep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + int x_sample = x - x % xstep; + int y_sample = y - y % ystep; + + if (x_sample == x && y_sample == y) return; + + const 
int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + pC[y * xsize + x] = pA[y_sample * xsize + x_sample]; +} + __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep) { const int x = get_global_id(0); diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index e55453d5..6b462dd7 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -45,6 +45,9 @@ ocl_args_d_t& getOcl(void) } ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err); ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err); + ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err); + ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err); + ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "SquareSample", &err); ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err); ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err); ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err); @@ -102,6 +105,104 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, } } +void clConvolutionX(cl_mem inp, size_t xsize, size_t ysize, + cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio, + cl_mem result/*out*/) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int clxstep = xstep; + cl_int cllen = len; + cl_int cloffset = offset; + cl_float clborder_ratio = border_ratio; + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&xstep); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset); + 
clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + +void clConvolutionY(cl_mem inp, size_t xsize, size_t ysize, + cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio, + cl_mem result/*out*/) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int clxstep = xstep; + cl_int cllen = len; + cl_int cloffset = offset; + cl_float clborder_ratio = border_ratio; + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&xstep); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset); + clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + +void clUpsampleEx2(cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep, + cl_mem result/*out*/) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = 
getOcl(); + + cl_int clxstep = xstep; + cl_int clystep = ystep; + cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clUpsampleEx clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clUpsampleEx clFinish returned %s.\n", TranslateOpenCLError(err)); + } +} + void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep, cl_mem result/*out*/) @@ -130,10 +231,52 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, } } +void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, + double sigma, double border_ratio, + cl_mem result/*out, opt*/) +{ + double m = 2.25; // Accuracy increases when m is increased. 
+ const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size); + + clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + if (xstep > 1) + { + ocl.allocA(sizeof(cl_float) * xsize * ysize); + clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); + clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image); + clUpsampleEx2(result ? result : image, xsize, ysize, xstep, xstep, result ? result : image); + } + else + { + ocl.allocA(sizeof(cl_float) * xsize * ysize); + clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); + clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image); + } + + clReleaseMemObject(mem_expn); +} void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result/*out, opt*/) { + clBlurEx2(image, xsize, ysize, sigma, border_ratio, result); + + return; double m = 2.25; // Accuracy increases when m is increased. 
const double scaler = -1.0 / (2 * sigma * sigma); // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index a210d1c1..aac82f31 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -46,6 +46,9 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); enum KernelName { KERNEL_MINSQUAREVAL = 0, KERNEL_CONVOLUTION, + KERNEL_CONVOLUTIONX, + KERNEL_CONVOLUTIONY, + KERNEL_SQUARESAMPLE, KERNEL_DOWNSAMPLE, KERNEL_OPSINDYNAMICSIMAGE, KERNEL_DOMASK, From e919c9b424edcba0a8816db0755652dcf22fdf62 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 9 May 2017 20:10:48 +0800 Subject: [PATCH 052/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli Conflicts: clguetzli/clguetzli_test.cpp third_party/butteraugli/butteraugli/butteraugli.cc --- clguetzli/clguetzli.cl | 131 ++++++++++-------- clguetzli/clguetzli.cpp | 6 +- clguetzli/clguetzli.h | 12 +- clguetzli/clguetzli_test.cpp | 98 ++++++++++--- clguetzli/clguetzli_test.h | 29 ++++ .../butteraugli/butteraugli/butteraugli.cc | 38 +++-- 6 files changed, 229 insertions(+), 85 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 4a6809be..31b87f9d 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -357,14 +357,14 @@ __kernel void CombineChannels( int step, __global float *result) { - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); + const int res_x = get_global_id(0) * step; + const int res_y = get_global_id(1) * step; - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); + const int res_xsize = (xsize + step - 1) / step; + const int res_ysize = (ysize + step - 1) / step; - if (res_x * step >= xsize - (8 - step)) return; - if (res_y * step >= ysize - (8 - step)) return; + //if (res_x * step >= xsize - (8 - step)) return; + //if (res_y * step >= ysize - (8 - step)) return; double mask[3]; double dc_mask[3]; @@ -374,8 +374,8 @@ __kernel void CombineChannels( 
mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; - mask[1] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; - dc_mask[1] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; + mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; size_t res_ix = (res_y * res_xsize + res_x) / step; result[res_ix] = (float)( @@ -463,6 +463,62 @@ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, res[2] += factor * valz * valz; } +void Butteraugli8x8CornerEdgeDetectorDiff( + int pos_x, + int pos_y, + int xsize, + int ysize, + __global float *r, __global float *g, __global float* b, + __global float *r2, __global float* g2, __global float *b2, + double* diff_xyb) +{ + int local_count = 0; + double local_xyb[3] = { 0 }; + const double w = 0.711100840192; + + int offset[4][2] = { { 0,0 },{ 0,7 },{ 7,0 },{ 7,7 } }; + int edgeSize = 3; + + for (int k = 0; k < 4; k++) + { + int x = pos_x + offset[k][0]; + int y = pos_y + offset[k][1]; + + if (x >= edgeSize && x + edgeSize < xsize) { + size_t ix = y * xsize + (x - edgeSize); + size_t ix2 = ix + 2 * edgeSize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] - b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + if (y >= edgeSize && y + edgeSize < ysize) { + size_t ix = (y - edgeSize) * xsize + x; + size_t ix2 = ix + 2 * edgeSize * xsize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] - b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + } + + const double weight = 0.01617112696; + const double mul = weight * 8.0 / local_count; + for (int i = 0; i < 3; ++i) { + diff_xyb[i] += mul * local_xyb[i]; + } +} + __kernel void edgeDetectorMap(__global float *result, __global 
float *r, __global float *g, __global float* b, __global float *r2, __global float* g2, __global float *b2, @@ -483,53 +539,16 @@ __kernel void edgeDetectorMap(__global float *result, pos_x = min(pos_x, xsize - 8); pos_y = min(pos_y, ysize - 8); - int local_count = 0; - double local_xyb[3] = { 0 }; - const double w = 0.711100840192; - - int offset[4][2] = {{0,0}, {0,7}, {7,0}, {7,7}}; - int edgeSize = 3; - - for (int k = 0; k < 4; k++) - { - int x = pos_x + offset[k][0]; - int y = pos_y + offset[k][1]; - - if (x >= edgeSize && x + edgeSize < xsize) { - size_t ix = y * xsize + (x - edgeSize); - size_t ix2 = ix + 2 * edgeSize; - XybDiffLowFreqSquaredAccumulate( - w * (r[ix] - r[ix2]), - w * (g[ix] - g[ix2]), - w * (b[ix] - b[ix2]), - w * (r2[ix] - r2[ix2]), - w * (g2[ix] - g2[ix2]), - w * (b2[ix] - b2[ix2]), - 1.0, local_xyb); - ++local_count; - } - if (y >= edgeSize && y + edgeSize < ysize) { - size_t ix = (y - edgeSize) * xsize + x; - size_t ix2 = ix + 2 * edgeSize * xsize; - XybDiffLowFreqSquaredAccumulate( - w * (r[ix] - r[ix2]), - w * (g[ix] - g[ix2]), - w * (b[ix] - b[ix2]), - w * (r2[ix] - r2[ix2]), - w * (g2[ix] - g2[ix2]), - w * (b2[ix] - b2[ix2]), - 1.0, local_xyb); - ++local_count; - } - } - - const double weight = 0.01617112696; - const double mul = weight * 8.0 / local_count; + double diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize, + r, g, b, + r2, g2, b2, + &diff_xyb[0]); int idx = (res_y * res_xsize + res_x) * 3; - result[idx] = local_xyb[0]; - result[idx + 1] = local_xyb[1]; - result[idx + 2] = local_xyb[2]; + result[idx] = diff_xyb[0]; + result[idx + 1] = diff_xyb[1]; + result[idx + 2] = diff_xyb[2]; } __kernel void edgeDetectorLowFreq(__global float *result, @@ -590,9 +609,9 @@ __kernel void edgeDetectorLowFreq(__global float *result, const double kMul = 10; - result[res_ix * 3] = max_diff_xyb[0] * kMul; - result[res_ix * 3 + 1] = max_diff_xyb[1] * kMul; - result[res_ix * 3 + 2] = max_diff_xyb[2] * kMul; + 
result[res_ix * 3] += max_diff_xyb[0] * kMul; + result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul; + result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; } #define kBlockEdge 8 diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 6b462dd7..fe014d70 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -13,7 +13,7 @@ ocl_args_d_t& getOcl(void) if (bInit == true) return ocl; bInit = true; - cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); + cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_CPU); if (CL_SUCCESS != err) { LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); @@ -883,8 +883,8 @@ void clCombineChannelsEx( cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; + const size_t res_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t res_ysize = ((ysize - 8 + step) + step - 1) / step; cl_int clxsize = xsize; cl_int clysize = ysize; diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index a714bf44..9d6ceba2 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -20,7 +20,7 @@ void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, - cl_mem block_diff_ac/*out*/); + cl_mem block_diff_ac/*in,out*/); void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr); @@ -31,3 +31,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, float* result); + +void clCombineChannelsEx( + ocl_channels mask, + ocl_channels mask_dc, + cl_mem block_diff_dc, + cl_mem block_diff_ac, + cl_mem edge_detector_map, + size_t xsize, size_t ysize, + size_t step, + cl_mem result/*out*/); diff --git a/clguetzli/clguetzli_test.cpp 
b/clguetzli/clguetzli_test.cpp index 1bb20681..c229806e 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -30,7 +30,6 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* result_r, const float* result_g, const float* result_b, const float* result_r2, const float* result_g2, const float* result_b2) { - return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -82,7 +81,6 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result) { - return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -125,7 +123,6 @@ void clBlockDiffMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc, const float* result_diff_ac) { - return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -173,9 +170,9 @@ void clBlockDiffMap(const float* r, const float* g, const float* b, void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, - const float* result_diff_dc) + const float* orign_ac, + const float* result_diff_ac) { - return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -188,7 +185,7 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, ocl_channels xyb0 = ocl.allocMemChannels(channel_size); ocl_channels xyb1 = ocl.allocMemChannels(channel_size); - cl_mem block_diff_dc = ocl.allocMem(reschannel_size); + cl_mem block_diff_ac = ocl.allocMem(reschannel_size); clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); @@ -196,22 +193,24 @@ 
void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, block_diff_ac, CL_FALSE, 0, reschannel_size, orign_ac, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); - clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc); + clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); - cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); - clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, reschannel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, reschannel_size, NULL, NULL); err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); ocl.releaseMemChannels(xyb1); - clReleaseMemObject(block_diff_dc); + clReleaseMemObject(block_diff_ac); } void clMask(const float* r, const float* g, const float* b, @@ -272,21 +271,82 @@ void clMask(const float* r, const float* g, const float* b, } // ian todo -void clCombineChannels(void) +void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, + const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, + const float *block_diff_dc, const float *block_diff_ac, + const float *edge_detector_map, + size_t xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, + 
size_t step, + float *result) { + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + size_t channel_size = xsize * ysize * sizeof(float); + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); + cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); + cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); + cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float)); + + clEnqueueWriteBuffer(ocl.commandQueue, mask.x, CL_FALSE, 0, channel_size, mask_xyb_x, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mask.y, CL_FALSE, 0, channel_size, mask_xyb_y, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mask.b, CL_FALSE, 0, channel_size, mask_xyb_b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.x, CL_FALSE, 0, channel_size, mask_xyb_dc_x, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.y, CL_FALSE, 0, channel_size, mask_xyb_dc_y, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.b, CL_FALSE, 0, channel_size, mask_xyb_dc_b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_dc, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_dc, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_ac, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_ac, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_edge_detector_map, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), edge_detector_map, 0, NULL, NULL); + + clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, step, cl_result); + + cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err); + + 
FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + clReleaseMemObject(cl_block_diff_dc); + clReleaseMemObject(cl_block_diff_ac); + clReleaseMemObject(cl_edge_detector_map); + clReleaseMemObject(cl_result); } // ian todo -void clCalculateDiffmapEx(void) +void clCalculateDiffmapEx(const size_t xsize, const size_t ysize, + const size_t step, + float *diffmap) { } // chrisk todo -void clBlur(void) +void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result) { + if (xsize < 100 || ysize < 100) return; + + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, channel_size, channel, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clBlurEx(r, xsize, ysize, sigma, border_ratio, r); + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, channel_size, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); } // chrisk todo @@ -302,13 +362,17 @@ void clUpsample(void) } // ian todo -void clDiffPrecompute(void) +void clDiffPrecompute( + const float *xyb0_x, const float *xyb0_y, const float *xyb0_b, + const float *xyb1_x, const float *xyb1_y, const float *xyb1_b, + size_t xsize, size_t ysize, + float *mask_x, float *mask_y, float *mask_b) { } // ian todo -void clAverage5x5(void) +void clAverage5x5(int xsize, int ysize, float *diffs) { } @@ -320,7 +384,7 @@ void clMinSquareVal(void) } // ian todo -void clScaleImage(void) +void clScaleImage(double scale, float *result) { } diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 3b62144e..71500eff 100644 --- 
a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -9,6 +9,8 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* result_r, const float* result_g, const float* result_b, const float* result_r2, const float* result_g2, const float* result_b2); +void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result); + void clEdgeDetectorMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, @@ -22,6 +24,7 @@ void clBlockDiffMap(const float* r, const float* g, const float* b, void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, + const float* orign_ac, const float* result_diff_dc); void clMask(const float* r, const float* g, const float* b, @@ -29,3 +32,29 @@ void clMask(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, const float* mask_r, const float* mask_g, const float* mask_b, const float* maskdc_r, const float* maskdc_g, const float* maskdc_b); + +void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, + const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, + const float *block_diff_dc, const float *block_diff_ac, + const float *edge_detector_map, + size_t xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, + size_t step, + float *result); + +void clCalculateDiffmapEx(const size_t xsize, const size_t ysize, + const size_t step, + float *diffmap); + +void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio); + +void clDiffPrecompute( + const float *xyb0_x, const float *xyb0_y, const float *xyb0_b, + const float *xyb1_x, const float *xyb1_y, const float *xyb1_b, + size_t xsize, size_t ysize, + float *mask_x, float *mask_y, float *mask_b); 
+ +void clAverage5x5(int xsize, int ysize, float *diffs); + +void clScaleImage(double scale, float *result); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 138234ad..8869b518 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -98,6 +98,9 @@ static void Convolution(size_t xsize, size_t ysize, void Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { + std::vector orignChannel(xsize * ysize); + memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); + PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); @@ -133,6 +136,8 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, } } } + + clBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); } // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. 
@@ -834,7 +839,7 @@ void MaskHighIntensityChange( c1[0].data(), c1[1].data(), c1[2].data(), xsize, ysize, xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb0[0].data(), xyb1[1].data(), xyb1[2].data()); + xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); } double SimpleGamma(double v) { @@ -1062,6 +1067,8 @@ static void ScaleImage(double scale, std::vector *result) { for (size_t i = 0; i < result->size(); ++i) { (*result)[i] *= static_cast(scale); } + + clScaleImage(scale, (*result).data()); } // Making a cluster of local errors to be more impactful than @@ -1121,6 +1128,7 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize, } ScaleImage(scale, diffmap); } + clCalculateDiffmapEx(xsize, ysize, step, (*diffmap).data()); } void ButteraugliComparator::DiffmapOpsinDynamicsImage( @@ -1238,15 +1246,18 @@ void ButteraugliComparator::EdgeDetectorMap( } clEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - (*edge_detector_map).data()); + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*edge_detector_map).data()); } void ButteraugliComparator::EdgeDetectorLowFreq( const std::vector > &xyb0, const std::vector > &xyb1, std::vector* block_diff_ac) { + + std::vector orign_ac = *block_diff_ac; + PROFILER_FUNC; static const double kSigma = 14; static const double kMul = 10; @@ -1299,9 +1310,9 @@ void ButteraugliComparator::EdgeDetectorLowFreq( } clEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - (*block_diff_ac).data()); + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + orign_ac.data(), (*block_diff_ac).data()); } void ButteraugliComparator::CombineChannels( @@ -1314,7 +1325,7 @@ void ButteraugliComparator::CombineChannels( PROFILER_FUNC; result->resize(res_xsize_ * res_ysize_); for (size_t res_y = 0; res_y + (8 - step_) < ysize_; 
res_y += step_) { - for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { + for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) { size_t res_ix = (res_y * res_xsize_ + res_x) / step_; double mask[3]; double dc_mask[3]; @@ -1328,6 +1339,10 @@ void ButteraugliComparator::CombineChannels( DotProduct(&edge_detector_map[3 * res_ix], mask)); } } + clCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), + mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), + block_diff_dc.data(), + block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data()); } double ButteraugliScoreFromDiffmap(const std::vector& diffmap) { @@ -1522,6 +1537,8 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { } *diffs = result; ScaleImage(scale, diffs); + + clAverage5x5(xsize, ysize, (*diffs).data()); } void DiffPrecompute( @@ -1577,6 +1594,11 @@ void DiffPrecompute( } } } + clDiffPrecompute( + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data()); } void Mask(const std::vector > &xyb0, From f947da99d3e62e3745c00391c5db2da0012e8212 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 10 May 2017 02:09:14 +0800 Subject: [PATCH 053/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3blockDiffMap=E8=AE=A1?= =?UTF-8?q?=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 31b87f9d..358e931b 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1042,11 +1042,11 @@ __kernel void blockDiffMap(__global float* r, __global float* g, __global float* int pos_x = res_x * step; int pos_y = res_y * step; - if ((pos_x + kBlockEdge - step - 1) >= ysize) return; 
- if ((pos_y + kBlockEdge - step - 1) >= xsize) return; + if ((pos_x + kBlockEdge - step - 1) >= xsize) return; + if ((pos_y + kBlockEdge - step - 1) >= ysize) return; size_t res_ix = res_y * res_xsize + res_x; - size_t offset = min(res_y * step, ysize - 8) * xsize + min(res_x * step, xsize - 8); + size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8); double block0[3 * kBlockEdge * kBlockEdge]; double block1[3 * kBlockEdge * kBlockEdge]; From 6ba5810988f6544fa24f01d606455e31db5036b3 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Wed, 10 May 2017 14:21:13 +0800 Subject: [PATCH 054/189] Merge remote-tracking branch 'origin/master' --- .gitignore | 1 + clguetzli/clguetzli.cpp | 34 ++-- clguetzli/clguetzli.h | 6 + clguetzli/clguetzli_test.cpp | 72 +++++++-- clguetzli/clguetzli_test.h | 35 ++-- guetzli/guetzli.cc | 3 + .../butteraugli/butteraugli/butteraugli.cc | 149 ++++++++++++------ 7 files changed, 211 insertions(+), 89 deletions(-) diff --git a/.gitignore b/.gitignore index dd10da52..3d270281 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ ipch/ *.cachefile *.VC.db *.VC.VC.opendb +guetzli.vcxproj.user diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index fe014d70..2fb1262d 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -4,6 +4,7 @@ #include "clguetzli.h" extern bool g_useOpenCL = false; +extern bool g_checkOpenCL = false; ocl_args_d_t& getOcl(void) { @@ -74,7 +75,7 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); - size_t oxsize = xsize / xstep; + size_t oxsize = (xsize + xstep - 1) / xstep; cl_int clxsize = xsize; cl_int clxstep = xstep; @@ -318,10 +319,21 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, clReleaseMemObject(mem_expn); } -void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred, size_t size) +void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t 
ysize) { + static const double kSigma = 1.1; + + cl_int channel_size = xsize * ysize * sizeof(float); + + cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_int clSize = size; + ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size); + + clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + cl_int clSize = xsize * ysize; cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g); @@ -331,8 +343,8 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b); clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize); - size_t globalWorkSize[1] = { size }; - cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + size_t globalWorkSize[1] = { xsize * ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clOpsinDynamicsImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); @@ -342,29 +354,24 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred { LogError("Error: clOpsinDynamicsImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } + + ocl.releaseMemChannels(rgb_blurred); } void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b) { - static const double kSigma = 1.1; - cl_int channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); ocl_channels rgb = ocl.allocMemChannels(channel_size); - ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size); clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); 
clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); - clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); - clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); - - clOpsinDynamicsImageEx(rgb, rgb_blurred, xsize * ysize); + clOpsinDynamicsImageEx(rgb, xsize, ysize); cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); @@ -382,7 +389,6 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); - ocl.releaseMemChannels(rgb_blurred); } void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 9d6ceba2..21ec7237 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -3,6 +3,7 @@ #include "ocl.h" extern bool g_useOpenCL; +extern bool g_checkOpenCL; void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, ocl_channels xyb1/*in,out*/, @@ -41,3 +42,8 @@ void clCombineChannelsEx( size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/); + +void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, + cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio, + cl_mem result/*out*/); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index c229806e..bac52b60 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -24,7 +24,7 @@ int floatCompare(const float* a, const float* b, size_t size, const char* szFunc return count; } -void clMaskHighIntensityChange(const float* r, const float* g, const 
float* b, +void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, const float* result_r, const float* result_g, const float* result_b, @@ -76,7 +76,7 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b, } // strong to -void clEdgeDetectorMap(const float* r, const float* g, const float* b, +void tclEdgeDetectorMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, const float* result) @@ -118,7 +118,7 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b, } // strong todo -void clBlockDiffMap(const float* r, const float* g, const float* b, +void tclBlockDiffMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc, const float* result_diff_ac) @@ -167,7 +167,7 @@ void clBlockDiffMap(const float* r, const float* g, const float* b, } // strong to -void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, +void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, const float* orign_ac, @@ -213,13 +213,12 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, clReleaseMemObject(block_diff_ac); } -void clMask(const float* r, const float* g, const float* b, +void tclMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, const float* mask_r, const float* mask_g, const float* mask_b, const float* maskdc_r, const float* maskdc_g, const float* maskdc_b) { - return; if (xsize < 100 || ysize < 100) return; size_t channel_size = xsize * ysize * sizeof(float); @@ -271,7 +270,7 @@ void clMask(const float* r, 
const float* g, const float* b, } // ian todo -void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, +void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, const float *block_diff_dc, const float *block_diff_ac, const float *edge_detector_map, @@ -316,7 +315,7 @@ void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const f } // ian todo -void clCalculateDiffmapEx(const size_t xsize, const size_t ysize, +void tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, float *diffmap) { @@ -324,7 +323,7 @@ void clCalculateDiffmapEx(const size_t xsize, const size_t ysize, } // chrisk todo -void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result) +void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result) { if (xsize < 100 || ysize < 100) return; @@ -350,19 +349,55 @@ void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double bor } // chrisk todo -void clConvolution(void) +void tclConvolution(float* result, size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* orign_result) { + return; + if (xsize < 100 || ysize < 100) return; + + int dxsize = (xsize + xstep - 1) / xstep; + size_t result_size = dxsize * ysize * sizeof(float); + size_t inp_size = xsize * ysize * sizeof(float); + size_t multipliers_size = len * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(result_size); + cl_mem i = ocl.allocMem(inp_size); + cl_mem m = ocl.allocMem(len); + + clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, result_size, result, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, 
NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, m, CL_FALSE, 0, multipliers_size, multipliers, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clConvolutionEx(i, xsize, ysize, m, len, xstep, offset, border_ratio, r); + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(orign_result, r_r, dxsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); + clReleaseMemObject(i); + clReleaseMemObject(m); } // chirsk todo -void clUpsample(void) +void tclUpsample(void) { } // ian todo -void clDiffPrecompute( +void tclDiffPrecompute( const float *xyb0_x, const float *xyb0_y, const float *xyb0_b, const float *xyb1_x, const float *xyb1_y, const float *xyb1_b, size_t xsize, size_t ysize, @@ -372,19 +407,26 @@ void clDiffPrecompute( } // ian todo -void clAverage5x5(int xsize, int ysize, float *diffs) +void tclAverage5x5(int xsize, int ysize, float *diffs) { } // chrisk todo -void clMinSquareVal(void) +void tclMinSquareVal(void) { } // ian todo -void clScaleImage(double scale, float *result) +void tclScaleImage(double scale, float *result) { } + +// strong todo +void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize, + float* result_r, float* result_g, float* result_b) +{ + +} \ No newline at end of file diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 71500eff..f57c16e0 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -3,37 +3,37 @@ ocl_args_d_t& getOcl(void); -void clMaskHighIntensityChange(const float* r, const float* g, const float* b, +void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, const float* result_r, const float* result_g, const float* result_b, 
const float* result_r2, const float* result_g2, const float* result_b2); -void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result); +void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result); -void clEdgeDetectorMap(const float* r, const float* g, const float* b, +void tclEdgeDetectorMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, const float* result); -void clBlockDiffMap(const float* r, const float* g, const float* b, +void tclBlockDiffMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc, const float* result_diff_ac); -void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b, +void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, const float* orign_ac, const float* result_diff_dc); -void clMask(const float* r, const float* g, const float* b, +void tclMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, const float* mask_r, const float* mask_g, const float* mask_b, const float* maskdc_r, const float* maskdc_g, const float* maskdc_b); -void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, +void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, const float *block_diff_dc, const float *block_diff_ac, const float *edge_detector_map, @@ -42,19 +42,30 @@ void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const f size_t step, float *result); -void 
clCalculateDiffmapEx(const size_t xsize, const size_t ysize, +void tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, float *diffmap); -void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, +void tclConvolution(float* result, size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* orign_result); + +void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); -void clDiffPrecompute( +void tclDiffPrecompute( const float *xyb0_x, const float *xyb0_y, const float *xyb0_b, const float *xyb1_x, const float *xyb1_y, const float *xyb1_b, size_t xsize, size_t ysize, float *mask_x, float *mask_y, float *mask_b); -void clAverage5x5(int xsize, int ysize, float *diffs); +void tclAverage5x5(int xsize, int ysize, float *diffs); + +void tclScaleImage(double scale, float *result); -void clScaleImage(double scale, float *result); +void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize, + float* result_r, float* result_g, float* result_b); diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index aa328cb1..32103a74 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -265,6 +265,9 @@ int main(int argc, char** argv) { else if (!strcmp(argv[opt_idx], "--opencl")) { g_useOpenCL = true; } + else if (!strcmp(argv[opt_idx], "--checkcl")) { + g_checkOpenCL = true; + } else if (!strcmp(argv[opt_idx], "--")) { opt_idx++; break; diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 8869b518..cce14dad 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -69,6 +69,11 @@ static void Convolution(size_t xsize, size_t ysize, const float* __restrict__ inp, float border_ratio, float* __restrict__ result) { + + int dxsize = (xsize + xstep - 1) / xstep; + 
std::vector newResult(dxsize * ysize); + memcpy(newResult.data(), result, dxsize * ysize * sizeof(float)); + PROFILER_FUNC; float weight_no_border = 0; @@ -93,13 +98,19 @@ static void Convolution(size_t xsize, size_t ysize, result[ox * ysize + y] = static_cast(sum * scale); } } + + tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); } void Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { - std::vector orignChannel(xsize * ysize); - memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); + std::vector orignChannel; + if (g_checkOpenCL) + { + orignChannel.resize(xsize * ysize); + memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); + } PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. @@ -137,7 +148,10 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, } } - clBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); + if (g_checkOpenCL) + { + tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); + } } // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. 
@@ -835,11 +849,14 @@ void MaskHighIntensityChange( } } - clMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), - c1[0].data(), c1[1].data(), c1[2].data(), - xsize, ysize, - xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); + if (g_checkOpenCL) + { + tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), + c1[0].data(), c1[1].data(), c1[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); + } } double SimpleGamma(double v) { @@ -1021,6 +1038,12 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, return; } + std::vector< std::vector> orig_rgb; + if (g_checkOpenCL) + { + orig_rgb = rgb; + } + PROFILER_FUNC; std::vector > blurred = rgb; static const double kSigma = 1.1; @@ -1037,16 +1060,6 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; - -#ifdef ENABLE_OPENCL_CHECK - double sensitivity_new[3]; - sensitivity_new[0] = GammaNonRecursion(pre_mixed[0]) / pre_mixed[0]; - assert(fabs(sensitivity[0] - sensitivity_new[0]) < 0.01); - sensitivity_new[1] = GammaNonRecursion(pre_mixed[1]) / pre_mixed[1]; - assert(fabs(sensitivity[1] - sensitivity_new[1]) < 0.01); - sensitivity_new[2] = GammaNonRecursion(pre_mixed[2]) / pre_mixed[2]; - assert(fabs(sensitivity[2] - sensitivity_new[2]) < 0.01); -#endif // ENABLE_OPENCL_CHECK } double cur_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] }; double cur_mixed[3]; @@ -1060,6 +1073,12 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, rgb[1][i] = static_cast(y); rgb[2][i] = static_cast(z); } + + if (g_checkOpenCL) + { + tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize, + rgb[0].data(), rgb[1].data(), rgb[2].data()); + } } static void ScaleImage(double scale, std::vector *result) { 
@@ -1068,7 +1087,10 @@ static void ScaleImage(double scale, std::vector *result) { (*result)[i] *= static_cast(scale); } - clScaleImage(scale, (*result).data()); + if (g_checkOpenCL) + { + tclScaleImage(scale, (*result).data()); + } } // Making a cluster of local errors to be more impactful than @@ -1128,7 +1150,10 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize, } ScaleImage(scale, diffmap); } - clCalculateDiffmapEx(xsize, ysize, step, (*diffmap).data()); + if (g_checkOpenCL) + { + tclCalculateDiffmap(xsize, ysize, step, (*diffmap).data()); + } } void ButteraugliComparator::DiffmapOpsinDynamicsImage( @@ -1208,10 +1233,14 @@ void ButteraugliComparator::BlockDiffMap( } } } - clBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - (*block_diff_dc).data(), (*block_diff_ac).data()); + + if (g_checkOpenCL) + { + tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*block_diff_dc).data(), (*block_diff_ac).data()); + } } void ButteraugliComparator::EdgeDetectorMap( @@ -1245,10 +1274,13 @@ void ButteraugliComparator::EdgeDetectorMap( } } - clEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - (*edge_detector_map).data()); + if (g_checkOpenCL) + { + tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*edge_detector_map).data()); + } } void ButteraugliComparator::EdgeDetectorLowFreq( @@ -1256,7 +1288,11 @@ void ButteraugliComparator::EdgeDetectorLowFreq( const std::vector > &xyb1, std::vector* block_diff_ac) { - std::vector orign_ac = *block_diff_ac; + std::vector orign_ac; + if (g_checkOpenCL) + { + orign_ac = *block_diff_ac; + } PROFILER_FUNC; static const double kSigma = 14; @@ -1309,10 +1345,13 @@ void 
ButteraugliComparator::EdgeDetectorLowFreq( } } - clEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - orign_ac.data(), (*block_diff_ac).data()); + if (g_checkOpenCL) + { + tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + orign_ac.data(), (*block_diff_ac).data()); + } } void ButteraugliComparator::CombineChannels( @@ -1339,10 +1378,14 @@ void ButteraugliComparator::CombineChannels( DotProduct(&edge_detector_map[3 * res_ix], mask)); } } - clCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), - mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), - block_diff_dc.data(), - block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data()); + + if (g_checkOpenCL) + { + tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), + mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), + block_diff_dc.data(), + block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data()); + } } double ButteraugliScoreFromDiffmap(const std::vector& diffmap) { @@ -1538,7 +1581,10 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { *diffs = result; ScaleImage(scale, diffs); - clAverage5x5(xsize, ysize, (*diffs).data()); + if (g_checkOpenCL) + { + tclAverage5x5(xsize, ysize, (*diffs).data()); + } } void DiffPrecompute( @@ -1594,11 +1640,15 @@ void DiffPrecompute( } } } - clDiffPrecompute( - xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize, ysize, - ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data()); + + if (g_checkOpenCL) + { + tclDiffPrecompute( + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, 
ysize, + ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data()); + } } void Mask(const std::vector > &xyb0, @@ -1653,11 +1703,14 @@ void Mask(const std::vector > &xyb0, ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); } - clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize, ysize, - (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), - (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + if (g_checkOpenCL) + { + tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + } } } // namespace butteraugli From 853222f43a805e99a936158ef69d690fe6e5ac41 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Wed, 10 May 2017 14:54:56 +0800 Subject: [PATCH 055/189] add clMinSquareVal test --- clguetzli/clguetzli.h | 2 ++ clguetzli/clguetzli_test.cpp | 25 +++++++++++++++-- clguetzli/clguetzli_test.h | 4 +++ .../butteraugli/butteraugli/butteraugli.cc | 27 ++++++++++++++++--- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 21ec7237..3c05aeb9 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -47,3 +47,5 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio, cl_mem result/*out*/); + +void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index bac52b60..25156a59 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -357,7 +357,6 @@ void tclConvolution(float* result, size_t xsize, size_t ysize, float border_ratio, float* orign_result) { - return; if (xsize < 100 || ysize < 100) return; int 
dxsize = (xsize + xstep - 1) / xstep; @@ -413,9 +412,31 @@ void tclAverage5x5(int xsize, int ysize, float *diffs) } // chrisk todo -void tclMinSquareVal(void) +void tclMinSquareVal(float *img, size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values) { + if (xsize < 100 || ysize < 100) return; + + size_t img_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(img_size); + + clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, img_size, img, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + clMinSquareValEx(r, xsize, ysize, square_size, offset); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, img_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(values, r_r, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, img_size, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); } // ian todo diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index f57c16e0..4e94c490 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -69,3 +69,7 @@ void tclScaleImage(double scale, float *result); void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize, float* result_r, float* result_g, float* result_b); + +void tclMinSquareVal(float *img, size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index cce14dad..9168db05 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -70,9 +70,13 @@ static void Convolution(size_t xsize, size_t ysize, float border_ratio, float* __restrict__ result) { - int dxsize = (xsize + xstep - 1) / xstep; - std::vector newResult(dxsize * ysize); - memcpy(newResult.data(), result, 
dxsize * ysize * sizeof(float)); + std::vector newResult; + if (g_checkOpenCL) + { + int dxsize = (xsize + xstep - 1) / xstep; + newResult.resize(dxsize * ysize); + memcpy(newResult.data(), result, dxsize * ysize * sizeof(float)); + } PROFILER_FUNC; float weight_no_border = 0; @@ -99,7 +103,10 @@ static void Convolution(size_t xsize, size_t ysize, } } - tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + if (g_checkOpenCL) + { + tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + } } void Blur(size_t xsize, size_t ysize, float* channel, double sigma, @@ -1490,6 +1497,13 @@ void MinSquareVal(size_t square_size, size_t offset, assert(offset < square_size); std::vector tmp(xsize * ysize); + std::vector img; + if (g_checkOpenCL) + { + img.resize(xsize * ysize); + memcpy(img.data(), values, xsize * ysize * sizeof(float)); + } + for (size_t y = 0; y < ysize; ++y) { const size_t minh = offset > y ? 
0 : y - offset; const size_t maxh = std::min(ysize, y + square_size - offset); @@ -1526,6 +1540,11 @@ void MinSquareVal(size_t square_size, size_t offset, *pValuePoint = min; pValuePoint += xsize; } } + + if (g_checkOpenCL) + { + tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); + } } // ===== Functions used by Mask only ===== From 920de33da90a17a0e6fb9e0487b675758a7fabc1 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 11 May 2017 01:05:03 +0800 Subject: [PATCH 056/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- clguetzli/clguetzli.cl | 8 +-- clguetzli/clguetzli.cpp | 29 ++++++----- clguetzli/clguetzli.h | 6 ++- clguetzli/clguetzli_test.cpp | 50 ++++++++++++------- clguetzli/clguetzli_test.h | 1 + .../butteraugli/butteraugli/butteraugli.cc | 12 ++++- 6 files changed, 66 insertions(+), 40 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 358e931b..b2f58c66 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -354,18 +354,13 @@ __kernel void CombineChannels( __global float *block_diff_ac, __global float *edge_detector_map, int xsize, int ysize, + int res_xsize, int step, __global float *result) { const int res_x = get_global_id(0) * step; const int res_y = get_global_id(1) * step; - const int res_xsize = (xsize + step - 1) / step; - const int res_ysize = (ysize + step - 1) / step; - - //if (res_x * step >= xsize - (8 - step)) return; - //if (res_y * step >= ysize - (8 - step)) return; - double mask[3]; double dc_mask[3]; mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; @@ -382,6 +377,7 @@ __kernel void CombineChannels( DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + DotProduct(&block_diff_ac[3 * res_ix], mask) + DotProduct(&edge_detector_map[3 * res_ix], mask)); + //result[res_ix] = 1; } inline double Interpolate(__constant double *array, int size, double sx) { diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 2fb1262d..542c7858 100644 --- 
a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -14,7 +14,7 @@ ocl_args_d_t& getOcl(void) if (bInit == true) return ocl; bInit = true; - cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_CPU); + cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); if (CL_SUCCESS != err) { LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); @@ -883,15 +883,17 @@ void clCombineChannelsEx( cl_mem block_diff_ac, cl_mem edge_detector_map, size_t xsize, size_t ysize, + size_t res_xsize, size_t step, cl_mem result/*out*/) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); - const size_t res_xsize = ((xsize - 8 + step) + step - 1) / step; - const size_t res_ysize = ((ysize - 8 + step) + step - 1) / step; + const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + cl_int clres_size = res_xsize; cl_int clxsize = xsize; cl_int clysize = ysize; cl_int clstep = step; @@ -908,10 +910,11 @@ void clCombineChannelsEx( clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&edge_detector_map); clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize); clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize); - clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clstep); - clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clres_size); + clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clstep); + clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result); - size_t globalWorkSize[2] = { res_xsize, res_ysize}; + size_t globalWorkSize[2] = { work_xsize, work_ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -1039,8 +1042,9 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, float* r2, float* g2, float* b2, size_t 
xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, size_t step, - float* result) + float* result, size_t result_len) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -1059,13 +1063,14 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, err = clFinish(ocl.commandQueue); - cl_mem mem_result = ocl.allocMem(channel_size); + cl_mem mem_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float)); + const float pattern = 0; + clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, result_len, result, 0, NULL, NULL); + ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); @@ -1078,7 +1083,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc); - clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, step, mem_result); + clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result); clCalculateDiffmapEx(mem_result, xsize, ysize, step); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 3c05aeb9..9fe49b8f 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -25,13 +25,16 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr); +void 
clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize); + void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, float* r2, float* g2, float* b2, size_t xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, size_t step, - float* result); + float* result, size_t result_len); void clCombineChannelsEx( ocl_channels mask, @@ -40,6 +43,7 @@ void clCombineChannelsEx( cl_mem block_diff_ac, cl_mem edge_detector_map, size_t xsize, size_t ysize, + size_t res_xsize, size_t step, cl_mem result/*out*/); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 25156a59..9b96120a 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -30,8 +30,6 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* result_r, const float* result_g, const float* result_b, const float* result_r2, const float* result_g2, const float* result_b2) { - if (xsize < 100 || ysize < 100) return; - size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -81,8 +79,6 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result) { - if (xsize < 100 || ysize < 100) return; - size_t channel_size = xsize * ysize * sizeof(float); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -123,8 +119,6 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, size_t step, const float* result_diff_dc, const float* result_diff_ac) { - if (xsize < 100 || ysize < 100) return; - size_t channel_size = xsize * ysize * sizeof(float); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -173,8 +167,6 @@ void tclEdgeDetectorLowFreq(const 
float* r, const float* g, const float* b, const float* orign_ac, const float* result_diff_ac) { - if (xsize < 100 || ysize < 100) return; - size_t channel_size = xsize * ysize * sizeof(float); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -219,8 +211,6 @@ void tclMask(const float* r, const float* g, const float* b, const float* mask_r, const float* mask_g, const float* mask_b, const float* maskdc_r, const float* maskdc_g, const float* maskdc_b) { - if (xsize < 100 || ysize < 100) return; - size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -269,7 +259,6 @@ void tclMask(const float* r, const float* g, const float* b, ocl.releaseMemChannels(mask_dc); } -// ian todo void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, const float *block_diff_dc, const float *block_diff_ac, @@ -277,6 +266,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const size_t xsize, size_t ysize, size_t res_xsize, size_t res_ysize, size_t step, + float *init_result, float *result) { cl_int err = CL_SUCCESS; @@ -299,8 +289,10 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_dc, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_dc, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_ac, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_ac, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, cl_edge_detector_map, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), edge_detector_map, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_result, CL_FALSE, 0, res_xsize * res_ysize * sizeof(float), init_result, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); - 
clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, step, cl_result); + clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, res_xsize, step, cl_result); cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err); @@ -325,8 +317,6 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize, // chrisk todo void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result) { - if (xsize < 100 || ysize < 100) return; - size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -357,8 +347,6 @@ void tclConvolution(float* result, size_t xsize, size_t ysize, float border_ratio, float* orign_result) { - if (xsize < 100 || ysize < 100) return; - int dxsize = (xsize + xstep - 1) / xstep; size_t result_size = dxsize * ysize * sizeof(float); size_t inp_size = xsize * ysize * sizeof(float); @@ -367,7 +355,7 @@ void tclConvolution(float* result, size_t xsize, size_t ysize, ocl_args_d_t &ocl = getOcl(); cl_mem r = ocl.allocMem(result_size); cl_mem i = ocl.allocMem(inp_size); - cl_mem m = ocl.allocMem(len); + cl_mem m = ocl.allocMem(multipliers_size); clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, result_size, result, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, NULL, NULL); @@ -416,8 +404,6 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { - if (xsize < 100 || ysize < 100) return; - size_t img_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -449,5 +435,31 @@ void tclScaleImage(double scale, float *result) void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize, float* result_r, float* result_g, 
float* result_b) { + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clOpsinDynamicsImageEx(rgb, xsize, ysize); + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result_r, r_r, xsize * ysize); + FLOAT_COMPARE(result_g, r_g, xsize * ysize); + FLOAT_COMPARE(result_b, r_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, channel_size, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); } \ No newline at end of file diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 4e94c490..12d9d057 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -40,6 +40,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const size_t xsize, size_t ysize, size_t res_xsize, size_t res_ysize, size_t step, + float *init_result, float *result); void tclCalculateDiffmap(const size_t xsize, const size_t ysize, diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc 
b/third_party/butteraugli/butteraugli/butteraugli.cc index 9168db05..ae2b8030 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1172,7 +1172,7 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( { result.resize(xsize_ * ysize_); clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data()); + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, res_xsize_, res_ysize_, result.data(), result.size()); return; } @@ -1368,8 +1368,16 @@ void ButteraugliComparator::CombineChannels( const std::vector& block_diff_ac, const std::vector& edge_detector_map, std::vector* result) { + PROFILER_FUNC; result->resize(res_xsize_ * res_ysize_); + + std::vector temp; + if (g_checkOpenCL) + { + temp = *result; + } + for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) { size_t res_ix = (res_y * res_xsize_ + res_x) / step_; @@ -1391,7 +1399,7 @@ void ButteraugliComparator::CombineChannels( tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), block_diff_dc.data(), - block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data()); + block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]); } } From 6ce71751c779613281c22140d112749ff431aad1 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 11 May 2017 01:19:20 +0800 Subject: [PATCH 057/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3OpsinDynamicsImage?= =?UTF-8?q?=E8=BF=90=E7=AE=97=E7=BB=93=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 4 +++- 
third_party/butteraugli/butteraugli/butteraugli.cc | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index b2f58c66..2f5478b8 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -266,17 +266,19 @@ __kernel void OpsinDynamicsImage( double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; double pre_mixed[3]; OpsinAbsorbance(pre, pre_mixed); + double sensitivity[3]; sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; - double cur_rgb[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double cur_rgb[3] = { r[i], g[i], b[i] }; double cur_mixed[3]; OpsinAbsorbance(cur_rgb, cur_mixed); cur_mixed[0] *= sensitivity[0]; cur_mixed[1] *= sensitivity[1]; cur_mixed[2] *= sensitivity[2]; + double x, y, z; RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); r[i] = x; diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index ae2b8030..aa8f9e75 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1064,7 +1064,7 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] }; double pre_mixed[3]; OpsinAbsorbance(pre_rgb, pre_mixed); - sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];¡¡¡¡ sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; } From 44df7121056b8167b25fbf6833a7df1f100846e0 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 11 May 2017 09:44:00 +0800 Subject: [PATCH 058/189] remove redundant parameter --- clguetzli/clguetzli_test.cpp | 11 +++++------ clguetzli/clguetzli_test.h | 4 ++-- third_party/butteraugli/butteraugli/butteraugli.cc | 13 ++----------- 3 files 
changed, 9 insertions(+), 19 deletions(-) diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 9b96120a..6679d0db 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -339,13 +339,13 @@ void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double bo } // chrisk todo -void tclConvolution(float* result, size_t xsize, size_t ysize, +void tclConvolution(size_t xsize, size_t ysize, size_t xstep, size_t len, size_t offset, const float* multipliers, const float* inp, float border_ratio, - float* orign_result) + float* result) { int dxsize = (xsize + xstep - 1) / xstep; size_t result_size = dxsize * ysize * sizeof(float); @@ -353,11 +353,11 @@ void tclConvolution(float* result, size_t xsize, size_t ysize, size_t multipliers_size = len * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem r = ocl.allocMem(result_size); + ocl.allocA(result_size); + cl_mem r = ocl.srcA; cl_mem i = ocl.allocMem(inp_size); cl_mem m = ocl.allocMem(multipliers_size); - clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, result_size, result, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, m, CL_FALSE, 0, multipliers_size, multipliers, 0, NULL, NULL); err = clFinish(ocl.commandQueue); @@ -367,12 +367,11 @@ void tclConvolution(float* result, size_t xsize, size_t ysize, cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - FLOAT_COMPARE(orign_result, r_r, dxsize * ysize); + FLOAT_COMPARE(result, r_r, dxsize * ysize); clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL); err = clFinish(ocl.commandQueue); - clReleaseMemObject(r); clReleaseMemObject(i); clReleaseMemObject(m); } diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 12d9d057..1ce0466c 100644 --- 
a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -47,13 +47,13 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, float *diffmap); -void tclConvolution(float* result, size_t xsize, size_t ysize, +void tclConvolution(size_t xsize, size_t ysize, size_t xstep, size_t len, size_t offset, const float* multipliers, const float* inp, float border_ratio, - float* orign_result); + float* result); void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index aa8f9e75..948fea2e 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -69,15 +69,6 @@ static void Convolution(size_t xsize, size_t ysize, const float* __restrict__ inp, float border_ratio, float* __restrict__ result) { - - std::vector newResult; - if (g_checkOpenCL) - { - int dxsize = (xsize + xstep - 1) / xstep; - newResult.resize(dxsize * ysize); - memcpy(newResult.data(), result, dxsize * ysize * sizeof(float)); - } - PROFILER_FUNC; float weight_no_border = 0; @@ -105,7 +96,7 @@ static void Convolution(size_t xsize, size_t ysize, if (g_checkOpenCL) { - tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); } } @@ -1064,7 +1055,7 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] }; double pre_mixed[3]; OpsinAbsorbance(pre_rgb, pre_mixed); - sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];¡¡¡¡ + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; } From 79cb8cd9446ee2fd4af865ae7d2c182bebadca5e Mon Sep 17 00:00:00 2001 From: 
ianhuang-777 <306168910@qq.com> Date: Thu, 11 May 2017 17:04:19 +0800 Subject: [PATCH 059/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- clguetzli/clguetzli.cl | 80 +++++++++------ clguetzli/clguetzli.cpp | 24 ++--- clguetzli/clguetzli.h | 10 ++ clguetzli/clguetzli_test.cpp | 97 +++++++++++++++++-- clguetzli/clguetzli_test.h | 15 ++- .../butteraugli/butteraugli/butteraugli.cc | 24 +++-- 6 files changed, 192 insertions(+), 58 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 2f5478b8..b4e3cdc8 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -172,7 +172,7 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int const int xsize = get_global_size(0); const int ysize = get_global_size(1); - const int oxsize = xsize / xstep; + const int oxsize = (xsize + xstep - 1) / xstep; const int sample_x = x / xstep; const int sample_y = y / ystep; @@ -1140,30 +1140,56 @@ __kernel void MaskHighIntensityChange( } -#define XybToVals_off 11.38708334481672 -#define XybToVals_inc 14.550189611520716 -__constant double XybToVals_lut[21] = { +#define XybToVals_off_x 11.38708334481672 +#define XybToVals_inc_x 14.550189611520716 +__constant double XybToVals_lut_x[21] = { 0, - XybToVals_off, - XybToVals_off + 1 * XybToVals_inc, - XybToVals_off + 2 * XybToVals_inc, - XybToVals_off + 3 * XybToVals_inc, - XybToVals_off + 4 * XybToVals_inc, - XybToVals_off + 5 * XybToVals_inc, - XybToVals_off + 6 * XybToVals_inc, - XybToVals_off + 7 * XybToVals_inc, - XybToVals_off + 8 * XybToVals_inc, - XybToVals_off + 9 * XybToVals_inc, - XybToVals_off + 10 * XybToVals_inc, - XybToVals_off + 11 * XybToVals_inc, - XybToVals_off + 12 * XybToVals_inc, - XybToVals_off + 13 * XybToVals_inc, - XybToVals_off + 14 * XybToVals_inc, - XybToVals_off + 15 * XybToVals_inc, - XybToVals_off + 16 * XybToVals_inc, - XybToVals_off + 17 * XybToVals_inc, - XybToVals_off + 18 * XybToVals_inc, - XybToVals_off + 19 * XybToVals_inc, + 
XybToVals_off_x, + XybToVals_off_x + 1 * XybToVals_inc_x, + XybToVals_off_x + 2 * XybToVals_inc_x, + XybToVals_off_x + 3 * XybToVals_inc_x, + XybToVals_off_x + 4 * XybToVals_inc_x, + XybToVals_off_x + 5 * XybToVals_inc_x, + XybToVals_off_x + 6 * XybToVals_inc_x, + XybToVals_off_x + 7 * XybToVals_inc_x, + XybToVals_off_x + 8 * XybToVals_inc_x, + XybToVals_off_x + 9 * XybToVals_inc_x, + XybToVals_off_x + 10 * XybToVals_inc_x, + XybToVals_off_x + 11 * XybToVals_inc_x, + XybToVals_off_x + 12 * XybToVals_inc_x, + XybToVals_off_x + 13 * XybToVals_inc_x, + XybToVals_off_x + 14 * XybToVals_inc_x, + XybToVals_off_x + 15 * XybToVals_inc_x, + XybToVals_off_x + 16 * XybToVals_inc_x, + XybToVals_off_x + 17 * XybToVals_inc_x, + XybToVals_off_x + 18 * XybToVals_inc_x, + XybToVals_off_x + 19 * XybToVals_inc_x, +}; + +#define XybToVals_off_y 1.4103373714040413 +#define XybToVals_inc_y 0.7084088867024 +__constant double XybToVals_lut_y[21] = { + 0, + XybToVals_off_y, + XybToVals_off_y + 1 * XybToVals_inc_y, + XybToVals_off_y + 2 * XybToVals_inc_y, + XybToVals_off_y + 3 * XybToVals_inc_y, + XybToVals_off_y + 4 * XybToVals_inc_y, + XybToVals_off_y + 5 * XybToVals_inc_y, + XybToVals_off_y + 6 * XybToVals_inc_y, + XybToVals_off_y + 7 * XybToVals_inc_y, + XybToVals_off_y + 8 * XybToVals_inc_y, + XybToVals_off_y + 9 * XybToVals_inc_y, + XybToVals_off_y + 10 * XybToVals_inc_y, + XybToVals_off_y + 11 * XybToVals_inc_y, + XybToVals_off_y + 12 * XybToVals_inc_y, + XybToVals_off_y + 13 * XybToVals_inc_y, + XybToVals_off_y + 14 * XybToVals_inc_y, + XybToVals_off_y + 15 * XybToVals_inc_y, + XybToVals_off_y + 16 * XybToVals_inc_y, + XybToVals_off_y + 17 * XybToVals_inc_y, + XybToVals_off_y + 18 * XybToVals_inc_y, + XybToVals_off_y + 19 * XybToVals_inc_y, }; void XybToVals( @@ -1174,8 +1200,8 @@ void XybToVals( const double ymul = 2.28148649801; const double zmul = 1.87816926918; - *valx = Interpolate(&XybToVals_lut[0], 21, x * xmul); - *valy = Interpolate(&XybToVals_lut[0], 21, y * ymul); + *valx 
= Interpolate(&XybToVals_lut_x[0], 21, x * xmul); + *valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul); *valz = zmul * z; } @@ -1195,7 +1221,7 @@ __kernel void DiffPrecompute( double valsv1[3] = { 0.0 }; int ix2; - size_t ix = x + xsize * y; + int ix = x + xsize * y; if (x + 1 < xsize) { ix2 = ix + 1; } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 542c7858..b37eabcd 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -594,14 +594,14 @@ void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.x); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.y); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.x); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.y); clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.r); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.g); + clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.x); + clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.y); clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mask.b); size_t globalWorkSize[2] = { xsize, ysize }; @@ -935,14 +935,15 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step cl_int clxsize = xsize; cl_int clysize = ysize; cl_int clstep = step; - ocl.allocC(xsize * ysize * sizeof(float)); + + cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float)); cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; clSetKernelArg(kernel, 0, sizeof(cl_mem), 
(void*)&diffmap); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize); clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&ocl.dstMem); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_diffmap); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -954,7 +955,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); } err = clFinish(ocl.commandQueue); - err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.dstMem, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, mem_diffmap, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); @@ -964,6 +965,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step { LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } + clReleaseMemObject(mem_diffmap); } void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred) @@ -1029,14 +1031,14 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int s2 = (8 - step) / 2; ocl_args_d_t &ocl = getOcl(); - ocl.allocA((xsize - s) * (ysize - s) * sizeof(float)); - cl_mem blurred = ocl.srcA; + cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); clCalculateDiffmapGetBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred); static const double border_ratio = 0.03027655136; clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); clGetDiffmapFromBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred); 
clScaleImageEx(diffmap, xsize * ysize, scale); + clReleaseMemObject(blurred); } void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 9fe49b8f..13111404 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -53,3 +53,13 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, cl_mem result/*out*/); void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset); + +void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep, + cl_mem result/*out*/); + +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step); + +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); + +void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 6679d0db..8965a633 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "clguetzli_test.h" #include "clguetzli.h" #include "ocl.h" @@ -298,6 +299,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, res_xsize * res_ysize * sizeof(float), NULL, NULL); ocl.releaseMemChannels(mask); ocl.releaseMemChannels(mask_dc); clReleaseMemObject(cl_block_diff_dc); @@ -309,9 +311,21 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const // ian todo void tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, - float *diffmap) + float *diffmap, size_t org_len, + float *diffmap_cmp) { + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + size_t length = xsize * ysize * sizeof(float); + cl_mem mem_diffmap = 
ocl.allocMem(length); + clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL); + clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step); + //cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err); + //err = clFinish(ocl.commandQueue); + //FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize); + //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, length, NULL, NULL); + clReleaseMemObject(mem_diffmap); } // chrisk todo @@ -377,19 +391,75 @@ void tclConvolution(size_t xsize, size_t ysize, } // chirsk todo -void tclUpsample(void) +void tclUpsample(float* image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep, + float* result) { + int dxsize = (xsize + xstep - 1) / xstep; + int dysize = (ysize + ystep - 1) / ystep; + size_t img_size = dxsize * dysize * sizeof(float); + size_t result_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem img = ocl.allocMem(img_size); + ocl.allocA(result_size); + cl_mem r = ocl.srcA; + + clEnqueueWriteBuffer(ocl.commandQueue, img, CL_FALSE, 0, img_size, image, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clUpsampleEx(img, xsize, ysize, xstep, ystep, r); + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(img); } // ian todo void tclDiffPrecompute( - const float *xyb0_x, const float *xyb0_y, const float *xyb0_b, - const float *xyb1_x, const float *xyb1_y, const float *xyb1_b, + const std::vector > &xyb0, + const std::vector > &xyb1, size_t xsize, size_t ysize, - float *mask_x, float *mask_y, float *mask_b) + std::vector > *mask_cmp) 
{ - + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + size_t channel_size = xsize * ysize * sizeof(float); + ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size); + ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size); + ocl_channels cl_mask = ocl.allocMemChannels(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.x, CL_FALSE, 0, channel_size, xyb0[0].data(), 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.y, CL_FALSE, 0, channel_size, xyb0[1].data(), 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.b, CL_FALSE, 0, channel_size, xyb0[2].data(), 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.x, CL_FALSE, 0, channel_size, xyb1[0].data(), 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.y, CL_FALSE, 0, channel_size, xyb1[1].data(), 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.b, CL_FALSE, 0, channel_size, xyb1[2].data(), 0, NULL, NULL); + + + clDiffPrecomputeEx(cl_xyb0, cl_xyb1, xsize, ysize, cl_mask); + + cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_y = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.y, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_x, (*mask_cmp)[0].data(), xsize * ysize); + FLOAT_COMPARE(r_y, (*mask_cmp)[1].data(), xsize * ysize); + FLOAT_COMPARE(r_b, (*mask_cmp)[2].data(), xsize * ysize); + + ocl.releaseMemChannels(cl_xyb0); + ocl.releaseMemChannels(cl_xyb1); + ocl.releaseMemChannels(cl_mask); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, 
channel_size, NULL, NULL); } // ian todo @@ -424,10 +494,21 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset, clReleaseMemObject(r); } -// ian todo -void tclScaleImage(double scale, float *result) +void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length) { + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_result_org = ocl.allocMem(length * sizeof(float)); + clEnqueueWriteBuffer(ocl.commandQueue, mem_result_org, CL_FALSE, 0, length * sizeof(float), result_org, 0, NULL, NULL); + clScaleImageEx(mem_result_org, length, scale); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_r, result_cmp, length); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, length * sizeof(float), NULL, NULL); + clReleaseMemObject(mem_result_org); } // strong todo diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 1ce0466c..4c8b4cb0 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -45,7 +45,8 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const void tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, - float *diffmap); + float *diffmap, size_t org_len, + float *diffmap_cmp); void tclConvolution(size_t xsize, size_t ysize, size_t xstep, @@ -59,14 +60,14 @@ void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); void tclDiffPrecompute( - const float *xyb0_x, const float *xyb0_y, const float *xyb0_b, - const float *xyb1_x, const float *xyb1_y, const float *xyb1_b, + const std::vector > &xyb0, + const std::vector > &xyb1, size_t xsize, size_t ysize, - float *mask_x, float *mask_y, float *mask_b); + std::vector > *mask_cmp); void tclAverage5x5(int xsize, int ysize, float *diffs); -void tclScaleImage(double scale, float 
*result); +void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length); void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize, float* result_r, float* result_g, float* result_b); @@ -74,3 +75,7 @@ void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ys void tclMinSquareVal(float *img, size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + +void tclUpsample(float* image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep, + float* result); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 948fea2e..99b14e31 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -144,6 +144,10 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, downsampled_output[(y / ystep) * dxsize + (x / xstep)]; } } + if (g_checkOpenCL) + { + tclUpsample(downsampled_output.data(), xsize, ysize, xstep, ystep, channel); + } } if (g_checkOpenCL) @@ -1080,6 +1084,11 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, } static void ScaleImage(double scale, std::vector *result) { + std::vector result_org; + if (g_checkOpenCL) + { + result_org = *result; + } PROFILER_FUNC; for (size_t i = 0; i < result->size(); ++i) { (*result)[i] *= static_cast(scale); @@ -1087,7 +1096,7 @@ static void ScaleImage(double scale, std::vector *result) { if (g_checkOpenCL) { - tclScaleImage(scale, (*result).data()); + tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); } } @@ -1096,6 +1105,11 @@ static void ScaleImage(double scale, std::vector *result) { void CalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, std::vector* diffmap) { + std::vector diffmap_org; + if (g_checkOpenCL) + { + diffmap_org = *diffmap; + } PROFILER_FUNC; // Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5 
// pixels distance over the original image. The border of 2 pixels on top and @@ -1150,7 +1164,7 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize, } if (g_checkOpenCL) { - tclCalculateDiffmap(xsize, ysize, step, (*diffmap).data()); + tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); } } @@ -1661,11 +1675,7 @@ void DiffPrecompute( if (g_checkOpenCL) { - tclDiffPrecompute( - xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize, ysize, - ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data()); + tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); } } From 7b9cf14d13bcc944387b2a4acf98c33fdaa093b0 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Thu, 11 May 2017 23:49:35 +0800 Subject: [PATCH 060/189] Add tclAverage55 --- clguetzli/clguetzli.h | 2 ++ clguetzli/clguetzli_test.cpp | 12 +++++++++++- clguetzli/clguetzli_test.h | 2 +- third_party/butteraugli/butteraugli/butteraugli.cc | 8 +++++++- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 13111404..583e37e0 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -63,3 +63,5 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/); + +void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 8965a633..38e3e966 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -463,9 +463,19 @@ void tclDiffPrecompute( } // ian todo -void tclAverage5x5(int xsize, int ysize, float *diffs) +void tclAverage5x5(int xsize, int ysize, std::vector &diffs_org, std::vector &diffs_cmp) { + cl_int err = 0; + 
ocl_args_d_t &ocl = getOcl(); + cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float)); + clEnqueueWriteBuffer(ocl.commandQueue, mem_diff, CL_FALSE, 0, xsize * ysize * sizeof(float), diffs_org.data(), 0, NULL, NULL); + clAverage5x5Ex(mem_diff, xsize, ysize); + cl_float *r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diff, true, CL_MAP_READ, 0, xsize * ysize * sizeof(float), 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + FLOAT_COMPARE(r, diffs_cmp.data(), xsize * ysize); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, xsize * ysize * sizeof(float), NULL, NULL); + clReleaseMemObject(mem_diff); } // chrisk todo diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 4c8b4cb0..226d3d0a 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -65,7 +65,7 @@ void tclDiffPrecompute( size_t xsize, size_t ysize, std::vector > *mask_cmp); -void tclAverage5x5(int xsize, int ysize, float *diffs); +void tclAverage5x5(int xsize, int ysize, std::vector &diffs_org, std::vector &diffs_cmp); void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 99b14e31..fdc1f49a 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1562,6 +1562,12 @@ void MinSquareVal(size_t square_size, size_t offset, // ===== Functions used by Mask only ===== void Average5x5(int xsize, int ysize, std::vector* diffs) { + std::vector diffs_org; + if (g_checkOpenCL) + { + diffs_org = *diffs; + } + PROFILER_FUNC; if (xsize < 4 || ysize < 4) { // TODO: Make this work for small dimensions as well. 
@@ -1615,7 +1621,7 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { if (g_checkOpenCL) { - tclAverage5x5(xsize, ysize, (*diffs).data()); + tclAverage5x5(xsize, ysize, diffs_org, *diffs); } } From 81c43547dcbf4041b31e3450aec7194dee979049 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 12 May 2017 14:01:22 +0800 Subject: [PATCH 061/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E7=BB=93=E6=9E=9C+=E5=A2=9E=E5=8A=A0comparator=E5=AD=90?= =?UTF-8?q?=E7=B1=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clBlock.cpp | 394 ++++++++++++++++++ clguetzli/clBlock.h | 24 ++ clguetzli/clguetzli.cl | 29 +- clguetzli/clguetzli.cpp | 99 +++-- clguetzli/clguetzli.h | 3 +- clguetzli/ocl.h | 4 +- guetzli.vcxproj | 2 + guetzli.vcxproj.filters | 6 + guetzli/butteraugli_comparator.h | 2 +- guetzli/processor.cc | 14 +- .../butteraugli/butteraugli/butteraugli.cc | 3 +- 11 files changed, 508 insertions(+), 72 deletions(-) create mode 100644 clguetzli/clBlock.cpp create mode 100644 clguetzli/clBlock.h diff --git a/clguetzli/clBlock.cpp b/clguetzli/clBlock.cpp new file mode 100644 index 00000000..4650a813 --- /dev/null +++ b/clguetzli/clBlock.cpp @@ -0,0 +1,394 @@ +#include +#include +#include "clBlock.h" +#include "guetzli\idct.h" + + +typedef int16_t coeff_t; + +const double* NewSrgb8ToLinearTable() { + double* table = new double[256]; + int i = 0; + for (; i < 11; ++i) { + table[i] = i / 12.92; + } + for (; i < 256; ++i) { + table[i] = 255.0 * std::pow(((i / 255.0) + 0.055) / 1.055, 2.4); + } + return table; +} + +const double* Srgb8ToLinearTable() { + static const double* const kSrgb8ToLinearTable = NewSrgb8ToLinearTable(); + return kSrgb8ToLinearTable; +} + +static const int kCrToRedTable[256] = { + -179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164, + -163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147, + -146, -144, -143, -142, -140, -139, -137, -136, 
-135, -133, -132, -130, + -129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114, + -112, -111, -109, -108, -107, -105, -104, -102, -101, -100, -98, -97, + -95, -94, -93, -91, -90, -88, -87, -86, -84, -83, -81, -80, + -79, -77, -76, -74, -73, -72, -70, -69, -67, -66, -64, -63, + -62, -60, -59, -57, -56, -55, -53, -52, -50, -49, -48, -46, + -45, -43, -42, -41, -39, -38, -36, -35, -34, -32, -31, -29, + -28, -27, -25, -24, -22, -21, -20, -18, -17, -15, -14, -13, + -11, -10, -8, -7, -6, -4, -3, -1, 0, 1, 3, 4, + 6, 7, 8, 10, 11, 13, 14, 15, 17, 18, 20, 21, + 22, 24, 25, 27, 28, 29, 31, 32, 34, 35, 36, 38, + 39, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 55, + 56, 57, 59, 60, 62, 63, 64, 66, 67, 69, 70, 72, + 73, 74, 76, 77, 79, 80, 81, 83, 84, 86, 87, 88, + 90, 91, 93, 94, 95, 97, 98, 100, 101, 102, 104, 105, + 107, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, + 123, 125, 126, 128, 129, 130, 132, 133, 135, 136, 137, 139, + 140, 142, 143, 144, 146, 147, 149, 150, 151, 153, 154, 156, + 157, 158, 160, 161, 163, 164, 165, 167, 168, 170, 171, 172, + 174, 175, 177, 178 +}; + +static const int kCbToBlueTable[256] = { + -227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207, + -206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186, + -184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165, + -163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144, + -142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122, + -120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101, + -99, -97, -96, -94, -92, -90, -89, -87, -85, -83, -82, -80, + -78, -76, -74, -73, -71, -69, -67, -66, -64, -62, -60, -58, + -57, -55, -53, -51, -50, -48, -46, -44, -43, -41, -39, -37, + -35, -34, -32, -30, -28, -27, -25, -23, -21, -19, -18, -16, + -14, -12, -11, -9, -7, -5, -4, -2, 0, 2, 4, 5, + 7, 9, 11, 12, 14, 16, 18, 19, 21, 23, 25, 27, + 28, 30, 32, 34, 35, 37, 39, 41, 43, 44, 46, 48, + 50, 51, 53, 
55, 57, 58, 60, 62, 64, 66, 67, 69, + 71, 73, 74, 76, 78, 80, 82, 83, 85, 87, 89, 90, + 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, + 113, 115, 117, 119, 120, 122, 124, 126, 128, 129, 131, 133, + 135, 136, 138, 140, 142, 144, 145, 147, 149, 151, 152, 154, + 156, 158, 159, 161, 163, 165, 167, 168, 170, 172, 174, 175, + 177, 179, 181, 183, 184, 186, 188, 190, 191, 193, 195, 197, + 198, 200, 202, 204, 206, 207, 209, 211, 213, 214, 216, 218, + 220, 222, 223, 225, +}; + +static const int kCrToGreenTable[256] = { + 5990656, 5943854, 5897052, 5850250, 5803448, 5756646, 5709844, 5663042, + 5616240, 5569438, 5522636, 5475834, 5429032, 5382230, 5335428, 5288626, + 5241824, 5195022, 5148220, 5101418, 5054616, 5007814, 4961012, 4914210, + 4867408, 4820606, 4773804, 4727002, 4680200, 4633398, 4586596, 4539794, + 4492992, 4446190, 4399388, 4352586, 4305784, 4258982, 4212180, 4165378, + 4118576, 4071774, 4024972, 3978170, 3931368, 3884566, 3837764, 3790962, + 3744160, 3697358, 3650556, 3603754, 3556952, 3510150, 3463348, 3416546, + 3369744, 3322942, 3276140, 3229338, 3182536, 3135734, 3088932, 3042130, + 2995328, 2948526, 2901724, 2854922, 2808120, 2761318, 2714516, 2667714, + 2620912, 2574110, 2527308, 2480506, 2433704, 2386902, 2340100, 2293298, + 2246496, 2199694, 2152892, 2106090, 2059288, 2012486, 1965684, 1918882, + 1872080, 1825278, 1778476, 1731674, 1684872, 1638070, 1591268, 1544466, + 1497664, 1450862, 1404060, 1357258, 1310456, 1263654, 1216852, 1170050, + 1123248, 1076446, 1029644, 982842, 936040, 889238, 842436, 795634, + 748832, 702030, 655228, 608426, 561624, 514822, 468020, 421218, + 374416, 327614, 280812, 234010, 187208, 140406, 93604, 46802, + 0, -46802, -93604, -140406, -187208, -234010, -280812, -327614, + -374416, -421218, -468020, -514822, -561624, -608426, -655228, -702030, + -748832, -795634, -842436, -889238, -936040, -982842, -1029644, -1076446, + -1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862, + -1497664, 
-1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278, + -1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694, + -2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110, + -2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526, + -2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942, + -3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358, + -3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774, + -4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190, + -4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606, + -4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022, + -5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438, + -5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854, +}; + +static const int kCbToGreenTable[256] = { + 2919680, 2897126, 2874572, 2852018, 2829464, 2806910, 2784356, 2761802, + 2739248, 2716694, 2694140, 2671586, 2649032, 2626478, 2603924, 2581370, + 2558816, 2536262, 2513708, 2491154, 2468600, 2446046, 2423492, 2400938, + 2378384, 2355830, 2333276, 2310722, 2288168, 2265614, 2243060, 2220506, + 2197952, 2175398, 2152844, 2130290, 2107736, 2085182, 2062628, 2040074, + 2017520, 1994966, 1972412, 1949858, 1927304, 1904750, 1882196, 1859642, + 1837088, 1814534, 1791980, 1769426, 1746872, 1724318, 1701764, 1679210, + 1656656, 1634102, 1611548, 1588994, 1566440, 1543886, 1521332, 1498778, + 1476224, 1453670, 1431116, 1408562, 1386008, 1363454, 1340900, 1318346, + 1295792, 1273238, 1250684, 1228130, 1205576, 1183022, 1160468, 1137914, + 1115360, 1092806, 1070252, 1047698, 1025144, 1002590, 980036, 957482, + 934928, 912374, 889820, 867266, 844712, 822158, 799604, 777050, + 754496, 731942, 709388, 686834, 664280, 641726, 619172, 596618, + 574064, 551510, 528956, 506402, 
483848, 461294, 438740, 416186, + 393632, 371078, 348524, 325970, 303416, 280862, 258308, 235754, + 213200, 190646, 168092, 145538, 122984, 100430, 77876, 55322, + 32768, 10214, -12340, -34894, -57448, -80002, -102556, -125110, + -147664, -170218, -192772, -215326, -237880, -260434, -282988, -305542, + -328096, -350650, -373204, -395758, -418312, -440866, -463420, -485974, + -508528, -531082, -553636, -576190, -598744, -621298, -643852, -666406, + -688960, -711514, -734068, -756622, -779176, -801730, -824284, -846838, + -869392, -891946, -914500, -937054, -959608, -982162, -1004716, -1027270, + -1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702, + -1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134, + -1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566, + -1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998, + -1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430, + -1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862, + -2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294, + -2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726, + -2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158, + -2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590, +}; + +static const uint8_t kRangeLimitLut[4 * 256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +static const uint8_t* kRangeLimit = kRangeLimitLut + 384; + +void CoeffToIDCT(coeff_t *block, uint8_t *idct) +{ + 
guetzli::ComputeBlockIDCT(block, idct); +} + +void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_) +{ + const int block_x = 0; + const int block_y = 0; + const int width_ = 8; + const int height_ = 8; + + for (int iy = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix) { + int x = 8 * block_x + ix; + int y = 8 * block_y + iy; + if (x >= width_ || y >= height_) continue; + int p = y * width_ + x; + pixels_[p] = idct[8 * iy + ix] << 4; + } + } +} + +// out = [YUVYUV....YUVYUV] +void ImageToYUV(uint16_t *pixels_, uint8_t *out) +{ + const int ymin = 0; + const int xmin = 0; + const int ysize = 8; + const int xsize = 8; + const int width_ = 8; + const int height_ = 8; + const int stride = 3; + + const int yend1 = ymin + ysize; + const int yend0 = std::min(yend1, height_); + int y = ymin; + for (; y < yend0; ++y) { + const int xend1 = xmin + xsize; + const int xend0 = std::min(xend1, width_); + int x = xmin; + int px = y * width_ + xmin; + for (; x < xend0; ++x, ++px, out += stride) { + *out = static_cast((pixels_[px] + 8 - (x & 1)) >> 4); + } + const int offset = -stride; + for (; x < xend1; ++x) { + *out = out[offset]; + out += stride; + } + } + for (; y < yend1; ++y) { + const int offset = -stride * xsize; + for (int x = 0; x < xsize; ++x) { + *out = out[offset]; + out += stride; + } + } +} + +// pixel = [YUVYUV...YUVYUV] to [RGBRGB...RGBRGB] +void YUVToRGB(uint8_t* pixelBlock) +{ + for (int i = 0; i < 64; i++) + { + uint8_t *pixel = &pixelBlock[i*3]; + + int y = pixel[0]; + int cb = pixel[1]; + int cr = pixel[2]; + pixel[0] = kRangeLimit[y + kCrToRedTable[cr]]; + pixel[1] = kRangeLimit[y + ((kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16)]; + pixel[2] = kRangeLimit[y + kCbToBlueTable[cb]]; + } +} + +// block = [R....R][G....G][B.....] 
+void BlockToImage(coeff_t *block, float* r, float* g, float* b) +{ + uint8_t idct[8 * 8 * 3]; + CoeffToIDCT(&block[0], &idct[0]); + CoeffToIDCT(&block[8 * 8], &idct[8 * 8]); + CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]); + + uint16_t pixels[8 * 8 * 3]; + + IDCTToImage(&idct[0], &pixels[0]); + IDCTToImage(&idct[8*8], &pixels[8*8]); + IDCTToImage(&idct[8*8*2], &pixels[8*8*2]); + + uint8_t yuv[8 * 8 * 3]; + + ImageToYUV(&pixels[0], &yuv[0]); + ImageToYUV(&pixels[8*8], &yuv[8*8]); + ImageToYUV(&pixels[8*8*2], &yuv[8*8*2]); + + YUVToRGB(yuv); + + const double* lut = Srgb8ToLinearTable(); + for (int i = 0; i < 8 * 8; i++) + { + r[i] = lut[yuv[3 * i]]; + g[i] = lut[yuv[3 * i + 1]]; + b[i] = lut[yuv[3 * i + 2]]; + } +} + +namespace guetzli +{ + ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats) + : ButteraugliComparator(width, height, rgb, target_distance, stats) + { + + } + + void ButteraugliComparatorEx::StartBlockComparisons() + { + ButteraugliComparator::StartBlockComparisons(); + } + + void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) + { + ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); + } + + double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block) + { + return 0; + int block_x = block_x_ * factor_x_ + off_x; + int block_y = block_y_ * factor_y_ + off_y; + int xmin = 8 * block_x; + int ymin = 8 * block_y; + int block_ix = off_y * factor_x_ + off_x; + const std::vector >& rgb0_c = per_block_pregamma_[block_ix]; + + std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); + img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); + + std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); + BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data()); + + ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); + + 
std::vector > rgb0 = rgb0_c; + std::vector > rgb1 = rgb1_c; + + ::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1); + + double b0[3 * kDCTBlockSize]; + double b1[3 * kDCTBlockSize]; + for (int c = 0; c < 3; ++c) { + for (int ix = 0; ix < kDCTBlockSize; ++ix) { + b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; + b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; + } + } + double diff_xyz_dc[3] = { 0.0 }; + double diff_xyz_ac[3] = { 0.0 }; + double diff_xyz_edge_dc[3] = { 0.0 }; + ::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); + + double scale[3]; + for (int c = 0; c < 3; ++c) { + scale[c] = mask_xyz_[c][ymin * width_ + xmin]; + } + + static const double kEdgeWeight = 0.05; + + double diff = 0.0; + double diff_edge = 0.0; + for (int c = 0; c < 3; ++c) { + diff += diff_xyz_dc[c] * scale[c]; + diff += diff_xyz_ac[c] * scale[c]; + diff_edge += diff_xyz_edge_dc[c] * scale[c]; + } + return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); + } +} diff --git a/clguetzli/clBlock.h b/clguetzli/clBlock.h new file mode 100644 index 00000000..a3c91e71 --- /dev/null +++ b/clguetzli/clBlock.h @@ -0,0 +1,24 @@ +#pragma once +#include +#include "guetzli\butteraugli_comparator.h" + +namespace guetzli { + + class ButteraugliComparatorEx : public ButteraugliComparator + { + public: + ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats); + + void StartBlockComparisons(); + + void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y); + + double CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block); + + protected: + std::vector imgOpsinDynamicsBlockList; + }; + +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index b4e3cdc8..308ef1d3 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1276,39 +1276,46 @@ __kernel void UpsampleSquareRoot(__global 
float *diffmap, int xsize, int ysize, const int res_x = get_global_id(0); const int res_y = get_global_id(1); - if (res_y + 8 - step >= ysize) return; - if (res_x + 8 - step >= xsize) return; + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + const int pos_x = res_x * step; + const int pos_y = res_y * step; + + if (pos_y + 8 - step >= ysize) return; + if (pos_x + 8 - step >= xsize) return; int s2 = (8 - step) / 2; + // Upsample and take square root. - const size_t res_xsize = (xsize + step - 1) / step; - size_t res_ix = (res_y * res_xsize + res_x) / step; - float orig_val = diffmap[res_ix]; + float orig_val = diffmap[res_y * res_xsize + res_x]; + const float kInitialSlope = 100; // TODO(b/29974893): Until that is fixed do not call sqrt on very small // numbers. double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) ? kInitialSlope * orig_val : sqrt(orig_val); + for (size_t off_y = 0; off_y < step; ++off_y) { for (size_t off_x = 0; off_x < step; ++off_x) { - diffmap_out[(res_y + off_y + s2) * xsize + - res_x + off_x + s2] = val; + diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val; } } } -kernel void CalculateDiffmapGetBlurred(__global float *diffmap, int s, int s2, __global float *blurred) +kernel void removeBorder(__global float *in, int in_xsize, int s, int s2, __global float *out) { const int x = get_global_id(0); const int y = get_global_id(1); + const int xsize = get_global_size(0); const int ysize = get_global_size(1); - blurred[y * xsize + x] = diffmap[(y + s2) * xsize + s + x + s2]; + out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; } -kernel void GetDiffmapFromBlurred(__global float *blurred, int s, int s2, __global float *diffmap) +kernel void addBorder(__global float *out, int s, int s2, __global float *in) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -1316,7 +1323,7 @@ kernel void GetDiffmapFromBlurred(__global float *blurred, int s, int s2, __glob const 
int ysize = get_global_size(1); const double mul1 = 24.8235314874; - diffmap[(y + s2) * xsize + x + s2] += (float)(mul1) * blurred[y * (xsize - s) + x]; + out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index b37eabcd..38d31785 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -57,8 +57,8 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err); ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err); ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "UpsampleSquareRoot", &err); - ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED] = clCreateKernel(ocl.program, "CalculateDiffmapGetBlurred", &err); - ocl.kernel[KERNEL_GETDIFFMAPFROMBLURRED] = clCreateKernel(ocl.program, "GetDiffmapFromBlurred", &err); + ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "addBorder", &err); + ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "removeBorder", &err); ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err); ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "edgeDetectorMap", &err); ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "blockDiffMap", &err); @@ -936,7 +936,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step cl_int clysize = ysize; cl_int clstep = step; - cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float)); + cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float)); cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap); @@ -965,23 +965,26 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step { LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } - 
clReleaseMemObject(mem_diffmap); + + clReleaseMemObject(mem_diffmap); } -void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred) +void clRemoveBorderEx(cl_mem in, size_t xsize, size_t ysize, int step, cl_mem out) { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_int cls = s; - cl_int cls2 = s2; - cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap); - clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&blurred); - - size_t globalWorkSize[2] = { xsize, ysize }; + cl_int cls = 8 - step; + cl_int cls2 = (8 - step) / 2; + cl_int clxsize = xsize; + cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), &in); + clSetKernelArg(kernel, 1, sizeof(cl_int), &clxsize); + clSetKernelArg(kernel, 2, sizeof(cl_int), &cls); + clSetKernelArg(kernel, 3, sizeof(cl_int), &cls2); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &out); + + size_t globalWorkSize[2] = { xsize - cls, ysize - cls}; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -994,20 +997,20 @@ void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, } } -void clGetDiffmapFromBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred) +void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_int cls = s; - cl_int cls2 = s2; - cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&blurred); - clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&diffmap); + cl_int cls = 8 - 
step; + cl_int cls2 = (8 - step) / 2; + cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&out); + clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&cls); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&cls2); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in); - size_t globalWorkSize[2] = { xsize, ysize }; + size_t globalWorkSize[2] = { xsize - cls, ysize - cls }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { @@ -1027,29 +1030,35 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, static const double kSigma = 8.8510880283; static const double mul1 = 24.8235314874; static const double scale = 1.0 / (1.0 + mul1); + const int s = 8 - step; int s2 = (8 - step) / 2; ocl_args_d_t &ocl = getOcl(); - cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); - clCalculateDiffmapGetBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred); + cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + clRemoveBorderEx(diffmap, xsize, ysize, step, blurred); static const double border_ratio = 0.03027655136; clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); - clGetDiffmapFromBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred); + + clAddBorderEx(diffmap, xsize, ysize, step, blurred); clScaleImageEx(diffmap, xsize * ysize, scale); - clReleaseMemObject(blurred); + + clReleaseMemObject(blurred); } void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, float* r2, float* g2, float* b2, size_t xsize, size_t ysize, - size_t res_xsize, size_t res_ysize, size_t step, - float* result, size_t result_len) + float* result) { - cl_int channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_int channel_size = xsize * ysize * sizeof(float); + 
cl_int channel_step_size = res_xsize * res_ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -1062,34 +1071,35 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - cl_mem mem_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float)); + cl_mem mem_result = ocl.allocMem(channel_size); const float pattern = 0; clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, result_len, result, 0, NULL, NULL); - - ocl_channels mask = ocl.allocMemChannels(channel_size); - ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, channel_step_size, result, 0, NULL, NULL); - cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); - cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); - cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); + cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map); clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); + { + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + clMaskEx(xyb0, xyb1, 
xsize, ysize, mask, mask_dc); + clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result); - clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc); - - clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } clCalculateDiffmapEx(mem_result, xsize, ysize, step); - cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); memcpy(result, result_r, channel_size); @@ -1103,8 +1113,5 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clReleaseMemObject(block_diff_dc); clReleaseMemObject(block_diff_ac); - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); - clReleaseMemObject(mem_result); } diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 583e37e0..a287c8cc 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -32,9 +32,8 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, float* r2, float* g2, float* b2, size_t xsize, size_t ysize, - size_t res_xsize, size_t res_ysize, size_t step, - float* result, size_t result_len); + float* result); void clCombineChannelsEx( ocl_channels mask, diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index aac82f31..b9ada586 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -57,8 +57,8 @@ enum KernelName { KERNEL_MASKHIGHINTENSITYCHANGE, KERNEL_DIFFPRECOMPUTE, KERNEL_UPSAMPLESQUAREROOT, - KERNEL_CALCULATEDIFFMAPGETBLURRED, - KERNEL_GETDIFFMAPFROMBLURRED, + KERNEL_ADDBORDER, + 
KERNEL_REMOVEBORDER, KERNEL_AVERAGEADDIMAGE, KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 3026fb04..d4388976 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -193,6 +193,7 @@ + @@ -288,6 +289,7 @@ + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index a74b94c9..0f876fd4 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -306,6 +306,9 @@ clguetzli + + clguetzli + @@ -569,6 +572,9 @@ clguetzli + + clguetzli + diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index 098341e3..775b2fd7 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -66,7 +66,7 @@ class ButteraugliComparator : public Comparator { int factor_y, const std::vector& distmap, std::vector* block_weight) override; - private: + protected: const int width_; const int height_; const float target_distance_; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index b6057f5e..1637284d 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -22,6 +22,7 @@ #include #include "guetzli/butteraugli_comparator.h" +#include "clguetzli\clBlock.h" #include "guetzli/comparator.h" #include "guetzli/debug_print.h" #include "guetzli/fast_log.h" @@ -407,12 +408,6 @@ void Processor::ComputeBlockZeroingOrder( std::vector > input_order; func(block, orig_block, comp_mask, params_, input_order); - if (input_order.size() > 10) - { - int i = 0; - i++; - } - coeff_t processed_block[kBlockSize]; memcpy(processed_block, block, sizeof(processed_block)); @@ -439,6 +434,7 @@ void Processor::ComputeBlockZeroingOrder( } float max_err = 0; + for (int iy = 0; iy < factor_y; ++iy) { for (int ix = 0; ix < factor_x; ++ix) { int block_xx = block_x * factor_x + ix; @@ -450,6 +446,8 @@ void Processor::ComputeBlockZeroingOrder( } } + /*max_err = */((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block); + if (max_err < best_err) { // 
TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi best_err = max_err; best_i = i; @@ -928,10 +926,10 @@ bool Process(const Params& params, ProcessStats* stats, if (stats == nullptr) { stats = &dummy_stats; } - std::unique_ptr comparator; + std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { comparator.reset( - new ButteraugliComparator(jpg.width, jpg.height, &rgb, + new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index fdc1f49a..6fd2d281 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1177,11 +1177,10 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( { result.resize(xsize_ * ysize_); clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, res_xsize_, res_ysize_, result.data(), result.size()); + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data()); return; } - if (xsize_ < 8 || ysize_ < 8) return; auto xyb0 = xyb0_arg; { From 389777fbfd7da6647e655536f4fcca627fc46b0b Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 12 May 2017 14:15:45 +0800 Subject: [PATCH 062/189] =?UTF-8?q?fix-mapbuffer=E9=95=BF=E5=BA=A6?= =?UTF-8?q?=E5=92=8C=E9=9C=80=E8=A6=81=E7=9A=84=E4=B8=8D=E7=AC=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 38d31785..5fe8da6d 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1099,7 +1099,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, 
clCalculateDiffmapEx(mem_result, xsize, ysize, step); - cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err); + cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); memcpy(result, result_r, channel_size); From aaddc932966deed60f1423715a11fc1219eef8f4 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 12 May 2017 17:15:31 +0800 Subject: [PATCH 063/189] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20clButteraugliCompa?= =?UTF-8?q?rator=EF=BC=8C=E9=81=BF=E5=85=8D=E5=AF=B9=E7=AC=AC=E4=B8=89?= =?UTF-8?q?=E6=96=B9=E5=BA=93=E4=BB=A3=E7=A0=81=E7=A0=B4=E5=9D=8F=E5=A4=AA?= =?UTF-8?q?=E5=A4=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 283 ++++++++++++++++++ clguetzli/clbutter_comparator.h | 86 ++++++ .../{clBlock.cpp => clguetzli_comparator.cpp} | 2 +- .../{clBlock.h => clguetzli_comparator.h} | 3 +- guetzli.vcxproj | 6 +- guetzli.vcxproj.filters | 10 +- guetzli/butteraugli_comparator.h | 3 +- guetzli/processor.cc | 2 +- .../butteraugli/butteraugli/butteraugli.cc | 231 +------------- .../butteraugli/butteraugli/butteraugli.h | 12 +- 10 files changed, 404 insertions(+), 234 deletions(-) create mode 100644 clguetzli/clbutter_comparator.cpp create mode 100644 clguetzli/clbutter_comparator.h rename clguetzli/{clBlock.cpp => clguetzli_comparator.cpp} (99%) rename clguetzli/{clBlock.h => clguetzli_comparator.h} (91%) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp new file mode 100644 index 00000000..650e4373 --- /dev/null +++ b/clguetzli/clbutter_comparator.cpp @@ -0,0 +1,283 @@ +#include "clbutter_comparator.h" +#include "clguetzli.h" +#include "clguetzli_test.h" + +namespace butteraugli +{ + 
clButteraugliComparator::clButteraugliComparator(size_t xsize, size_t ysize, int step) + : ButteraugliComparator(xsize, ysize, step) + { + + } + + void clButteraugliComparator::DiffmapOpsinDynamicsImage(const std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result) + { + if (g_useOpenCL && xsize_ > 100 && ysize_ > 100) + { + result.resize(xsize_ * ysize_); + clDiffmapOpsinDynamicsImage(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data()); + } + else + { + ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result); + } + } + + void clButteraugliComparator::BlockDiffMap(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac) + { + ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac); + + if (g_checkOpenCL) + { + tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*block_diff_dc).data(), (*block_diff_ac).data()); + } + } + + + void clButteraugliComparator::EdgeDetectorMap(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* edge_detector_map) + { + ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map); + + if (g_checkOpenCL) + { + tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*edge_detector_map).data()); + } + } + + void clButteraugliComparator::EdgeDetectorLowFreq(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_ac) + { + std::vector orign_ac; + if (g_checkOpenCL) + { + orign_ac = *block_diff_ac; + } + + ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); + + if (g_checkOpenCL) + { + tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, 
ysize_, step_, + orign_ac.data(), (*block_diff_ac).data()); + } + } + + void clButteraugliComparator::CombineChannels(const std::vector >& mask_xyb, + const std::vector >& mask_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result) + { + std::vector temp; + if (g_checkOpenCL) + { + temp = *result; + } + + ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); + + if (g_checkOpenCL) + { + tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), + mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), + block_diff_dc.data(), + block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]); + } + } + + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) + { + std::vector img; + if (g_checkOpenCL) + { + img.resize(xsize * ysize); + memcpy(img.data(), values, xsize * ysize * sizeof(float)); + } + + _MinSquareVal(square_size, offset, xsize, ysize, values); + + + if (g_checkOpenCL) + { + tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); + } + } + + void Average5x5(int xsize, int ysize, std::vector* diffs) + { + std::vector diffs_org; + if (g_checkOpenCL) + { + diffs_org = *diffs; + } + + _Average5x5(xsize, ysize, diffs); + + if (g_checkOpenCL) + { + tclAverage5x5(xsize, ysize, diffs_org, *diffs); + } + } + + void DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask) + { + _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + + if (g_checkOpenCL) + { + tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + } + } + + void Mask(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc) + { + _Mask(xyb0, xyb1, xsize, ysize, mask, 
mask_dc); + + if (g_checkOpenCL) + { + tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + } + } + + void CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap) + { + std::vector diffmap_org; + if (g_checkOpenCL) + { + diffmap_org = *diffmap; + } + + _CalculateDiffmap(xsize, ysize, step, diffmap); + + if (g_checkOpenCL) + { + tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); + } + } + + void MaskHighIntensityChange( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1) + { + _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); + + if (g_checkOpenCL) + { + tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), + c1[0].data(), c1[1].data(), c1[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); + } + } + + void ScaleImage(double scale, std::vector *result) + { + std::vector result_org; + if (g_checkOpenCL) + { + result_org = *result; + } + + _ScaleImage(scale, result); + + if (g_checkOpenCL) + { + tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); + } + } + + void Convolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result) + { + _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + + if (g_checkOpenCL) + { + tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + } + } + + void Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio) 
+ { + std::vector orignChannel; + if (g_checkOpenCL) + { + orignChannel.resize(xsize * ysize); + memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); + } + + _Blur(xsize, ysize, channel, sigma, border_ratio); + + if (g_checkOpenCL) + { + tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); + } + } + + void OpsinDynamicsImage(size_t xsize, size_t ysize, + std::vector > &rgb) + { + if (g_useOpenCL && xsize > 100 && ysize > 100) + { + float * r = rgb[0].data(); + float * g = rgb[1].data(); + float * b = rgb[2].data(); + + clOpsinDynamicsImage(xsize, ysize, r, g, b); + } + else + { + std::vector< std::vector> orig_rgb; + if (g_checkOpenCL) + { + orig_rgb = rgb; + } + + _OpsinDynamicsImage(xsize, ysize, rgb); + + if (g_checkOpenCL) + { + tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize, + rgb[0].data(), rgb[1].data(), rgb[2].data()); + } + } + } +} \ No newline at end of file diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h new file mode 100644 index 00000000..eb2e4e32 --- /dev/null +++ b/clguetzli/clbutter_comparator.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include "butteraugli\butteraugli.h" + +#define __restrict__ + +namespace butteraugli { + + class clButteraugliComparator : public ButteraugliComparator + { + public: + clButteraugliComparator(size_t xsize, size_t ysize, int step); + + virtual void DiffmapOpsinDynamicsImage(const std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result); + + virtual void BlockDiffMap(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac); + + + virtual void EdgeDetectorMap(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* edge_detector_map); + + virtual void EdgeDetectorLowFreq(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_ac); + + virtual void CombineChannels(const std::vector >& scale_xyb, + 
const std::vector >& scale_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result); + }; + + void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + void _Average5x5(int xsize, int ysize, std::vector* diffs); + void _DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask); + void _Mask(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc); + void _CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap); + void _OpsinDynamicsImage(size_t xsize, size_t ysize, + std::vector > &rgb); + void _MaskHighIntensityChange( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1); + void _ScaleImage(double scale, std::vector *result); + void _Convolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result); + void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio); + + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + void Average5x5(int xsize, int ysize, std::vector* diffs); + void DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask); + void ScaleImage(double scale, std::vector *result); + void Convolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result); + void Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio); + 
void CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap); +} \ No newline at end of file diff --git a/clguetzli/clBlock.cpp b/clguetzli/clguetzli_comparator.cpp similarity index 99% rename from clguetzli/clBlock.cpp rename to clguetzli/clguetzli_comparator.cpp index 4650a813..ce3a9b64 100644 --- a/clguetzli/clBlock.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -1,6 +1,6 @@ #include #include -#include "clBlock.h" +#include "clguetzli_comparator.h" #include "guetzli\idct.h" diff --git a/clguetzli/clBlock.h b/clguetzli/clguetzli_comparator.h similarity index 91% rename from clguetzli/clBlock.h rename to clguetzli/clguetzli_comparator.h index a3c91e71..778d0532 100644 --- a/clguetzli/clBlock.h +++ b/clguetzli/clguetzli_comparator.h @@ -11,8 +11,9 @@ namespace guetzli { const std::vector* rgb, const float target_distance, ProcessStats* stats); - void StartBlockComparisons(); + //void Compare(const OutputImage& img) override; + void StartBlockComparisons(); void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y); double CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index d4388976..3ae4554f 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -193,8 +193,9 @@ - + + @@ -289,8 +290,9 @@ - + + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 0f876fd4..9b0a7ad0 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -306,7 +306,10 @@ clguetzli - + + clguetzli + + clguetzli @@ -572,7 +575,10 @@ clguetzli - + + clguetzli + + clguetzli diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index 775b2fd7..0136f2bb 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -20,6 +20,7 @@ #include #include "butteraugli/butteraugli.h" +#include "clguetzli\clbutter_comparator.h" #include "guetzli/comparator.h" #include "guetzli/jpeg_data.h" 
#include "guetzli/output_image.h" @@ -78,7 +79,7 @@ class ButteraugliComparator : public Comparator { std::vector> rgb_linear_pregamma_; std::vector> mask_xyz_; std::vector>> per_block_pregamma_; - ::butteraugli::ButteraugliComparator comparator_; + ::butteraugli::clButteraugliComparator comparator_; float distance_; std::vector distmap_; ProcessStats* stats_; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 1637284d..62613d04 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -22,7 +22,7 @@ #include #include "guetzli/butteraugli_comparator.h" -#include "clguetzli\clBlock.h" +#include "clguetzli\clguetzli_comparator.h" #include "guetzli/comparator.h" #include "guetzli/debug_print.h" #include "guetzli/fast_log.h" diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 6fd2d281..73b78a05 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -30,6 +30,7 @@ // * Blur - to hold the smoothing code #include "butteraugli/butteraugli.h" +#include "clguetzli\clbutter_comparator.h" #include #include @@ -62,7 +63,7 @@ inline double DotProduct(const float u[3], const double v[3]) { } // Computes a horizontal convolution and transposes the result. 
-static void Convolution(size_t xsize, size_t ysize, +void _Convolution(size_t xsize, size_t ysize, size_t xstep, size_t len, size_t offset, const float* __restrict__ multipliers, @@ -93,23 +94,10 @@ static void Convolution(size_t xsize, size_t ysize, result[ox * ysize + y] = static_cast(sum * scale); } } - - if (g_checkOpenCL) - { - tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); - } } -void Blur(size_t xsize, size_t ysize, float* channel, double sigma, +void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { - - std::vector orignChannel; - if (g_checkOpenCL) - { - orignChannel.resize(xsize * ysize); - memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); - } - PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); @@ -144,15 +132,6 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, downsampled_output[(y / ystep) * dxsize + (x / xstep)]; } } - if (g_checkOpenCL) - { - tclUpsample(downsampled_output.data(), xsize, ysize, xstep, ystep, channel); - } - } - - if (g_checkOpenCL) - { - tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); } } @@ -798,7 +777,7 @@ ButteraugliComparator::ButteraugliComparator( assert(step <= 4); } -void MaskHighIntensityChange( +void _MaskHighIntensityChange( size_t xsize, size_t ysize, const std::vector > &c0, const std::vector > &c1, @@ -850,15 +829,6 @@ void MaskHighIntensityChange( } } } - - if (g_checkOpenCL) - { - tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), - c1[0].data(), c1[1].data(), c1[2].data(), - xsize, ysize, - xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); - } } double SimpleGamma(double v) { @@ -936,28 +906,6 @@ struct RationalPolynomial { return b1; } -#ifdef ENABLE_OPENCL_CHECK - static double EvaluatePolynomialNonRecursion(const double x, const 
double *coefficients, int n) { - double b1 = 0.0; - double b2 = 0.0; - - for (int i = n - 1; i >= 0; i--) - { - if (i == 0) { - const double x_b1 = x * b1; - b1 = x_b1 - b2 + coefficients[0]; - break; - } - const double x_b1 = x * b1; - const double t = (x_b1 + x_b1) - b2 + coefficients[i]; - b2 = b1; - b1 = t; - } - - return b1; - } -#endif // ENABLE_OPENCL_CHECK - // Evaluates the polynomial at x (in [min_value, max_value]). inline double operator()(const float x) const { // First normalize to [0, 1]. @@ -996,56 +944,13 @@ static inline float GammaPolynomial(float value) { return static_cast(r(value)); } -#ifdef ENABLE_OPENCL_CHECK -static double GammaNonRecursion(double v) { - double min_value = 0.770000000000000; - double max_value = 274.579999999999984; - - double p[5 + 1] = { - 881.979476556478289, 1496.058452015812463, 908.662212739659481, - 373.566100223287378, 85.840860336314364, 6.683258861509244, - }; - double q[5 + 1] = { - 12.262350348616792, 20.557285797683576, 12.161463238367844, - 4.711532733641639, 0.899112889751053, 0.035662329617191, - }; - - // First normalize to [0, 1]. - const double x01 = (v - min_value) / (max_value - min_value); - // And then to [-1, 1] domain of Chebyshev polynomials. 
- const double xc = 2.0 * x01 - 1.0; - - const double yp = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, p, 6); - const double yq = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, q, 6); - if (yq == 0.0) return 0.0; - return static_cast(yp / yq); -} -#endif // ENABLE_OPENCL_CHECK - static inline double Gamma(double v) { // return SimpleGamma(v); return GammaPolynomial(static_cast(v)); } -void OpsinDynamicsImage(size_t xsize, size_t ysize, +void _OpsinDynamicsImage(size_t xsize, size_t ysize, std::vector > &rgb) { - - if (g_useOpenCL && xsize > 100 && ysize > 100) - { - float * r = rgb[0].data(); - float * g = rgb[1].data(); - float * b = rgb[2].data(); - - clOpsinDynamicsImage(xsize, ysize, r, g, b); - return; - } - - std::vector< std::vector> orig_rgb; - if (g_checkOpenCL) - { - orig_rgb = rgb; - } - PROFILER_FUNC; std::vector > blurred = rgb; static const double kSigma = 1.1; @@ -1075,41 +980,20 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, rgb[1][i] = static_cast(y); rgb[2][i] = static_cast(z); } - - if (g_checkOpenCL) - { - tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize, - rgb[0].data(), rgb[1].data(), rgb[2].data()); - } } -static void ScaleImage(double scale, std::vector *result) { - std::vector result_org; - if (g_checkOpenCL) - { - result_org = *result; - } +void _ScaleImage(double scale, std::vector *result) { PROFILER_FUNC; for (size_t i = 0; i < result->size(); ++i) { (*result)[i] *= static_cast(scale); } - - if (g_checkOpenCL) - { - tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); - } } // Making a cluster of local errors to be more impactful than // just a single error. 
-void CalculateDiffmap(const size_t xsize, const size_t ysize, +void _CalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, std::vector* diffmap) { - std::vector diffmap_org; - if (g_checkOpenCL) - { - diffmap_org = *diffmap; - } PROFILER_FUNC; // Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5 // pixels distance over the original image. The border of 2 pixels on top and @@ -1162,10 +1046,6 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize, } ScaleImage(scale, diffmap); } - if (g_checkOpenCL) - { - tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); - } } void ButteraugliComparator::DiffmapOpsinDynamicsImage( @@ -1173,14 +1053,6 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( std::vector> &xyb1, std::vector &result) { - if (g_useOpenCL && xsize_ > 100 && ysize_ > 100) - { - result.resize(xsize_ * ysize_); - clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data()); - return; - } - if (xsize_ < 8 || ysize_ < 8) return; auto xyb0 = xyb0_arg; { @@ -1244,14 +1116,6 @@ void ButteraugliComparator::BlockDiffMap( } } } - - if (g_checkOpenCL) - { - tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - (*block_diff_dc).data(), (*block_diff_ac).data()); - } } void ButteraugliComparator::EdgeDetectorMap( @@ -1284,14 +1148,6 @@ void ButteraugliComparator::EdgeDetectorMap( } } } - - if (g_checkOpenCL) - { - tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - (*edge_detector_map).data()); - } } void ButteraugliComparator::EdgeDetectorLowFreq( @@ -1299,12 +1155,6 @@ void ButteraugliComparator::EdgeDetectorLowFreq( const std::vector > &xyb1, std::vector* block_diff_ac) { - 
std::vector orign_ac; - if (g_checkOpenCL) - { - orign_ac = *block_diff_ac; - } - PROFILER_FUNC; static const double kSigma = 14; static const double kMul = 10; @@ -1355,14 +1205,6 @@ void ButteraugliComparator::EdgeDetectorLowFreq( } } } - - if (g_checkOpenCL) - { - tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, - orign_ac.data(), (*block_diff_ac).data()); - } } void ButteraugliComparator::CombineChannels( @@ -1376,12 +1218,6 @@ void ButteraugliComparator::CombineChannels( PROFILER_FUNC; result->resize(res_xsize_ * res_ysize_); - std::vector temp; - if (g_checkOpenCL) - { - temp = *result; - } - for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) { size_t res_ix = (res_y * res_xsize_ + res_x) / step_; @@ -1397,14 +1233,6 @@ void ButteraugliComparator::CombineChannels( DotProduct(&edge_detector_map[3 * res_ix], mask)); } } - - if (g_checkOpenCL) - { - tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), - mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), - block_diff_dc.data(), - block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]); - } } double ButteraugliScoreFromDiffmap(const std::vector& diffmap) { @@ -1502,20 +1330,13 @@ double MaskDcB(double delta) { return InterpolateClampNegative(lut.data(), lut.size(), delta); } -void MinSquareVal(size_t square_size, size_t offset, +void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { // offset is not negative and smaller than square_size. 
assert(offset < square_size); std::vector tmp(xsize * ysize); - std::vector img; - if (g_checkOpenCL) - { - img.resize(xsize * ysize); - memcpy(img.data(), values, xsize * ysize * sizeof(float)); - } - for (size_t y = 0; y < ysize; ++y) { const size_t minh = offset > y ? 0 : y - offset; const size_t maxh = std::min(ysize, y + square_size - offset); @@ -1552,21 +1373,10 @@ void MinSquareVal(size_t square_size, size_t offset, *pValuePoint = min; pValuePoint += xsize; } } - - if (g_checkOpenCL) - { - tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); - } } // ===== Functions used by Mask only ===== -void Average5x5(int xsize, int ysize, std::vector* diffs) { - std::vector diffs_org; - if (g_checkOpenCL) - { - diffs_org = *diffs; - } - +void _Average5x5(int xsize, int ysize, std::vector* diffs) { PROFILER_FUNC; if (xsize < 4 || ysize < 4) { // TODO: Make this work for small dimensions as well. @@ -1617,14 +1427,9 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { } *diffs = result; ScaleImage(scale, diffs); - - if (g_checkOpenCL) - { - tclAverage5x5(xsize, ysize, diffs_org, *diffs); - } } -void DiffPrecompute( +void _DiffPrecompute( const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, @@ -1677,14 +1482,9 @@ void DiffPrecompute( } } } - - if (g_checkOpenCL) - { - tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); - } } -void Mask(const std::vector > &xyb0, +void _Mask(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask, @@ -1735,15 +1535,6 @@ void Mask(const std::vector > &xyb0, ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); } - - if (g_checkOpenCL) - { - tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize, ysize, - (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), - (*mask_dc)[0].data(), (*mask_dc)[1].data(), 
(*mask_dc)[2].data()); - } } } // namespace butteraugli diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h index eeb91084..637f50ff 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.h +++ b/third_party/butteraugli/butteraugli/butteraugli.h @@ -46,26 +46,26 @@ class ButteraugliComparator { // Computes the butteraugli map between xyb0 and xyb1 and updates result. // Both xyb0 and xyb1 are in opsin-dynamics space. // NOTE: The xyb1 image is mutated by this function in-place. - void DiffmapOpsinDynamicsImage(const std::vector> &xyb0, + virtual void DiffmapOpsinDynamicsImage(const std::vector> &xyb0, std::vector> &xyb1, std::vector &result); - private: - void BlockDiffMap(const std::vector > &rgb0, + protected: + virtual void BlockDiffMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_dc, std::vector* block_diff_ac); - void EdgeDetectorMap(const std::vector > &rgb0, + virtual void EdgeDetectorMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* edge_detector_map); - void EdgeDetectorLowFreq(const std::vector > &rgb0, + virtual void EdgeDetectorLowFreq(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_ac); - void CombineChannels(const std::vector >& scale_xyb, + virtual void CombineChannels(const std::vector >& scale_xyb, const std::vector >& scale_xyb_dc, const std::vector& block_diff_dc, const std::vector& block_diff_ac, From 36905d7e7ccd40c0bef71951887e4b345f39ca25 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 12 May 2017 21:24:47 +0800 Subject: [PATCH 064/189] =?UTF-8?q?=E8=A7=84=E8=8C=83kernel=E5=87=BD?= =?UTF-8?q?=E6=95=B0=E5=90=8D=E4=BB=A5cl=E5=BC=80=E5=A4=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 1613 ++++++++++++++++++++------------------- clguetzli/clguetzli.cpp | 38 +- clguetzli/clguetzli.h | 16 +- 3 files changed, 849 
insertions(+), 818 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 308ef1d3..d44f5c07 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -6,461 +6,702 @@ //#error "Double precision floating point not supported by OpenCL implementation." //#endif -__kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset) +#define kBlockEdge 8 +#define kBlockSize (kBlockEdge * kBlockEdge) +#define kBlockEdgeHalf (kBlockEdge / 2) +#define kBlockHalf (kBlockEdge * kBlockEdgeHalf) + +void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); +double InterpolateClampNegative(__global const double *array, int size, double sx); +void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, + double r1, double g1, double b1, + double factor, double res[3]); +double DotProduct(__global float u[3], double v[3]); +void OpsinAbsorbance(const double in[3], double out[3]); +void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz); +double Gamma(double v); +void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], + __private double xyb1[3 * kBlockSize], + double diff_xyb_dc[3], + double diff_xyb_ac[3], + double diff_xyb_edge_dc[3]); +void Butteraugli8x8CornerEdgeDetectorDiff( + int pos_x, + int pos_y, + int xsize, + int ysize, + __global float *r, __global float *g, __global float* b, + __global float *r2, __global float* g2, __global float *b2, + double* diff_xyb); + +__kernel void clOpsinDynamicsImage( + __global float *r, __global float *g, __global float *b, + __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, + int size) { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int width = get_global_size(0); - const int height = get_global_size(1); + const int i = get_global_id(0); + double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + 
OpsinAbsorbance(pre, pre_mixed); + + double sensitivity[3]; + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + + double cur_rgb[3] = { r[i], g[i], b[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; - int minH = offset > y ? 0 : y - offset; - int maxH = min(y + square_size - offset, height); + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + r[i] = x; + g[i] = y; + b[i] = z; +} - int minW = offset > x ? 0 : x - offset; - int maxW = min(x + square_size - offset, width); +__kernel void clMinSquareVal(__global float* pA, __global float* pC, int square_size, int offset) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int width = get_global_size(0); + const int height = get_global_size(1); - float minValue = pA[minH * width + minW]; + int minH = offset > y ? 0 : y - offset; + int maxH = min(y + square_size - offset, height); - for (int j = minH; j < maxH; j++) - { - for (int i = minW; i < maxW; i++) - { - float tmp = pA[j * width + i]; - if (tmp < minValue) minValue = tmp; - } - } + int minW = offset > x ? 
0 : x - offset; + int maxW = min(x + square_size - offset, width); - pC[y * width + x] = minValue; + float minValue = pA[minH * width + minW]; + + for (int j = minH; j < maxH; j++) + { + for (int i = minW; i < maxW; i++) + { + float tmp = pA[j * width + i]; + if (tmp < minValue) minValue = tmp; + } + } + + pC[y * width + x] = minValue; } -__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result, - int step, int len, int offset, float border_ratio) +__kernel void clConvolutionX(__global float* multipliers, __global float* inp, __global float* result, + int step, int len, int offset, float border_ratio) { - const int x = get_global_id(0); - const int y = get_global_id(1); + const int x = get_global_id(0); + const int y = get_global_id(1); - if (x % step != 0) return; + if (x % step != 0) return; - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - float weight_no_border = 0; - for (int j = 0; j <= 2 * offset; j++) - { - weight_no_border += multipliers[j]; - } + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } - int minx = x < offset ? 0 : x - offset; - int maxx = min(xsize, x + len - offset); + int minx = x < offset ? 
0 : x - offset; + int maxx = min(xsize, x + len - offset); - float weight = 0.0; - for (int j = minx; j < maxx; j++) - { - weight += multipliers[j - x + offset]; - } + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } - weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - float scale = 1.0 / weight; + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; - float sum = 0.0; - for (int j = minx; j < maxx; j++) - { - sum += inp[y * xsize + j] * multipliers[j - x + offset]; - } + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } - result[y * xsize + x] = sum * scale; + result[y * xsize + x] = sum * scale; } -__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result, - int step, int len, int offset, float border_ratio) +__kernel void clConvolutionY(__global float* multipliers, __global float* inp, __global float* result, + int step, int len, int offset, float border_ratio) { - const int x = get_global_id(0); - const int y = get_global_id(1); + const int x = get_global_id(0); + const int y = get_global_id(1); - if (x % step != 0) return; - if (y % step != 0) return; + if (x % step != 0) return; + if (y % step != 0) return; - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - float weight_no_border = 0; - for (int j = 0; j <= 2 * offset; j++) - { - weight_no_border += multipliers[j]; - } + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } - int miny = y < offset ? 0 : y - offset; - int maxy = min(ysize, y + len - offset); + int miny = y < offset ? 
0 : y - offset; + int maxy = min(ysize, y + len - offset); - float weight = 0.0; - for (int j = miny; j < maxy; j++) - { - weight += multipliers[j - y + offset]; - } + float weight = 0.0; + for (int j = miny; j < maxy; j++) + { + weight += multipliers[j - y + offset]; + } - weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - float scale = 1.0 / weight; + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; - float sum = 0.0; - for (int j = miny; j < maxy; j++) - { - sum += inp[j * xsize + x] * multipliers[j - y + offset]; - } + float sum = 0.0; + for (int j = miny; j < maxy; j++) + { + sum += inp[j * xsize + x] * multipliers[j - y + offset]; + } - result[y * xsize + x] = sum * scale; + result[y * xsize + x] = sum * scale; } -__kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result, - int xsize, int xstep, int len, int offset, float border_ratio) +__kernel void clConvolution(__global float* multipliers, __global float* inp, __global float* result, + int xsize, int xstep, int len, int offset, float border_ratio) { - const int ox = get_global_id(0); - const int y = get_global_id(1); + const int ox = get_global_id(0); + const int y = get_global_id(1); - const int oxsize = get_global_size(0); - const int ysize = get_global_size(1); + const int oxsize = get_global_size(0); + const int ysize = get_global_size(1); - const int x = ox * xstep; + const int x = ox * xstep; - float weight_no_border = 0; - for (int j = 0; j <= 2 * offset; j++) - { - weight_no_border += multipliers[j]; - } + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } - int minx = x < offset ? 0 : x - offset; - int maxx = min(xsize, x + len - offset); + int minx = x < offset ? 
0 : x - offset; + int maxx = min(xsize, x + len - offset); - float weight = 0.0; - for (int j = minx; j < maxx; j++) - { - weight += multipliers[j - x + offset]; - } + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } - weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - float scale = 1.0 / weight; + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; - float sum = 0.0; - for (int j = minx; j < maxx; j++) - { - sum += inp[y * xsize + j] * multipliers[j - x + offset]; - } + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } - result[ox * ysize + y] = sum * scale; + result[ox * ysize + y] = sum * scale; } -__kernel void SquareSample(__global float* pA, __global float* pC, int xstep, int ystep) +__kernel void clSquareSample(__global float* pA, __global float* pC, int xstep, int ystep) { - const int x = get_global_id(0); - const int y = get_global_id(1); + const int x = get_global_id(0); + const int y = get_global_id(1); - int x_sample = x - x % xstep; - int y_sample = y - y % ystep; + int x_sample = x - x % xstep; + int y_sample = y - y % ystep; - if (x_sample == x && y_sample == y) return; + if (x_sample == x && y_sample == y) return; - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - pC[y * xsize + x] = pA[y_sample * xsize + x_sample]; + pC[y * xsize + x] = pA[y_sample * xsize + x_sample]; } -__kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep) +__kernel void clDownSample(__global float* pA, __global float* pC, int xstep, int ystep) { - const int x = get_global_id(0); - const int y = get_global_id(1); + const int x = get_global_id(0); + const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = 
get_global_size(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - const int oxsize = (xsize + xstep - 1) / xstep; + const int oxsize = (xsize + xstep - 1) / xstep; - const int sample_x = x / xstep; - const int sample_y = y / ystep; + const int sample_x = x / xstep; + const int sample_y = y / ystep; - pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; + pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; } -__constant float g_mix[12] = { - 0.348036746003, - 0.577814843137, - 0.0544556093735, - 0.774145581713, - 0.26922717275, - 0.767247733938, - 0.0366922708552, - 0.920130265014, - 0.0882062883536, - 0.158581714673, - 0.712857943858, - 10.6524069248, -}; - -void OpsinAbsorbance(const double in[3], double out[3]) +__kernel void clScaleImage(double scale, __global float *result) { - out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3]; - out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7]; - out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; + const int i = get_global_id(0); + result[i] *= scale; } -double EvaluatePolynomial(const double x, __constant const double *coefficients, int n) +kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out) { - double b1 = 0.0; - double b2 = 0.0; - - for (int i = n - 1; i >= 0; i--) - { - if (i == 0) { - const double x_b1 = x * b1; - b1 = x_b1 - b2 + coefficients[0]; - break; - } - const double x_b1 = x * b1; - const double t = (x_b1 + x_b1) - b2 + coefficients[i]; - b2 = b1; - b1 = t; - } + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - return b1; + out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; } +kernel void clAddBorder(__global float *out, int s, int s2, __global float *in) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = 
get_global_size(0); + const int ysize = get_global_size(1); -__constant double g_gamma_p[5 + 1] = { - 881.979476556478289, 1496.058452015812463, 908.662212739659481, - 373.566100223287378, 85.840860336314364, 6.683258861509244, -}; -__constant double g_gamma_q[5 + 1] = { - 12.262350348616792, 20.557285797683576, 12.161463238367844, - 4.711532733641639, 0.899112889751053, 0.035662329617191, -}; + const double mul1 = 24.8235314874; + out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; -double Gamma(double v) -{ - const double min_value = 0.770000000000000; - const double max_value = 274.579999999999984; - const double x01 = (v - min_value) / (max_value - min_value); - const double xc = 2.0 * x01 - 1.0; - - const double yp = EvaluatePolynomial(xc, g_gamma_p, 6); - const double yq = EvaluatePolynomial(xc, g_gamma_q, 6); - if (yq == 0.0) return 0.0; - return (float)(yp / yq); } -void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) +__kernel void clCombineChannels( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, + __global float *block_diff_dc, + __global float *block_diff_ac, + __global float *edge_detector_map, + int xsize, int ysize, + int res_xsize, + int step, + __global float *result) { - const double a0 = 1.01611726948; - const double a1 = 0.982482243696; - const double a2 = 1.43571362627; - const double a3 = 0.896039849412; - *valx = a0 * r - a1 * g; - *valy = a2 * r + a3 * g; - *valz = b; + const int res_x = get_global_id(0) * step; + const int res_y = get_global_id(1) * step; + + double mask[3]; + double dc_mask[3]; + mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)]; + + mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; + + mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 
3)]; + dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; + + size_t res_ix = (res_y * res_xsize + res_x) / step; + result[res_ix] = (float)( + DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + + DotProduct(&block_diff_ac[3 * res_ix], mask) + + DotProduct(&edge_detector_map[3 * res_ix], mask)); } -__kernel void OpsinDynamicsImage( - __global float *r, __global float *g, __global float *b, - __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, - int size) +__kernel void clDiffPrecompute( + __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global float *mask_x, __global float *mask_y, __global float *mask_b) { - const int i = get_global_id(0); - double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; - double pre_mixed[3]; - OpsinAbsorbance(pre, pre_mixed); - - double sensitivity[3]; - sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; - sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; - sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; - - double cur_rgb[3] = { r[i], g[i], b[i] }; - double cur_mixed[3]; - OpsinAbsorbance(cur_rgb, cur_mixed); - cur_mixed[0] *= sensitivity[0]; - cur_mixed[1] *= sensitivity[1]; - cur_mixed[2] *= sensitivity[2]; + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + double valsh0[3] = { 0.0 }; + double valsv0[3] = { 0.0 }; + double valsh1[3] = { 0.0 }; + double valsv1[3] = { 0.0 }; + int ix2; + + int ix = x + xsize * y; + if (x + 1 < xsize) { + ix2 = ix + 1; + } + else { + ix2 = ix - 1; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); + double x1 = (xyb1_x[ix] - xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - 
xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); + } + if (y + 1 < ysize) { + ix2 = ix + xsize; + } + else { + ix2 = ix - xsize; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); + double x1 = (xyb1_x[ix] - xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); + } - double x, y, z; - RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); - r[i] = x; - g[i] = y; - b[i] = z; + double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]); + double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]); + double m = min(sup0, sup1); + mask_x[ix] = (float)(m); + + sup0 = fabs(valsh0[1]) + fabs(valsv0[1]); + sup1 = fabs(valsh1[1]) + fabs(valsv1[1]); + m = min(sup0, sup1); + mask_y[ix] = (float)(m); + + sup0 = fabs(valsh0[2]) + fabs(valsv0[2]); + sup1 = fabs(valsh1[2]) + fabs(valsv1[2]); + m = min(sup0, sup1); + mask_b[ix] = (float)(m); } +__kernel void clEdgeDetectorMap(__global float *result, + __global float *r, __global float *g, __global float* b, + __global float *r2, __global float* g2, __global float *b2, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); -double InterpolateClampNegative(__global const double *array, - int size, double sx) { - if (sx < 0) { - sx = 0; - } - double ix = fabs(sx); - int baseix = (int)(ix); - double res; - if (baseix >= size - 1) { - res = array[size - 1]; - } - else { - double mix = ix - baseix; - int nextix = baseix + 1; - res = array[baseix] + mix * (array[nextix] - array[baseix]); - } - return res; + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + int pos_x = res_x * step; + int pos_y = res_y * step; + + if (pos_x >= xsize - (8 - step)) return; + if (pos_y >= ysize - (8 - 
step)) return; + + pos_x = min(pos_x, xsize - 8); + pos_y = min(pos_y, ysize - 8); + + double diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize, + r, g, b, + r2, g2, b2, + &diff_xyb[0]); + + int idx = (res_y * res_xsize + res_x) * 3; + result[idx] = diff_xyb[0]; + result[idx + 1] = diff_xyb[1]; + result[idx + 2] = diff_xyb[2]; } -__kernel void DoMask( - __global float *mask_x, __global float *mask_y, __global float *mask_b, - __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, - __global double *lut_x, __global double *lut_y, __global double *lut_b, - __global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b) +__kernel void clEdgeDetectorLowFreq(__global float *result, + __global float *r, __global float *g, __global float* b, + __global float *r2, __global float* g2, __global float *b2, + int xsize, int ysize, int step) { - const double w00 = 232.206464018; - const double w11 = 22.9455222245; - const double w22 = 503.962310606; - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - const size_t idx = y * xsize + x; - const double s0 = mask_x[idx]; - const double s1 = mask_y[idx]; - const double s2 = mask_b[idx]; - const double p0 = w00 * s0; - const double p1 = w11 * s1; - const double p2 = w22 * s2; - - mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0)); - mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1)); - mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2)); - mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0)); - mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1)); - mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2)); + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x < 8 / step) return; + + const int res_xsize = get_global_size(0); + const 
int res_ysize = get_global_size(1); + + int pos_x = (res_x - (8 / step)) * step; + int pos_y = res_y * step; + + if (pos_x + 8 >= xsize) return; + if (pos_y + 8 >= ysize) return; + + int ix = pos_y * xsize + pos_x; + + double diff[4][3]; + __global float* blurred0[3] = { r, g, b }; + __global float* blurred1[3] = { r2, g2, b2 }; + + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize + 6; + diff[2][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize - 6; + diff[3][i] = pos_x < 8 ? 0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + double max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + double diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]); + } + } + + int res_ix = res_y * res_xsize + res_x; + + const double kMul = 10; + result[res_ix * 3] += max_diff_xyb[0] * kMul; + result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul; + result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; } -__kernel void ScaleImage(double scale, __global float *result) +__kernel void clDoMask( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, + __global double *lut_x, __global double *lut_y, __global double *lut_b, + __global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b) { - const int i = get_global_id(0); - result[i] *= scale; + const double w00 = 232.206464018; + const double w11 = 22.9455222245; + const double w22 = 503.962310606; + + const int x = get_global_id(0); + 
const int y = get_global_id(1); + + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + const size_t idx = y * xsize + x; + const double s0 = mask_x[idx]; + const double s1 = mask_y[idx]; + const double s2 = mask_b[idx]; + const double p0 = w00 * s0; + const double p1 = w11 * s1; + const double p2 = w22 * s2; + + mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0)); + mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1)); + mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2)); + mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0)); + mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1)); + mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2)); + } -double DotProduct(__global float u[3], double v[3]) { - return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +__kernel void clBlockDiffMap(__global float* r, __global float* g, __global float* b, + __global float* r2, __global float* g2, __global float* b2, + __global float* block_diff_dc, __global float* block_diff_ac, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + int pos_x = res_x * step; + int pos_y = res_y * step; + + if ((pos_x + kBlockEdge - step - 1) >= xsize) return; + if ((pos_y + kBlockEdge - step - 1) >= ysize) return; + + size_t res_ix = res_y * res_xsize + res_x; + size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8); + + double block0[3 * kBlockEdge * kBlockEdge]; + double block1[3 * kBlockEdge * kBlockEdge]; + + double *block0_r = &block0[0]; + double *block0_g = &block0[kBlockEdge * kBlockEdge]; + double *block0_b = &block0[2 * kBlockEdge * kBlockEdge]; + + double *block1_r = &block1[0]; + double *block1_g = &block1[kBlockEdge * kBlockEdge]; + double *block1_b = &block1[2 * kBlockEdge * kBlockEdge]; + + for (int y = 0; y < 
kBlockEdge; y++) + { + for (int x = 0; x < kBlockEdge; x++) + { + block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x]; + block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x]; + block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x]; + block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x]; + block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x]; + block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x]; + } + } + + double diff_xyb_dc[3] = { 0.0 }; + double diff_xyb_ac[3] = { 0.0 }; + double diff_xyb_edge_dc[3] = { 0.0 }; + + ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + + for (int i = 0; i < 3; i++) + { + block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i]; + block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i]; + } } -__kernel void CombineChannels( - __global float *mask_x, __global float *mask_y, __global float *mask_b, - __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, - __global float *block_diff_dc, - __global float *block_diff_ac, - __global float *edge_detector_map, - int xsize, int ysize, - int res_xsize, - int step, - __global float *result) +__kernel void clMaskHighIntensityChange( + __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global float *c0_x, __global float *c0_y, __global float *c0_b, + __global float *c1_x, __global float *c1_y, __global float *c1_b +) { - const int res_x = get_global_id(0) * step; - const int res_y = get_global_id(1) * step; - - double mask[3]; - double dc_mask[3]; - mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; - dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)]; - - mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; - dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; - - mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; - dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; - - size_t res_ix = (res_y * res_xsize + 
res_x) / step; - result[res_ix] = (float)( - DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + - DotProduct(&block_diff_ac[3 * res_ix], mask) + - DotProduct(&edge_detector_map[3 * res_ix], mask)); - //result[res_ix] = 1; + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5, + (c0_y[ix] + c1_y[ix]) * 0.5, + (c0_b[ix] + c1_b[ix]) * 0.5, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. 
+ xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); } -inline double Interpolate(__constant double *array, int size, double sx) { - double ix = fabs(sx); +__kernel void clUpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); - int baseix = (int)(ix); - double res; - if (baseix >= size - 1) { - res = array[size - 1]; - } - else { - double mix = ix - baseix; - int nextix = baseix + 1; - res = array[baseix] + mix * (array[nextix] - array[baseix]); - } - if (sx < 0) res = -res; - return res; -} + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); -#define XybLowFreqToVals_inc 5.2511644570349185 -__constant double XybLowFreqToVals_lut[21] = { - 0, - 1 * XybLowFreqToVals_inc, - 2 * XybLowFreqToVals_inc, - 3 * XybLowFreqToVals_inc, - 4 * XybLowFreqToVals_inc, - 5 * XybLowFreqToVals_inc, - 6 * XybLowFreqToVals_inc, - 7 * XybLowFreqToVals_inc, - 8 * XybLowFreqToVals_inc, - 9 * XybLowFreqToVals_inc, - 10 * XybLowFreqToVals_inc, - 11 * XybLowFreqToVals_inc, - 12 * XybLowFreqToVals_inc, - 13 * XybLowFreqToVals_inc, - 14 * XybLowFreqToVals_inc, - 15 * XybLowFreqToVals_inc, - 16 * XybLowFreqToVals_inc, - 17 * XybLowFreqToVals_inc, - 18 * XybLowFreqToVals_inc, - 19 * XybLowFreqToVals_inc, - 20 * XybLowFreqToVals_inc, -}; + const int pos_x = res_x * step; + const int pos_y = res_y * step; -void XybLowFreqToVals(double x, double y, double z, - double *valx, double *valy, double *valz) { - const double xmul = 6.64482198135; - const double ymul = 0.837846224276; - const 
double zmul = 7.34905756986; - const double y_to_z_mul = 0.0812519812628; - - z += y_to_z_mul * y; - *valz = z * zmul; - *valx = x * xmul; - *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul); + if (pos_y + 8 - step >= ysize) return; + if (pos_x + 8 - step >= xsize) return; + + int s2 = (8 - step) / 2; + + // Upsample and take square root. + float orig_val = diffmap[res_y * res_xsize + res_x]; + + const float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? kInitialSlope * orig_val + : sqrt(orig_val); + + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val; + } + } } -void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, - double r1, double g1, double b1, - double factor, double res[3]) { - double valx0, valy0, valz0; - double valx1, valy1, valz1; - XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0); - if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { - //PROFILER_ZONE("XybDiff r1=g1=b1=0"); - res[0] += factor * valx0 * valx0; - res[1] += factor * valy0 * valy0; - res[2] += factor * valz0 * valz0; - return; - } - XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1); - // Approximate the distance of the colors by their respective distances - // to gray. 
- double valx = valx0 - valx1; - double valy = valy0 - valy1; - double valz = valz0 - valz1; - res[0] += factor * valx * valx; - res[1] += factor * valy * valy; - res[2] += factor * valz * valz; +__kernel void clAverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); + + const int row0 = y * xsize; + if (x == 0) // excute once per y + { + img[row0 + 1] += tmp0[row0]; + img[row0 + 0] += tmp0[row0 + 1]; + img[row0 + 2] += tmp0[row0 + 1]; + + img[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; + img[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; + img[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; + + if (y > 0) { + const int rowd1 = row0 - xsize; + img[rowd1 + 1] += tmp1[row0]; + img[rowd1 + 0] += tmp0[row0]; + + img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; + img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + img[rowu1 + 1] += tmp1[row0]; + img[rowu1 + 0] += tmp0[row0]; + + img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; + img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + } + + if (x >= 2 && x < xsize - 2) + { + img[row0 + x - 1] += tmp0[row0 + x]; + img[row0 + x + 1] += tmp0[row0 + x]; + } + + if (x >= 1 && x < xsize - 1) { + if (y > 0) { + const int rowd1 = row0 - xsize; + img[rowd1 + x + 1] += tmp1[row0 + x]; + img[rowd1 + x + 0] += tmp0[row0 + x]; + img[rowd1 + x - 1] += tmp1[row0 + x]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + img[rowu1 + x + 1] += tmp1[row0 + x]; + img[rowu1 + x + 0] += tmp0[row0 + x]; + img[rowu1 + x - 1] += tmp1[row0 + x]; + } + } } + + + + + void Butteraugli8x8CornerEdgeDetectorDiff( int pos_x, int pos_y, @@ -517,105 +758,176 @@ void Butteraugli8x8CornerEdgeDetectorDiff( } } -__kernel void edgeDetectorMap(__global float *result, - __global float *r, __global float *g, __global 
float* b, - __global float *r2, __global float* g2, __global float *b2, - int xsize, int ysize, int step) -{ - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); - int pos_x = res_x * step; - int pos_y = res_y * step; +double DotProduct(__global float u[3], double v[3]) { + return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +} - if (pos_x >= xsize - (8 - step)) return; - if (pos_y >= ysize - (8 - step)) return; +double Interpolate(__constant double *array, int size, double sx) { + double ix = fabs(sx); - pos_x = min(pos_x, xsize - 8); - pos_y = min(pos_y, ysize - 8); + int baseix = (int)(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + if (sx < 0) res = -res; + return res; +} - double diff_xyb[3] = { 0.0 }; - Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize, - r, g, b, - r2, g2, b2, - &diff_xyb[0]); +#define XybToVals_off_x 11.38708334481672 +#define XybToVals_inc_x 14.550189611520716 +__constant double XybToVals_lut_x[21] = { + 0, + XybToVals_off_x, + XybToVals_off_x + 1 * XybToVals_inc_x, + XybToVals_off_x + 2 * XybToVals_inc_x, + XybToVals_off_x + 3 * XybToVals_inc_x, + XybToVals_off_x + 4 * XybToVals_inc_x, + XybToVals_off_x + 5 * XybToVals_inc_x, + XybToVals_off_x + 6 * XybToVals_inc_x, + XybToVals_off_x + 7 * XybToVals_inc_x, + XybToVals_off_x + 8 * XybToVals_inc_x, + XybToVals_off_x + 9 * XybToVals_inc_x, + XybToVals_off_x + 10 * XybToVals_inc_x, + XybToVals_off_x + 11 * XybToVals_inc_x, + XybToVals_off_x + 12 * XybToVals_inc_x, + XybToVals_off_x + 13 * XybToVals_inc_x, + XybToVals_off_x + 14 * XybToVals_inc_x, + XybToVals_off_x + 15 * XybToVals_inc_x, + XybToVals_off_x + 16 * XybToVals_inc_x, + XybToVals_off_x + 17 * XybToVals_inc_x, + XybToVals_off_x + 18 * XybToVals_inc_x, + 
XybToVals_off_x + 19 * XybToVals_inc_x, +}; - int idx = (res_y * res_xsize + res_x) * 3; - result[idx] = diff_xyb[0]; - result[idx + 1] = diff_xyb[1]; - result[idx + 2] = diff_xyb[2]; -} +#define XybToVals_off_y 1.4103373714040413 +#define XybToVals_inc_y 0.7084088867024 +__constant double XybToVals_lut_y[21] = { + 0, + XybToVals_off_y, + XybToVals_off_y + 1 * XybToVals_inc_y, + XybToVals_off_y + 2 * XybToVals_inc_y, + XybToVals_off_y + 3 * XybToVals_inc_y, + XybToVals_off_y + 4 * XybToVals_inc_y, + XybToVals_off_y + 5 * XybToVals_inc_y, + XybToVals_off_y + 6 * XybToVals_inc_y, + XybToVals_off_y + 7 * XybToVals_inc_y, + XybToVals_off_y + 8 * XybToVals_inc_y, + XybToVals_off_y + 9 * XybToVals_inc_y, + XybToVals_off_y + 10 * XybToVals_inc_y, + XybToVals_off_y + 11 * XybToVals_inc_y, + XybToVals_off_y + 12 * XybToVals_inc_y, + XybToVals_off_y + 13 * XybToVals_inc_y, + XybToVals_off_y + 14 * XybToVals_inc_y, + XybToVals_off_y + 15 * XybToVals_inc_y, + XybToVals_off_y + 16 * XybToVals_inc_y, + XybToVals_off_y + 17 * XybToVals_inc_y, + XybToVals_off_y + 18 * XybToVals_inc_y, + XybToVals_off_y + 19 * XybToVals_inc_y, +}; -__kernel void edgeDetectorLowFreq(__global float *result, - __global float *r, __global float *g, __global float* b, - __global float *r2, __global float* g2, __global float *b2, - int xsize, int ysize, int step) +void XybToVals( + double x, double y, double z, + double *valx, double *valy, double *valz) { - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); - - if (res_x < 8 / step) return; - - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); - - int pos_x = (res_x - (8 / step)) * step; - int pos_y = res_y * step; - - if (pos_x + 8 >= xsize) return; - if (pos_y + 8 >= ysize) return; - - int ix = pos_y * xsize + pos_x; - - double diff[4][3]; - __global float* blurred0[3] = { r, g, b }; - __global float* blurred1[3] = { r2, g2, b2 }; - - for (int i = 0; i < 3; ++i) { - int ix2 = ix + 8; - 
diff[0][i] = - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - ix2 = ix + 8 * xsize; - diff[1][i] = - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - ix2 = ix + 6 * xsize + 6; - diff[2][i] = - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - ix2 = ix + 6 * xsize - 6; - diff[3][i] = pos_x < 8 ? 0 : - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - } - double max_diff_xyb[3] = { 0 }; - for (int k = 0; k < 4; ++k) { - double diff_xyb[3] = { 0 }; - XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2], - 0, 0, 0, 1.0, - diff_xyb); - for (int i = 0; i < 3; ++i) { - max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]); - } - } + const double xmul = 0.758304045695; + const double ymul = 2.28148649801; + const double zmul = 1.87816926918; + + *valx = Interpolate(&XybToVals_lut_x[0], 21, x * xmul); + *valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul); + *valz = zmul * z; +} - int res_ix = res_y * res_xsize + res_x; - const double kMul = 10; +#define XybLowFreqToVals_inc 5.2511644570349185 +__constant double XybLowFreqToVals_lut[21] = { + 0, + 1 * XybLowFreqToVals_inc, + 2 * XybLowFreqToVals_inc, + 3 * XybLowFreqToVals_inc, + 4 * XybLowFreqToVals_inc, + 5 * XybLowFreqToVals_inc, + 6 * XybLowFreqToVals_inc, + 7 * XybLowFreqToVals_inc, + 8 * XybLowFreqToVals_inc, + 9 * XybLowFreqToVals_inc, + 10 * XybLowFreqToVals_inc, + 11 * XybLowFreqToVals_inc, + 12 * XybLowFreqToVals_inc, + 13 * XybLowFreqToVals_inc, + 14 * XybLowFreqToVals_inc, + 15 * XybLowFreqToVals_inc, + 16 * XybLowFreqToVals_inc, + 17 * XybLowFreqToVals_inc, + 18 * XybLowFreqToVals_inc, + 19 * XybLowFreqToVals_inc, + 20 * XybLowFreqToVals_inc, +}; - result[res_ix * 3] += max_diff_xyb[0] * kMul; - result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul; - result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; +void XybLowFreqToVals(double x, double y, double z, + double *valx, 
double *valy, double *valz) { + const double xmul = 6.64482198135; + const double ymul = 0.837846224276; + const double zmul = 7.34905756986; + const double y_to_z_mul = 0.0812519812628; + + z += y_to_z_mul * y; + *valz = z * zmul; + *valx = x * xmul; + *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul); } -#define kBlockEdge 8 -#define kBlockSize (kBlockEdge * kBlockEdge) -#define kBlockEdgeHalf (kBlockEdge / 2) -#define kBlockHalf (kBlockEdge * kBlockEdgeHalf) + +double InterpolateClampNegative(__global const double *array, + int size, double sx) { + if (sx < 0) { + sx = 0; + } + double ix = fabs(sx); + int baseix = (int)(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + return res; +} + +void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, + double r1, double g1, double b1, + double factor, double res[3]) { + double valx0, valy0, valz0; + double valx1, valy1, valz1; + XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0); + if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { + //PROFILER_ZONE("XybDiff r1=g1=b1=0"); + res[0] += factor * valx0 * valx0; + res[1] += factor * valy0 * valy0; + res[2] += factor * valz0 * valz0; + return; + } + XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1); + // Approximate the distance of the colors by their respective distances + // to gray. 
+ double valx = valx0 - valx1; + double valy = valy0 - valy1; + double valz = valz0 - valz1; + res[0] += factor * valx * valx; + res[1] += factor * valy * valy; + res[2] += factor * valz * valz; +} typedef struct __Complex { @@ -1026,361 +1338,80 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], } } -__kernel void blockDiffMap(__global float* r, __global float* g, __global float* b, - __global float* r2, __global float* g2, __global float* b2, - __global float* block_diff_dc, __global float* block_diff_ac, - int xsize, int ysize, int step) -{ - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); - - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); - - int pos_x = res_x * step; - int pos_y = res_y * step; - - if ((pos_x + kBlockEdge - step - 1) >= xsize) return; - if ((pos_y + kBlockEdge - step - 1) >= ysize) return; - - size_t res_ix = res_y * res_xsize + res_x; - size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8); - - double block0[3 * kBlockEdge * kBlockEdge]; - double block1[3 * kBlockEdge * kBlockEdge]; - - double *block0_r = &block0[0]; - double *block0_g = &block0[kBlockEdge * kBlockEdge]; - double *block0_b = &block0[2 * kBlockEdge * kBlockEdge]; - - double *block1_r = &block1[0]; - double *block1_g = &block1[kBlockEdge * kBlockEdge]; - double *block1_b = &block1[2 * kBlockEdge * kBlockEdge]; - - for (int y = 0; y < kBlockEdge; y++) - { - for (int x = 0; x < kBlockEdge; x++) - { - block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x]; - block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x]; - block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x]; - block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x]; - block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x]; - block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x]; - } - } - - double diff_xyb_dc[3] = { 0.0 }; - double diff_xyb_ac[3] = { 0.0 }; - double diff_xyb_edge_dc[3] = { 0.0 }; - 
ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); - - for (int i = 0; i < 3; i++) - { - block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i]; - block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i]; - } -} -__kernel void MaskHighIntensityChange( - __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, - __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, - __global float *c0_x, __global float *c0_y, __global float *c0_b, - __global float *c1_x, __global float *c1_y, __global float *c1_b - ) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - size_t ix = y * xsize + x; - const double ave[3] = { - (c0_x[ix] + c1_x[ix]) * 0.5, - (c0_y[ix] + c1_y[ix]) * 0.5, - (c0_b[ix] + c1_b[ix]) * 0.5, - }; - double sqr_max_diff = -1; - { - int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; - int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; - for (int dir = 0; dir < 4; ++dir) { - if (border[dir]) { - continue; - } - const int ix2 = ix + offset[dir]; - double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; - diff *= diff; - if (sqr_max_diff < diff) { - sqr_max_diff = diff; - } - } - } - const double kReductionX = 275.19165240059317; - const double kReductionY = 18599.41286306991; - const double kReductionZ = 410.8995306951065; - const double kChromaBalance = 106.95800948271017; - double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); - - const double mix[3] = { - chroma_scale * kReductionX / (sqr_max_diff + kReductionX), - kReductionY / (sqr_max_diff + kReductionY), - chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), - }; - // Interpolate lineraly between the average color and the actual - // color -- to reduce the importance of this pixel. 
- xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); - xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); - - xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); - xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); - - xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); - xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); -} - - -#define XybToVals_off_x 11.38708334481672 -#define XybToVals_inc_x 14.550189611520716 -__constant double XybToVals_lut_x[21] = { - 0, - XybToVals_off_x, - XybToVals_off_x + 1 * XybToVals_inc_x, - XybToVals_off_x + 2 * XybToVals_inc_x, - XybToVals_off_x + 3 * XybToVals_inc_x, - XybToVals_off_x + 4 * XybToVals_inc_x, - XybToVals_off_x + 5 * XybToVals_inc_x, - XybToVals_off_x + 6 * XybToVals_inc_x, - XybToVals_off_x + 7 * XybToVals_inc_x, - XybToVals_off_x + 8 * XybToVals_inc_x, - XybToVals_off_x + 9 * XybToVals_inc_x, - XybToVals_off_x + 10 * XybToVals_inc_x, - XybToVals_off_x + 11 * XybToVals_inc_x, - XybToVals_off_x + 12 * XybToVals_inc_x, - XybToVals_off_x + 13 * XybToVals_inc_x, - XybToVals_off_x + 14 * XybToVals_inc_x, - XybToVals_off_x + 15 * XybToVals_inc_x, - XybToVals_off_x + 16 * XybToVals_inc_x, - XybToVals_off_x + 17 * XybToVals_inc_x, - XybToVals_off_x + 18 * XybToVals_inc_x, - XybToVals_off_x + 19 * XybToVals_inc_x, -}; - -#define XybToVals_off_y 1.4103373714040413 -#define XybToVals_inc_y 0.7084088867024 -__constant double XybToVals_lut_y[21] = { - 0, - XybToVals_off_y, - XybToVals_off_y + 1 * XybToVals_inc_y, - XybToVals_off_y + 2 * XybToVals_inc_y, - XybToVals_off_y + 3 * XybToVals_inc_y, - XybToVals_off_y + 4 * XybToVals_inc_y, - XybToVals_off_y + 5 * XybToVals_inc_y, - XybToVals_off_y + 6 * XybToVals_inc_y, - XybToVals_off_y + 7 * XybToVals_inc_y, - XybToVals_off_y + 8 * XybToVals_inc_y, - XybToVals_off_y + 9 * XybToVals_inc_y, - XybToVals_off_y + 10 * XybToVals_inc_y, - XybToVals_off_y + 11 * XybToVals_inc_y, - XybToVals_off_y + 12 * 
XybToVals_inc_y, - XybToVals_off_y + 13 * XybToVals_inc_y, - XybToVals_off_y + 14 * XybToVals_inc_y, - XybToVals_off_y + 15 * XybToVals_inc_y, - XybToVals_off_y + 16 * XybToVals_inc_y, - XybToVals_off_y + 17 * XybToVals_inc_y, - XybToVals_off_y + 18 * XybToVals_inc_y, - XybToVals_off_y + 19 * XybToVals_inc_y, -}; - -void XybToVals( - double x, double y, double z, - double *valx, double *valy, double *valz) -{ - const double xmul = 0.758304045695; - const double ymul = 2.28148649801; - const double zmul = 1.87816926918; - - *valx = Interpolate(&XybToVals_lut_x[0], 21, x * xmul); - *valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul); - *valz = zmul * z; -} - -__kernel void DiffPrecompute( - __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, - __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, - __global float *mask_x, __global float *mask_y, __global float *mask_b ) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - double valsh0[3] = { 0.0 }; - double valsv0[3] = { 0.0 }; - double valsh1[3] = { 0.0 }; - double valsv1[3] = { 0.0 }; - int ix2; - - int ix = x + xsize * y; - if (x + 1 < xsize) { - ix2 = ix + 1; - } - else { - ix2 = ix - 1; - } - { - double x0 = (xyb0_x[ix] - xyb0_x[ix2]); - double y0 = (xyb0_y[ix] - xyb0_y[ix2]); - double z0 = (xyb0_b[ix] - xyb0_b[ix2]); - XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); - double x1 = (xyb1_x[ix] - xyb1_x[ix2]); - double y1 = (xyb1_y[ix] - xyb1_y[ix2]); - double z1 = (xyb1_b[ix] - xyb1_b[ix2]); - XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); - } - if (y + 1 < ysize) { - ix2 = ix + xsize; - } - else { - ix2 = ix - xsize; - } - { - double x0 = (xyb0_x[ix] - xyb0_x[ix2]); - double y0 = (xyb0_y[ix] - xyb0_y[ix2]); - double z0 = (xyb0_b[ix] - xyb0_b[ix2]); - XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); - double x1 = (xyb1_x[ix] - 
xyb1_x[ix2]); - double y1 = (xyb1_y[ix] - xyb1_y[ix2]); - double z1 = (xyb1_b[ix] - xyb1_b[ix2]); - XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); - } - - double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]); - double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]); - double m = min(sup0, sup1); - mask_x[ix] = (float)(m); - - sup0 = fabs(valsh0[1]) + fabs(valsv0[1]); - sup1 = fabs(valsh1[1]) + fabs(valsv1[1]); - m = min(sup0, sup1); - mask_y[ix] = (float)(m); - - sup0 = fabs(valsh0[2]) + fabs(valsv0[2]); - sup1 = fabs(valsh1[2]) + fabs(valsv1[2]); - m = min(sup0, sup1); - mask_b[ix] = (float)(m); -} - -__kernel void UpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out) +void OpsinAbsorbance(const double in[3], double out[3]) { - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); - - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); - - const int pos_x = res_x * step; - const int pos_y = res_y * step; - - if (pos_y + 8 - step >= ysize) return; - if (pos_x + 8 - step >= xsize) return; - - int s2 = (8 - step) / 2; - - // Upsample and take square root. - float orig_val = diffmap[res_y * res_xsize + res_x]; - - const float kInitialSlope = 100; - // TODO(b/29974893): Until that is fixed do not call sqrt on very small - // numbers. - double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) - ? 
kInitialSlope * orig_val - : sqrt(orig_val); - - for (size_t off_y = 0; off_y < step; ++off_y) { - for (size_t off_x = 0; off_x < step; ++off_x) { - diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val; - } - } + __constant static float g_mix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, + }; + + out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3]; + out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7]; + out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; } -kernel void removeBorder(__global float *in, int in_xsize, int s, int s2, __global float *out) +double EvaluatePolynomial(const double x, __constant const double *coefficients, int n) { - const int x = get_global_id(0); - const int y = get_global_id(1); + double b1 = 0.0; + double b2 = 0.0; - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + for (int i = n - 1; i >= 0; i--) + { + if (i == 0) { + const double x_b1 = x * b1; + b1 = x_b1 - b2 + coefficients[0]; + break; + } + const double x_b1 = x * b1; + const double t = (x_b1 + x_b1) - b2 + coefficients[i]; + b2 = b1; + b1 = t; + } - out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; + return b1; } -kernel void addBorder(__global float *out, int s, int s2, __global float *in) +double Gamma(double v) { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - const double mul1 = 24.8235314874; - out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; - + static __constant double g_gamma_p[5 + 1] = { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, + }; + + static __constant 
double g_gamma_q[5 + 1] = { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, + }; + + const double min_value = 0.770000000000000; + const double max_value = 274.579999999999984; + const double x01 = (v - min_value) / (max_value - min_value); + const double xc = 2.0 * x01 - 1.0; + + const double yp = EvaluatePolynomial(xc, g_gamma_p, 6); + const double yq = EvaluatePolynomial(xc, g_gamma_q, 6); + if (yq == 0.0) return 0.0; + return (float)(yp / yq); } -__kernel void AverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1) +void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - const int row0 = y * xsize; - if (x == 0) // excute once per y - { - img[row0 + 1] += tmp0[row0]; - img[row0 + 0] += tmp0[row0 + 1]; - img[row0 + 2] += tmp0[row0 + 1]; - - img[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; - img[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; - img[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; - - if (y > 0) { - const int rowd1 = row0 - xsize; - img[rowd1 + 1] += tmp1[row0]; - img[rowd1 + 0] += tmp0[row0]; - - img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; - img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; - } - if (y + 1 < ysize) { - const int rowu1 = row0 + xsize; - img[rowu1 + 1] += tmp1[row0]; - img[rowu1 + 0] += tmp0[row0]; - - img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; - img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; - } - } - - if (x >= 2 && x < xsize - 2) - { - img[row0 + x - 1] += tmp0[row0 + x]; - img[row0 + x + 1] += tmp0[row0 + x]; - } - - if (x >= 1 && x < xsize - 1) { - if (y > 0) { - const int rowd1 = row0 - xsize; - img[rowd1 + x + 1] += tmp1[row0 + x]; - img[rowd1 + x + 0] += tmp0[row0 + x]; - img[rowd1 + x - 1] += tmp1[row0 + x]; - } - if (y + 1 < ysize) { - 
const int rowu1 = row0 + xsize; - img[rowu1 + x + 1] += tmp1[row0 + x]; - img[rowu1 + x + 0] += tmp0[row0 + x]; - img[rowu1 + x - 1] += tmp1[row0 + x]; - } - } -} + const double a0 = 1.01611726948; + const double a1 = 0.982482243696; + const double a2 = 1.43571362627; + const double a3 = 0.896039849412; + *valx = a0 * r - a1 * g; + *valy = a2 * r + a3 * g; + *valz = b; +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 5fe8da6d..20f246f3 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -44,25 +44,25 @@ ocl_args_d_t& getOcl(void) LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); } } - ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err); - ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err); - ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err); - ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err); - ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "SquareSample", &err); - ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err); - ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err); - ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err); - ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err); - ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err); - ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err); - ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err); - ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "UpsampleSquareRoot", &err); - ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "addBorder", &err); - ocl.kernel[KERNEL_REMOVEBORDER] = 
clCreateKernel(ocl.program, "removeBorder", &err); - ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err); - ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "edgeDetectorMap", &err); - ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "blockDiffMap", &err); - ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "edgeDetectorLowFreq", &err); + ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareVal", &err); + ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolution", &err); + ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionX", &err); + ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionY", &err); + ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSample", &err); + ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "clDownSample", &err); + ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImage", &err); + ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMask", &err); + ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImage", &err); + ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannels", &err); + ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChange", &err); + ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecompute", &err); + ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRoot", &err); + ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorder", &err); + ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorder", &err); + ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "clAverageAddImage", &err); + ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err); + 
ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err); + ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err); return ocl; } diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index a287c8cc..a25ddc08 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -5,6 +5,14 @@ extern bool g_useOpenCL; extern bool g_checkOpenCL; +void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); + +void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, + float* r2, float* g2, float* b2, + size_t xsize, size_t ysize, + size_t step, + float* result); + void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, ocl_channels xyb1/*in,out*/, size_t xsize, size_t ysize); @@ -27,14 +35,6 @@ void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double bor void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize); -void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); - -void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, - float* r2, float* g2, float* b2, - size_t xsize, size_t ysize, - size_t step, - float* result); - void clCombineChannelsEx( ocl_channels mask, ocl_channels mask_dc, From 7c97e95f2096197250782739a6dbb2019876522f Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 13 May 2017 09:30:50 +0800 Subject: [PATCH 065/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3n=E5=8D=A1=E4=B8=8A?= =?UTF-8?q?=E7=9A=84=E7=BC=96=E8=AF=91=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 49 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index d44f5c07..9d132ae8 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1338,24 +1338,23 @@ void 
ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], } } +__constant static float g_mix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, +}; void OpsinAbsorbance(const double in[3], double out[3]) { - __constant static float g_mix[12] = { - 0.348036746003, - 0.577814843137, - 0.0544556093735, - 0.774145581713, - 0.26922717275, - 0.767247733938, - 0.0366922708552, - 0.920130265014, - 0.0882062883536, - 0.158581714673, - 0.712857943858, - 10.6524069248, - }; - out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3]; out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7]; out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; @@ -1382,18 +1381,18 @@ double EvaluatePolynomial(const double x, __constant const double *coefficients, return b1; } -double Gamma(double v) -{ - static __constant double g_gamma_p[5 + 1] = { - 881.979476556478289, 1496.058452015812463, 908.662212739659481, - 373.566100223287378, 85.840860336314364, 6.683258861509244, - }; +static __constant double g_gamma_p[5 + 1] = { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, +}; - static __constant double g_gamma_q[5 + 1] = { - 12.262350348616792, 20.557285797683576, 12.161463238367844, - 4.711532733641639, 0.899112889751053, 0.035662329617191, - }; +static __constant double g_gamma_q[5 + 1] = { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, +}; +double Gamma(double v) +{ const double min_value = 0.770000000000000; const double max_value = 274.579999999999984; const double x01 = (v - min_value) / (max_value - min_value); From 5eb14f3b5d4065b3acd9c588bb47bc200e3d79cf Mon Sep 17 00:00:00 2001 From: strongtu Date: 
Sat, 13 May 2017 10:50:03 +0800 Subject: [PATCH 066/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- clguetzli/clbutter_comparator.cpp | 17 ++++++ clguetzli/clguetzli.cpp | 89 ++++++++++++++++++++++++++++-- clguetzli/clguetzli.h | 6 ++ clguetzli/clguetzli_comparator.cpp | 44 +++------------ guetzli/processor.cc | 54 +++++++++++++++++- 5 files changed, 166 insertions(+), 44 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 650e4373..1da9a2cd 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -153,6 +153,23 @@ namespace butteraugli std::vector > *mask, std::vector > *mask_dc) { + if (g_useOpenCL) + { + mask->resize(3); + mask_dc->resize(3); + for (int i = 0; i < 3; i++) + { + (*mask)[i].resize(xsize * ysize); + (*mask_dc)[i].resize(xsize * ysize); + } + clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + return; + } + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); if (g_checkOpenCL) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 20f246f3..32d0d77b 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -765,7 +765,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz double scaler = 0.0738288224836; double mul = 20.8029176447; static double lut_x[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + static bool lutx_init = false; + if (!lutx_init) + { + lutx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + } extmul = 0.373995618954; extoff = 1.5307267433; @@ -773,7 +778,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz scaler = 1.1731667845; mul = 16.2447033988; static double lut_y[512]; - MakeMask(extmul, extoff, mul, 
offset, scaler, lut_y); + static bool luty_init = false; + if (!luty_init) + { + luty_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + } extmul = 0.61582234137; extoff = -4.25376118646; @@ -781,7 +791,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz scaler = 0.47434643535; mul = 31.1444967089; static double lut_b[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + static bool lutb_init = false; + if (!lutb_init) + { + lutb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + } extmul = 1.79116943438; extoff = -3.86797479189; @@ -789,7 +804,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz scaler = 0.486575865525; mul = 20.4563479139; static double lut_dcx[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + static bool lutdcx_init = false; + if (!lutdcx_init) + { + lutdcx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + } extmul = 0.212223514236; extoff = -3.65647120524; @@ -797,7 +817,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz scaler = 0.170392660501; mul = 21.6566724788; static double lut_dcy[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + static bool lutdcy_init = false; + if (!lutdcy_init) + { + lutdcy_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + } extmul = 0.349376011816; extoff = -0.894711072781; @@ -805,7 +830,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz scaler = 0.380086095024; mul = 18.0373825149; static double lut_dcb[512]; - MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + static bool lutdcb_init = false; + if (!lutdcb_init) + { + lutdcb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + } size_t channel_size = 512 * 3 * sizeof(double); ocl_channels xyb = ocl.allocMemChannels(channel_size); @@ -876,6 +906,53 @@ void 
clMaskEx(ocl_channels rgb, ocl_channels rgb2, } } +void clMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int channel_size = xsize * ysize * sizeof(float); + + ocl_channels rgb = ocl.allocMemChannels(channel_size); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size); + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + + clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); + clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float 
*r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + memcpy(mask_r, r0_r, channel_size); + memcpy(mask_g, r0_g, channel_size); + memcpy(mask_b, r0_b, channel_size); + memcpy(maskdc_r, r1_r, channel_size); + memcpy(maskdc_g, r1_g, channel_size); + memcpy(maskdc_b, r1_b, channel_size); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + void clCombineChannelsEx( ocl_channels mask, ocl_channels mask_dc, diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index a25ddc08..aa595ab5 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -13,6 +13,12 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t step, float* result); +void clMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b); + void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, ocl_channels xyb1/*in,out*/, size_t xsize, size_t ysize); diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index ce3a9b64..03653754 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -238,36 +238,13 @@ void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_) // out = [YUVYUV....YUVYUV] void ImageToYUV(uint16_t *pixels_, uint8_t *out) { - const int ymin = 0; - const int xmin = 0; - const int ysize = 8; - const int xsize = 8; - const int width_ = 8; - const int height_ = 8; const int stride = 3; - const int yend1 = ymin + ysize; - const int yend0 = std::min(yend1, height_); - int y = ymin; - for (; y < yend0; ++y) { - const int xend1 = xmin + xsize; - const int xend0 = std::min(xend1, width_); - int x = xmin; - int px = y 
* width_ + xmin; - for (; x < xend0; ++x, ++px, out += stride) { + for (int y = 0; y < 8; ++y) { + for (int x = 0; x < 8; ++x) { + int px = y * 8 + x; *out = static_cast((pixels_[px] + 8 - (x & 1)) >> 4); - } - const int offset = -stride; - for (; x < xend1; ++x) { - *out = out[offset]; - out += stride; - } - } - for (; y < yend1; ++y) { - const int offset = -stride * xsize; - for (int x = 0; x < xsize; ++x) { - *out = out[offset]; - out += stride; + out += stride; } } } @@ -305,10 +282,10 @@ void BlockToImage(coeff_t *block, float* r, float* g, float* b) uint8_t yuv[8 * 8 * 3]; ImageToYUV(&pixels[0], &yuv[0]); - ImageToYUV(&pixels[8*8], &yuv[8*8]); - ImageToYUV(&pixels[8*8*2], &yuv[8*8*2]); + ImageToYUV(&pixels[8*8], &yuv[1]); + ImageToYUV(&pixels[8*8*2], &yuv[2]); - YUVToRGB(yuv); + YUVToRGB(yuv); const double* lut = Srgb8ToLinearTable(); for (int i = 0; i < 8 * 8; i++) @@ -341,17 +318,14 @@ namespace guetzli double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block) { - return 0; - int block_x = block_x_ * factor_x_ + off_x; + int block_x = block_x_ * factor_x_ + off_x; int block_y = block_y_ * factor_y_ + off_y; int xmin = 8 * block_x; int ymin = 8 * block_y; int block_ix = off_y * factor_x_ + off_x; const std::vector >& rgb0_c = per_block_pregamma_[block_ix]; - std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); - img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); - + // std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data()); diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 62613d04..c8684b35 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -55,6 +55,8 @@ class Processor { ProcessStats* stats); private: + void CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img); + void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, 
bool stop_early); @@ -434,7 +436,7 @@ void Processor::ComputeBlockZeroingOrder( } float max_err = 0; - +/* for (int iy = 0; iy < factor_y; ++iy) { for (int ix = 0; ix < factor_x; ++ix) { int block_xx = block_x * factor_x + ix; @@ -445,8 +447,8 @@ void Processor::ComputeBlockZeroingOrder( } } } - - /*max_err = */((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block); +*/ + max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block); if (max_err < best_err) { // TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi best_err = max_err; @@ -558,6 +560,52 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace +void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img) +{ + // we only support factor_x == factor_y == 1 + const int width = img->width(); + const int height = img->height(); + const int factor_x = 1; + const int factor_y = 1; + + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; + + comparator_->StartBlockComparisons(); // TOBEREMOVE:³õʼ»¯Ò»Ð©²ÎÊý + std::vector orig_block_batch(num_blocks * kBlockSize); // [block_r block_g block_b] + std::vector block_batch(num_blocks * kBlockSize); // [block_r block_g block_b] + + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; + coeff_t *block = &block_batch[block_ix * kBlockSize]; + + for (int c = 0; c < 3; ++c) + { + img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); // TOBEREMOVE:È¡³ö¶Ô±ÈͼÏñblockϵÊý + + const JPEGComponent& comp = jpg.components[c]; + int jpg_block_ix = block_y * comp.width_in_blocks + block_x; + memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], kDCTBlockSize * 
sizeof(orig_block[0])); // TOBEREMOVE:È¡³öԭʼͼÏñblockϵÊý + } + +/* + std::vector block_order; + block_order.clear(); + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order); // TOBEREMOVE:´«ÈëԭʼblockºÍ¶Ô±ÈͼÏñblock¼ÆËãzeroing order·ÅÈëblock_order + candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); + for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE:°Ñ½á¹û¸³Öµµ½ºòѡϵÊý + candidate_coeffs.push_back(block_order[i].idx); + candidate_coeff_errors.push_back(block_order[i].block_err); + } +*/ + } + } + + comparator_->FinishBlockComparisons(); // TOBEREMOVE:Çå³ý²ÎÊý +} + void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, From f12e272387e3480a0f9ff9cf2827de35cf6da16f Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 13 May 2017 11:43:32 +0800 Subject: [PATCH 067/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=20SelectFrequencyMas?= =?UTF-8?q?kingBatch=20=E5=8C=96=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli_comparator.cpp | 8 +- clguetzli/clguetzli_comparator.h | 2 +- guetzli/processor.cc | 561 ++++++++++++++++++----------- 3 files changed, 350 insertions(+), 221 deletions(-) diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 03653754..5970e6d8 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -316,13 +316,13 @@ namespace guetzli ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); } - double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block) + double ButteraugliComparatorEx::CompareBlockEx(coeff_t* candidate_block) { - int block_x = block_x_ * factor_x_ + off_x; - int block_y = block_y_ * factor_y_ + off_y; + int block_x = block_x_ * factor_x_; + int block_y = block_y_ * factor_y_; 
int xmin = 8 * block_x; int ymin = 8 * block_y; - int block_ix = off_y * factor_x_ + off_x; + int block_ix = 0; const std::vector >& rgb0_c = per_block_pregamma_[block_ix]; // diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h index 778d0532..96642ce6 100644 --- a/clguetzli/clguetzli_comparator.h +++ b/clguetzli/clguetzli_comparator.h @@ -16,7 +16,7 @@ namespace guetzli { void StartBlockComparisons(); void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y); - double CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block); + double CompareBlockEx(coeff_t* candidate_block); protected: std::vector imgOpsinDynamicsBlockList; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index c8684b35..d1607c14 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -55,16 +55,30 @@ class Processor { ProcessStats* stats); private: - void CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img); + + void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early); void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, bool stop_early); + + void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, + const uint8_t comp_mask, + const double target_mul, + bool stop_early, + std::vector &candidate_coeff_offsets, + std::vector& candidate_coeffs, + std::vector &candidate_coeff_errors); + void ComputeBlockZeroingOrder( const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, const int factor_x, const int factor_y, const uint8_t comp_mask, OutputImage* img, std::vector* output_order); + + void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], + const int block_x, const int block_y, std::vector* output_order); + bool SelectQuantMatrix(const JPEGData& jpg_in, const bool 
downsample, int best_q[3][kDCTBlockSize], OutputImage* img); @@ -365,41 +379,6 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, } -void func(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], - const uint8_t comp_mask, guetzli::Params ¶ms_, std::vector > &input_order) -{ - static const uint8_t oldCsf[kDCTBlockSize] = { - 10, 10, 20, 40, 60, 70, 80, 90, - 10, 20, 30, 60, 70, 80, 90, 90, - 20, 30, 60, 70, 80, 90, 90, 90, - 40, 60, 70, 80, 90, 90, 90, 90, - 60, 70, 80, 90, 90, 90, 90, 90, - 70, 80, 90, 90, 90, 90, 90, 90, - 80, 90, 90, 90, 90, 90, 90, 90, - 90, 90, 90, 90, 90, 90, 90, 90, - }; - static const double kWeight[3] = { 1.0, 0.22, 0.20 }; -#include "guetzli/order.inc" - - for (int c = 0; c < 3; ++c) { // TOBEREMOVE:¼ÆËãÊäÈëblockµÄinput_order,·Ç0µÄ´ò·Ö - if (!(comp_mask & (1 << c))) continue; - for (int k = 1; k < kDCTBlockSize; ++k) { - int idx = c * kDCTBlockSize + k; // TOBEREMOVE:ÿ¸ö·ÖÁ¿ÒÀ´Î - if (block[idx] != 0) { - float score; - if (params_.new_zeroing_model) { - score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; - } - else { - score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]); - } - input_order.push_back(std::make_pair(idx, score)); - } - } - } - std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); - -} // REQUIRES: block[c*64...(c*64+63)] is all zero if (comp_mask & (1<* output_order) { - std::vector > input_order; - func(block, orig_block, comp_mask, params_, input_order); + static const uint8_t oldCsf[kDCTBlockSize] = { + 10, 10, 20, 40, 60, 70, 80, 90, + 10, 20, 30, 60, 70, 80, 90, 90, + 20, 30, 60, 70, 80, 90, 90, 90, + 40, 60, 70, 80, 90, 90, 90, 90, + 60, 70, 80, 90, 90, 90, 90, 90, + 70, 80, 90, 90, 90, 90, 90, 90, + 80, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + }; + static const double kWeight[3] = { 1.0, 0.22, 0.20 }; +#include 
"guetzli/order.inc" + std::vector > input_order; + for (int c = 0; c < 3; ++c) { + if (!(comp_mask & (1 << c))) continue; + for (int k = 1; k < kDCTBlockSize; ++k) { + int idx = c * kDCTBlockSize + k; + if (block[idx] != 0) { + float score; + if (params_.new_zeroing_model) { + score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; + } else { + score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * + kWeight[c] / oldCsf[k]); + } + input_order.push_back(std::make_pair(idx, score)); + } + } + } + std::sort(input_order.begin(), input_order.end(), + [](const std::pair& a, const std::pair& b) { + return a.second < b.second; }); + coeff_t processed_block[kBlockSize]; memcpy(processed_block, block, sizeof(processed_block)); @@ -436,7 +446,7 @@ void Processor::ComputeBlockZeroingOrder( } float max_err = 0; -/* + for (int iy = 0; iy < factor_y; ++iy) { for (int ix = 0; ix < factor_x; ++ix) { int block_xx = block_x * factor_x + ix; @@ -447,8 +457,6 @@ void Processor::ComputeBlockZeroingOrder( } } } -*/ - max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block); if (max_err < best_err) { // TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi best_err = max_err; @@ -560,7 +568,7 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace -void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img) +void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early) { // we only support factor_x == factor_y == 1 const int width = img->width(); @@ -576,6 +584,7 @@ void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* std::vector orig_block_batch(num_blocks * kBlockSize); // [block_r block_g block_b] std::vector block_batch(num_blocks * kBlockSize); // [block_r block_g block_b] + // step 1 »ñÈ¡ËùÓÐblock list for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; 
++block_x, ++block_ix) { coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; @@ -583,27 +592,124 @@ void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* for (int c = 0; c < 3; ++c) { - img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); // TOBEREMOVE:È¡³ö¶Ô±ÈͼÏñblockϵÊý + img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); const JPEGComponent& comp = jpg.components[c]; int jpg_block_ix = block_y * comp.width_in_blocks + block_x; memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:È¡³öԭʼͼÏñblockϵÊý } + } + } + + std::vector candidate_coeff_offsets(num_blocks + 1); + std::vector candidate_coeffs; + std::vector candidate_coeff_errors; + + // step 2 ¶Ô±Èÿ¸öblock½á¹û + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; + coeff_t *block = &block_batch[block_ix * kBlockSize]; -/* std::vector block_order; - block_order.clear(); - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order); // TOBEREMOVE:´«ÈëԭʼblockºÍ¶Ô±ÈͼÏñblock¼ÆËãzeroing order·ÅÈëblock_order + + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order); + + // ÒÔÏ´¦ÀíÒÀȻûÓÐbatch»¯£¬ÓÃÓÚÏȼø¶¨ÆäËû¼ÆËã½á¹û candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); - for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE:°Ñ½á¹û¸³Öµµ½ºòѡϵÊý + for (size_t i = 0; i < block_order.size(); ++i) { candidate_coeffs.push_back(block_order[i].idx); candidate_coeff_errors.push_back(block_order[i].block_err); } -*/ } } + // comparator_->FinishBlockComparisons(); // TOBEREMOVE:Çå³ý²ÎÊý + + SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early, + candidate_coeff_offsets, candidate_coeffs, 
candidate_coeff_errors); + +} + +void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], + const int block_x, const int block_y, std::vector* output_order) +{ + static const uint8_t oldCsf[kDCTBlockSize] = { + 10, 10, 20, 40, 60, 70, 80, 90, + 10, 20, 30, 60, 70, 80, 90, 90, + 20, 30, 60, 70, 80, 90, 90, 90, + 40, 60, 70, 80, 90, 90, 90, 90, + 60, 70, 80, 90, 90, 90, 90, 90, + 70, 80, 90, 90, 90, 90, 90, 90, + 80, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + }; + static const double kWeight[3] = { 1.0, 0.22, 0.20 }; +#include "guetzli/order.inc" + std::vector > input_order; + for (int c = 0; c < 3; ++c) { + for (int k = 1; k < kDCTBlockSize; ++k) { + int idx = c * kDCTBlockSize + k; + if (block[idx] != 0) { + float score; + if (params_.new_zeroing_model) { + score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; + } + else { + score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]); + } + input_order.push_back(std::make_pair(idx, score)); + } + } + } + std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); + + coeff_t processed_block[kBlockSize]; + memcpy(processed_block, block, sizeof(processed_block)); + + comparator_->SwitchBlock(block_x, block_y, 1, 1); + + while (!input_order.empty()) { + float best_err = 1e17f; + int best_i = 0; + for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, input_order.size()); ++i) + { + coeff_t candidate_block[kBlockSize]; + memcpy(candidate_block, processed_block, sizeof(candidate_block)); + + const int idx = input_order[i].first; + + candidate_block[idx] = 0; // TOBEREMOVE:¶Ô±ÈblockµÄÅÅÐòµÃ·ÖǰiµÍµÄÖÃ0(i¸ù¾Ýinput_orderÊý¾Ý±ä»¯¶ø±ä»¯)£¬²¢ÏÈÉèÖûضԱÈͼÏñµÄÈý¸ö·ÖÁ¿¶ÔÓ¦blockÖÐÈ¥£¬ºóÐøÔÙ×ö¶Ô±È²ÉÓᣠ+ + float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(candidate_block); + if (max_err < best_err) { // 
TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi + best_err = max_err; + best_i = i; + } + } + + int idx = input_order[best_i].first; + processed_block[idx] = 0; + input_order.erase(input_order.begin() + best_i); + + output_order->push_back({ idx, best_err }); // TOBEREMOVE:½«ÉÏÃæ¼ÆËã³öÀ´µÄ×îС´íÎóµÄidx£¬¶ÔÓ¦µ½¶Ô±ÈblockÖеĶÔӦλÖÃÕæÕýµÄÖÃΪ0,ÒÆ³ýinput_orderÏ¼´Ñ¡È¡µ±Ç°Öµ£¬·ÅÈëoutput_order,²¢ÕýʽµÄÉèÖõ½¶Ô±ÈͼÏñÖÐÈ¥¡£ + } + + // TOBEREMOVE:×îÖÕÒÆ³ýerrÊý´óÓÚerrorÏÞÖÆµÄÏî·µ»Ø£¬²¢»¹Ô­¶Ô±ÈͼÏñµ½Ô­Ê¼Öµ¡£ + // Make the block error values monotonic. + float min_err = 1e10; + for (int i = output_order->size() - 1; i >= 0; --i) { + min_err = std::min(min_err, (*output_order)[i].block_err); + (*output_order)[i].block_err = min_err; + } + // Cut off at the block error limit. + size_t num = 0; + while (num < output_order->size() && + (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) { + ++num; + } + output_order->resize(num); } void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, @@ -612,7 +718,6 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, bool stop_early) { const int width = img->width(); const int height = img->height(); - const int ncomp = jpg.components.size(); const int last_c = Log2FloorNonZero(comp_mask); if (static_cast(last_c) >= jpg.components.size()) return; const int factor_x = img->component(last_c).factor_x(); @@ -659,183 +764,207 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, comparator_->FinishBlockComparisons(); // TOBEREMOVE:Çå³ý²ÎÊý candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); - std::vector ac_histograms(ncomp); - int jpg_header_size, dc_size; - { - JPEGData jpg_out = jpg; - img->SaveToJpegData(&jpg_out); - jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata); - dc_size = EstimateDCSize(jpg_out); - BuildACHistograms(jpg_out, &ac_histograms[0]); - } - std::vector ac_depths; - int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); - 
int base_size = jpg_header_size + dc_size + ac_histogram_size + - EntropyCodedDataSize(ac_histograms, ac_depths); - int prev_size = base_size; - - std::vector max_block_error(num_blocks); - std::vector last_indexes(num_blocks); - - bool first_up_iter = true; - for (int direction : {1, -1}) { - for (;;) { - if (stop_early && direction == -1) { - if (prev_size > 1.01 * final_output_->jpeg_data.size()) { - // If we are down-adjusting the error, the output size will only keep - // increasing. - // TODO(user): Do this check always by comparing only the size - // of the currently processed components. - break; - } - } - std::vector > global_order; - int blocks_to_change; - std::vector block_weight; - for (int rblock = 1; rblock <= 4; ++rblock) { - block_weight = std::vector(num_blocks); - std::vector distmap(width * height); - if (!first_up_iter) { - distmap = comparator_->distmap(); - } - comparator_->ComputeBlockErrorAdjustmentWeights( - direction, rblock, target_mul, factor_x, factor_y, distmap, - &block_weight); - global_order.clear(); - blocks_to_change = 0; - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - const int last_index = last_indexes[block_ix]; - const int offset = candidate_coeff_offsets[block_ix]; - const int num_candidates = - candidate_coeff_offsets[block_ix + 1] - offset; - const float* candidate_errors = &candidate_coeff_errors[offset]; - const float max_err = max_block_error[block_ix]; - if (block_weight[block_ix] == 0) { - continue; + SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early, + candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); +} + +void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, + const uint8_t comp_mask, + const double target_mul, + bool stop_early, + std::vector &candidate_coeff_offsets, + std::vector& candidate_coeffs, + std::vector &candidate_coeff_errors) +{ + const int 
ncomp = jpg.components.size(); + const int width = img->width(); + const int height = img->height(); + const int last_c = Log2FloorNonZero(comp_mask); + if (static_cast(last_c) >= jpg.components.size()) return; + const int factor_x = img->component(last_c).factor_x(); + const int factor_y = img->component(last_c).factor_y(); + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; + + std::vector ac_histograms(ncomp); + int jpg_header_size, dc_size; + { + JPEGData jpg_out = jpg; + img->SaveToJpegData(&jpg_out); + jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata); + dc_size = EstimateDCSize(jpg_out); + BuildACHistograms(jpg_out, &ac_histograms[0]); + } + std::vector ac_depths; + int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); + int base_size = jpg_header_size + dc_size + ac_histogram_size + + EntropyCodedDataSize(ac_histograms, ac_depths); + int prev_size = base_size; + + std::vector max_block_error(num_blocks); + std::vector last_indexes(num_blocks); + + bool first_up_iter = true; + for (int direction : {1, -1}) { + for (;;) { + if (stop_early && direction == -1) { + if (prev_size > 1.01 * final_output_->jpeg_data.size()) { + // If we are down-adjusting the error, the output size will only keep + // increasing. + // TODO(user): Do this check always by comparing only the size + // of the currently processed components. + break; + } } - if (direction > 0) { - for (size_t i = last_index; i < num_candidates; ++i) { - float val = ((candidate_errors[i] - max_err) / - block_weight[block_ix]); - global_order.push_back(std::make_pair(block_ix, val)); - } - blocks_to_change += (last_index < num_candidates ? 
1 : 0); - } else { - for (int i = last_index - 1; i >= 0; --i) { - float val = ((max_err - candidate_errors[i]) / - block_weight[block_ix]); - global_order.push_back(std::make_pair(block_ix, val)); - } - blocks_to_change += (last_index > 0 ? 1 : 0); + std::vector > global_order; + int blocks_to_change; + std::vector block_weight; + for (int rblock = 1; rblock <= 4; ++rblock) { + block_weight = std::vector(num_blocks); + std::vector distmap(width * height); + if (!first_up_iter) { + distmap = comparator_->distmap(); + } + comparator_->ComputeBlockErrorAdjustmentWeights( + direction, rblock, target_mul, factor_x, factor_y, distmap, + &block_weight); + global_order.clear(); + blocks_to_change = 0; + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + const int last_index = last_indexes[block_ix]; + const int offset = candidate_coeff_offsets[block_ix]; + const int num_candidates = + candidate_coeff_offsets[block_ix + 1] - offset; + const float* candidate_errors = &candidate_coeff_errors[offset]; + const float max_err = max_block_error[block_ix]; + if (block_weight[block_ix] == 0) { + continue; + } + if (direction > 0) { + for (size_t i = last_index; i < num_candidates; ++i) { + float val = ((candidate_errors[i] - max_err) / + block_weight[block_ix]); + global_order.push_back(std::make_pair(block_ix, val)); + } + blocks_to_change += (last_index < num_candidates ? 1 : 0); + } + else { + for (int i = last_index - 1; i >= 0; --i) { + float val = ((max_err - candidate_errors[i]) / + block_weight[block_ix]); + global_order.push_back(std::make_pair(block_ix, val)); + } + blocks_to_change += (last_index > 0 ? 1 : 0); + } + } + } + if (!global_order.empty()) { + // If we found something to adjust with the current block adjustment + // radius, we can stop and adjust the blocks we have. 
+ break; + } } - } - } - if (!global_order.empty()) { - // If we found something to adjust with the current block adjustment - // radius, we can stop and adjust the blocks we have. - break; - } - } - if (global_order.empty()) { - break; - } + if (global_order.empty()) { + break; + } - std::sort(global_order.begin(), global_order.end(), + std::sort(global_order.begin(), global_order.end(), [](const std::pair& a, - const std::pair& b) { - return a.second < b.second; }); + const std::pair& b) { + return a.second < b.second; }); - double rel_size_delta = direction > 0 ? 0.01 : 0.0005; - if (direction > 0 && comparator_->DistanceOK(1.0)) { - rel_size_delta = 0.05; - } - double min_size_delta = base_size * rel_size_delta; - - float coeffs_to_change_per_block = - direction > 0 ? 2.0f : factor_x * factor_y * 0.2f; - int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change; - - if (first_up_iter) { - const float limit = 0.75f * comparator_->BlockErrorLimit(); - auto it = std::partition_point(global_order.begin(), global_order.end(), - [=](const std::pair& a) { - return a.second < limit; }); - min_coeffs_to_change = std::max(min_coeffs_to_change, - it - global_order.begin()); - first_up_iter = false; - } + double rel_size_delta = direction > 0 ? 0.01 : 0.0005; + if (direction > 0 && comparator_->DistanceOK(1.0)) { + rel_size_delta = 0.05; + } + double min_size_delta = base_size * rel_size_delta; + + float coeffs_to_change_per_block = + direction > 0 ? 
2.0f : factor_x * factor_y * 0.2f; + int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change; + + if (first_up_iter) { + const float limit = 0.75f * comparator_->BlockErrorLimit(); + auto it = std::partition_point(global_order.begin(), global_order.end(), + [=](const std::pair& a) { + return a.second < limit; }); + min_coeffs_to_change = std::max(min_coeffs_to_change, + it - global_order.begin()); + first_up_iter = false; + } - std::set changed_blocks; - float val_threshold = 0.0; - int changed_coeffs = 0; - int est_jpg_size = prev_size; - for (size_t i = 0; i < global_order.size(); ++i) { - const int block_ix = global_order[i].first; - const int block_x = block_ix % block_width; - const int block_y = block_ix / block_width; - const int last_idx = last_indexes[block_ix]; - const int offset = candidate_coeff_offsets[block_ix]; - const uint8_t* candidates = &candidate_coeffs[offset]; - const int idx = candidates[last_idx + std::min(direction, 0)]; - const int c = idx / kDCTBlockSize; - const int k = idx % kDCTBlockSize; - const int* quant = img->component(c).quant(); - const JPEGComponent& comp = jpg.components[c]; - const int jpg_block_ix = block_y * comp.width_in_blocks + block_x; - const int newval = direction > 0 ? 
0 : Quantize( - comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]); - coeff_t block[kDCTBlockSize] = { 0 }; - img->component(c).GetCoeffBlock(block_x, block_y, block); - UpdateACHistogram(-1, block, quant, &ac_histograms[c]); - block[k] = newval; - UpdateACHistogram(1, block, quant, &ac_histograms[c]); - img->component(c).SetCoeffBlock(block_x, block_y, block); - last_indexes[block_ix] += direction; - changed_blocks.insert(block_ix); - val_threshold = global_order[i].second; - ++changed_coeffs; - static const int kEntropyCodeUpdateFreq = 10; - if (i % kEntropyCodeUpdateFreq == 0) { - ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); - } - est_jpg_size = jpg_header_size + dc_size + ac_histogram_size + - EntropyCodedDataSize(ac_histograms, ac_depths); - if (changed_coeffs > min_coeffs_to_change && - std::abs(est_jpg_size - prev_size) > min_size_delta) { - break; - } - } - size_t global_order_size = global_order.size(); - std::vector>().swap(global_order); + std::set changed_blocks; + float val_threshold = 0.0; + int changed_coeffs = 0; + int est_jpg_size = prev_size; + for (size_t i = 0; i < global_order.size(); ++i) { + const int block_ix = global_order[i].first; + const int block_x = block_ix % block_width; + const int block_y = block_ix / block_width; + const int last_idx = last_indexes[block_ix]; + const int offset = candidate_coeff_offsets[block_ix]; + const uint8_t* candidates = &candidate_coeffs[offset]; + const int idx = candidates[last_idx + std::min(direction, 0)]; + const int c = idx / kDCTBlockSize; + const int k = idx % kDCTBlockSize; + const int* quant = img->component(c).quant(); + const JPEGComponent& comp = jpg.components[c]; + const int jpg_block_ix = block_y * comp.width_in_blocks + block_x; + const int newval = direction > 0 ? 
0 : Quantize( + comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]); + coeff_t block[kDCTBlockSize] = { 0 }; + img->component(c).GetCoeffBlock(block_x, block_y, block); + UpdateACHistogram(-1, block, quant, &ac_histograms[c]); + block[k] = newval; + UpdateACHistogram(1, block, quant, &ac_histograms[c]); + img->component(c).SetCoeffBlock(block_x, block_y, block); + last_indexes[block_ix] += direction; + changed_blocks.insert(block_ix); + val_threshold = global_order[i].second; + ++changed_coeffs; + static const int kEntropyCodeUpdateFreq = 10; + if (i % kEntropyCodeUpdateFreq == 0) { + ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); + } + est_jpg_size = jpg_header_size + dc_size + ac_histogram_size + + EntropyCodedDataSize(ac_histograms, ac_depths); + if (changed_coeffs > min_coeffs_to_change && + std::abs(est_jpg_size - prev_size) > min_size_delta) { + break; + } + } + size_t global_order_size = global_order.size(); + std::vector>().swap(global_order); - for (int i = 0; i < num_blocks; ++i) { - max_block_error[i] += block_weight[i] * val_threshold * direction; - } + for (int i = 0; i < num_blocks; ++i) { + max_block_error[i] += block_weight[i] * val_threshold * direction; + } - ++stats_->counters[kNumItersCnt]; - ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt]; - std::string encoded_jpg; - { - JPEGData jpg_out = jpg; - img->SaveToJpegData(&jpg_out); - OutputJpeg(jpg_out, &encoded_jpg); - } - GUETZLI_LOG(stats_, - "Iter %2d: %s(%d) %s Coeffs[%d/%zd] " - "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]", - stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(), - comp_mask, direction > 0 ? 
"up" : "down", changed_coeffs, - global_order_size, changed_blocks.size(), - blocks_to_change, num_blocks, val_threshold, - encoded_jpg.size(), - 100.0 - (100.0 * est_jpg_size) / encoded_jpg.size()); - comparator_->Compare(*img); - MaybeOutput(encoded_jpg); - prev_size = est_jpg_size; + ++stats_->counters[kNumItersCnt]; + ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt]; + std::string encoded_jpg; + { + JPEGData jpg_out = jpg; + img->SaveToJpegData(&jpg_out); + OutputJpeg(jpg_out, &encoded_jpg); + } + GUETZLI_LOG(stats_, + "Iter %2d: %s(%d) %s Coeffs[%d/%zd] " + "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]", + stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(), + comp_mask, direction > 0 ? "up" : "down", changed_coeffs, + global_order_size, changed_blocks.size(), + blocks_to_change, num_blocks, val_threshold, + encoded_jpg.size(), + 100.0 - (100.0 * est_jpg_size) / encoded_jpg.size()); + comparator_->Compare(*img); + MaybeOutput(encoded_jpg); + prev_size = est_jpg_size; + } } - } } bool IsGrayscale(const JPEGData& jpg) { From dae16735416e0d0a31f9cf7a1125a7f0c19def33 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 15 May 2017 02:47:42 +0800 Subject: [PATCH 068/189] =?UTF-8?q?=E5=BB=BA=E7=AB=8Bcl=E7=AB=AF=E7=9A=84?= =?UTF-8?q?=E6=89=B9=E9=87=8F=E5=8C=96ComputeZeroingOrder=E9=83=BD?= =?UTF-8?q?=E6=9C=89=E5=93=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 229 ++++++++++++++++++++++++++++- clguetzli/clguetzli_comparator.cpp | 103 +++++++++++-- clguetzli/clguetzli_comparator.h | 13 +- guetzli/processor.cc | 10 +- 4 files changed, 330 insertions(+), 25 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 9d132ae8..906b2149 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -8,6 +8,7 @@ #define kBlockEdge 8 #define kBlockSize (kBlockEdge * kBlockEdge) +#define kDCTBlockSize (kBlockEdge * kBlockEdge) #define 
kBlockEdgeHalf (kBlockEdge / 2) #define kBlockHalf (kBlockEdge * kBlockEdgeHalf) @@ -1413,4 +1414,230 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double * *valx = a0 * r - a1 * g; *valy = a2 * r + a3 * g; *valz = b; -} \ No newline at end of file +} + + +///================================================== +typedef struct __IntFloatPair +{ + int idx; + float err; +}IntFloatPair, DCTScoreData, CoeffData; + +typedef int16 coeff_t; + +typedef struct __IntFloatPairList +{ + int size; + IntFloatPair *pData; +}IntFloatPairList; + +// chrisk todo +// return size +int list_push_back(IntFloatPairList* list, int i, float f) +{ + +} + +// chrisk todo +// remove idx and return size +int list_erase(IntFloatPairList* list, int idx) +{ +} + +// chrisk todo +int SortInputOrder(DCTScoreData* input_order, int size) +{ +/* + std::sort(input_order.begin(), input_order.end(), + [](const std::pair& a, const std::pair& b) { + return a.second < b.second; }); +*/ +} + +// chrisk todo +// return the count of Non-zero item +int MakeInputOrder(__global coeff_t *orig_block, DCTScoreData *input_order, int size) +{ +/* + static const double kWeight[3] = { 1.0, 0.22, 0.20 }; +#include "guetzli/order.inc" + std::vector > input_order; + for (int c = 0; c < 3; ++c) { + if (!(comp_mask & (1 << c))) continue; + for (int k = 1; k < kDCTBlockSize; ++k) { + int idx = c * kDCTBlockSize + k; + if (block[idx] != 0) { + float score; + if (params_.new_zeroing_model) { + score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; + } + else { + score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * + kWeight[c] / oldCsf[k]); + } + input_order.push_back(std::make_pair(idx, score)); + } + } + } +*/ + return SortInputOrder(input_order, size); +} + +// chrisk todo +void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b) +{ + +} + +// ian todo +void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio) +{ +} + + +// ian 
todo +void OpsinDynamicsImageBlock(float *r, float *g, float *b, + float *r_blurred, float *g_blurred, float *b_blurred, + int size) +{ + +} + +// strong todo +void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, + float *xyb1_x, float *xyb1_y, float *xyb1_b, + float *c0_x, float *c0_y, float *c0_b, + float *c1_x, float *c1_y, float *c1_b, + int xsize, int ysize) +{ +} + +// strong todo +float CompareBlockEx(coeff_t *candidate_block, float* orig_image_block, float* mask_scale_block) +{ + float image_block[3 * kDCTBlockSize]; + float *r1 = image_block; + float *g1 = &image_block[kDCTBlockSize]; + float *b1 = &image_block[2 * kDCTBlockSize]; + BlockToImage(candidate_block, r1, g1, b1); + + float *r0 = orig_image_block; + float *g0 = &orig_image_block[kDCTBlockSize]; + float *b0 = &orig_image_block[2 * kDCTBlockSize]; + + float *cr0, *cg0, *cb0; + float *cr1, *cg1, *cb1; + + float *r0_blurred, *g0_blurred, *b0_blurred; + float *r1_blurred, *g1_blurred, *b1_blurred; + + //BlurEx(r0,.. + //BlurEx + //BlurEx + //BlurEx. 
+ OpsinDynamicsImageBlock(r0, g0, b0, r0_blurred, g0_blurred, b0_blurred, kDCTBlockSize); + OpsinDynamicsImageBlock(r1, g1, b1, r1_blurred, g1_blurred, b1_blurred, kDCTBlockSize); + + MaskHighIntensityChangeBlock(r0, g0, b0, r1, g1, b1, cr0, cg0, cb0, cr1, cg1, cb1, 8, 8); + { + double b0[3 * kDCTBlockSize]; + double b1[3 * kDCTBlockSize]; + /* + for (int c = 0; c < 3; ++c) { + for (int ix = 0; ix < kDCTBlockSize; ++ix) { + b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; + b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; + } + } + */ + double diff_xyz_dc[3] = { 0.0 }; + double diff_xyz_ac[3] = { 0.0 }; + double diff_xyz_edge_dc[3] = { 0.0 }; + + ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); + + double diff = 0.0; + double diff_edge = 0.0; + /* + for (int c = 0; c < 3; ++c) { + diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; + diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; + diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; + } + const double kEdgeWeight = 0.05; + return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); + */ + } + return 0; +} + +// strong todo +__kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/, + __global coeff_t *block_list/*in*/, + __global float *orig_image/*in*/, + __global CoeffData *output_order_list/*out*/) +{ + int block_idx = get_global_id(0); + + __global coeff_t *orig_block = orig_block_list + block_idx * kBlockSize; + __global coeff_t *block = block_list + block_idx * kBlockSize; + + DCTScoreData input_order_data[kBlockSize]; + CoeffData output_order_data[kBlockSize]; + + MakeInputOrder(orig_block, input_order_data, kBlockSize); + IntFloatPairList input_order = { kBlockSize, input_order_data }; + IntFloatPairList output_order = { kBlockSize, output_order_data }; + + + coeff_t processed_block[kBlockSize]; + // memcpy(processed_block, block, sizeof(processed_block); + + while (input_order.size > 0) + { + float 
best_err = 1e17f; + int best_i = 0; + for (int i = 0; i < min(3, input_order.size); i++) + { + coeff_t candidate_block[kBlockSize]; + // memcpy(candidate_block, processed_block, sizeof(candidate_block); + + const int idx = input_order.pData[i].idx; + + candidate_block[idx] = 0; + + float max_err = CompareBlockEx(candidate_block, 0, 0); + if (max_err < best_err) + { + best_err = max_err; + best_i = i; + } + } + + int idx = input_order.pData[best_i].idx; + processed_block[idx] = 0; + list_erase(&input_order, best_i); + + list_push_back(&output_order, idx, best_err); + } + // ×¢Òâoutput_orderÕâÀïµÄresize¾ÍÊǰÑβ²¿µÄÖÃλ0 +/* + // TOBEREMOVE:×îÖÕÒÆ³ýerrÊý´óÓÚerrorÏÞÖÆµÄÏî·µ»Ø£¬²¢»¹Ô­¶Ô±ÈͼÏñµ½Ô­Ê¼Öµ¡£ + // Make the block error values monotonic. + float min_err = 1e10; + for (int i = output_order->size() - 1; i >= 0; --i) { + min_err = std::min(min_err, (*output_order)[i].block_err); + (*output_order)[i].block_err = min_err; + } + // Cut off at the block error limit. + size_t num = 0; + while (num < output_order->size() && + (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) { + ++num; + } + output_order->resize(num); +*/ + + // memcpy(output_data_list + block_idx * kBlockSize +} diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 5970e6d8..3babe180 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -309,26 +309,89 @@ namespace guetzli void ButteraugliComparatorEx::StartBlockComparisons() { ButteraugliComparator::StartBlockComparisons(); + + const int width = width_; + const int height = height_; + const int factor_x = 1; + const int factor_y = 1; + + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; + + const double* lut = Srgb8ToLinearTable(); + + imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize); + 
imgMaskXyzScaleBlockList.resize(num_blocks * 3); + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) + { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) + { + float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + float* curG = curR + kDCTBlockSize; + float* curB = curG + kDCTBlockSize; + + for (int iy = 0, i = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix, ++i) { + int x = std::min(8 * block_x + ix, width - 1); + int y = std::min(8 * block_y + iy, height - 1); + int px = y * width + x; + + curR[i] = lut[rgb_orig_[3 * px]]; + curG[i] = lut[rgb_orig_[3 * px + 1]]; + curB[i] = lut[rgb_orig_[3 * px + 2]]; + } + } + + int xmin = block_x * 8; + int ymin = block_y * 8; + + imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin]; + imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin]; + imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin]; + } + } + + } + void ButteraugliComparatorEx::FinishBlockComparisons() { + ButteraugliComparator::FinishBlockComparisons(); + + imgOpsinDynamicsBlockList.clear(); + imgMaskXyzScaleBlockList.clear(); + } + void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) { + block_x_ = block_x; + block_y_ = block_y; + factor_x_ = factor_x; + factor_y_ = factor_y; + return; ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); } double ButteraugliComparatorEx::CompareBlockEx(coeff_t* candidate_block) { - int block_x = block_x_ * factor_x_; - int block_y = block_y_ * factor_y_; - int xmin = 8 * block_x; - int ymin = 8 * block_y; - int block_ix = 0; - const std::vector >& rgb0_c = per_block_pregamma_[block_ix]; + int block_ix = getCurrentBlockIdx(); + + float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + + // Õâ¸öÄڴ濽±´´ýÓÅ»¯£¬µ«²»ÊÇÏÖÔÚ + std::vector< std::vector > rgb0_c; + rgb0_c.resize(3); + for (int i = 0; i < 3; i++) + 
{ + rgb0_c[i].resize(kDCTBlockSize); + memcpy(rgb0_c[i].data(), block_opsin + i*kDCTBlockSize, kDCTBlockSize * sizeof(float)); + } // std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data()); + ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c); ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); std::vector > rgb0 = rgb0_c; @@ -349,20 +412,28 @@ namespace guetzli double diff_xyz_edge_dc[3] = { 0.0 }; ::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); - double scale[3]; - for (int c = 0; c < 3; ++c) { - scale[c] = mask_xyz_[c][ymin * width_ + xmin]; - } - - static const double kEdgeWeight = 0.05; - double diff = 0.0; double diff_edge = 0.0; for (int c = 0; c < 3; ++c) { - diff += diff_xyz_dc[c] * scale[c]; - diff += diff_xyz_ac[c] * scale[c]; - diff_edge += diff_xyz_edge_dc[c] * scale[c]; + diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; + diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; + diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; } + const double kEdgeWeight = 0.05; return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); } + + + int ButteraugliComparatorEx::getCurrentBlockIdx(void) + { + const int width = width_; + const int height = height_; + const int factor_x = 1; + const int factor_y = 1; + + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + + return block_y_ * block_width + block_x_; + } } diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h index 96642ce6..353eff59 100644 --- a/clguetzli/clguetzli_comparator.h +++ b/clguetzli/clguetzli_comparator.h @@ -11,15 +11,16 @@ namespace guetzli { const std::vector* rgb, const float target_distance, ProcessStats* stats); - //void Compare(const OutputImage& img) override; - - void 
StartBlockComparisons(); - void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y); + void StartBlockComparisons() override; + void FinishBlockComparisons() override; + void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override; double CompareBlockEx(coeff_t* candidate_block); - + private: + int getCurrentBlockIdx(void); protected: - std::vector imgOpsinDynamicsBlockList; + std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount + std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount }; } \ No newline at end of file diff --git a/guetzli/processor.cc b/guetzli/processor.cc index d1607c14..c4d43bef 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -613,7 +613,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im std::vector block_order; - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order); + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order); // ÒÔÏ´¦ÀíÒÀȻûÓÐbatch»¯£¬ÓÃÓÚÏȼø¶¨ÆäËû¼ÆËã½á¹û candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); @@ -626,6 +626,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im // comparator_->FinishBlockComparisons(); // TOBEREMOVE:Çå³ý²ÎÊý + candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early, candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); @@ -665,6 +666,10 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const } std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); + if (input_order.size() > 10) + { + int i = 0; + } coeff_t processed_block[kBlockSize]; memcpy(processed_block, block, sizeof(processed_block)); @@ -1060,7 +1065,8 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, img.ApplyGlobalQuantization(best_q); 
if (!downsample) { - SelectFrequencyMasking(jpg, &img, 7, 1.0, false); + //SelectFrequencyMasking(jpg, &img, 7, 1.0, false); + SelectFrequencyMaskingBatch(jpg, &img, 1.0, false); } else { const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f; SelectFrequencyMasking(jpg, &img, 1, ymul, false); From 148927eb42889308670c3ac0ff390acb8f3f012a Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 15 May 2017 10:05:12 +0800 Subject: [PATCH 069/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=88=86=E5=B7=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 906b2149..d259327a 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1493,6 +1493,7 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b) // ian todo void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio) { + // ²Î¿¼clBlurEx2µÄʵÏÖ£¬sigma = 1.1£¬Õâʱstep¡¢diff¶¼½«ÌØ»¯Îª¹Ì¶¨Öµ } @@ -1504,7 +1505,7 @@ void OpsinDynamicsImageBlock(float *r, float *g, float *b, } -// strong todo +// chrisk todo void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, float *xyb1_x, float *xyb1_y, float *xyb1_b, float *c0_x, float *c0_y, float *c0_b, From 55f60a4a40ade3cf015d096f36f1f94eae9ea15c Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 15 May 2017 10:20:10 +0800 Subject: [PATCH 070/189] =?UTF-8?q?=E5=88=86=E9=85=8D=E5=B7=A5=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index d259327a..9db59cf1 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1418,6 +1418,9 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double * 
///================================================== +// ¸÷룬ÒÔÏÂÕâЩº¯Êý¾ÍÊÇΪÁËʵÏÖButteraugliComparatorEx::CompareBlockEx + +// IntFloatPairÊÇΪÁËÄ£Äâoutput_order input_orderµÄvector£¬µ«ÊÇ´óС¹Ì¶¨Îª8x8 typedef struct __IntFloatPair { int idx; @@ -1487,7 +1490,7 @@ int MakeInputOrder(__global coeff_t *orig_block, DCTScoreData *input_order, int // chrisk todo void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b) { - + // ²Î¿¼clguetzli_comparator.cpp : BlockToImage } // ian todo From 16e27abc880cd8bee7858683317d8992d50d87a1 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 15 May 2017 16:29:51 +0800 Subject: [PATCH 071/189] clComputeBlockZeroingOrder CompareBlockEx --- clguetzli/clguetzli.cl | 189 +++++++++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 73 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 9db59cf1..7e8aabc6 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1494,7 +1494,8 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b) } // ian todo -void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio) +// ¼ÆËã½á¹ûÊä³öµ½output +void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) { // ²Î¿¼clBlurEx2µÄʵÏÖ£¬sigma = 1.1£¬Õâʱstep¡¢diff¶¼½«ÌØ»¯Îª¹Ì¶¨Öµ } @@ -1517,75 +1518,111 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, { } +void func(float *r, float *g, float *b, float *r_blurred, float *g_blurred, float *b_blurred) +{ + //BlurEx(r, g, b, r_blurred, g_blurred, +} + +typedef union ocl_channels_t +{ + struct + { + float * r; + float * g; + float * b; + }; + + float *ch[3]; +}ocl_channels; + +void floatcopy(float *dst, float *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + +void CalcOpsinDynamicsImage(ocl_channels rgb) +{ + float rgb_blurred[3][kDCTBlockSize]; + for (int i = 0; i < 3; i++) + { + BlurEx(rgb.ch[i], 8, 8, 1.1, 
0, rgb_blurred[i]); + } + OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); +} + // strong todo -float CompareBlockEx(coeff_t *candidate_block, float* orig_image_block, float* mask_scale_block) +float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, __global float* mask_scale_block) { - float image_block[3 * kDCTBlockSize]; - float *r1 = image_block; - float *g1 = &image_block[kDCTBlockSize]; - float *b1 = &image_block[2 * kDCTBlockSize]; - BlockToImage(candidate_block, r1, g1, b1); - - float *r0 = orig_image_block; - float *g0 = &orig_image_block[kDCTBlockSize]; - float *b0 = &orig_image_block[2 * kDCTBlockSize]; - - float *cr0, *cg0, *cb0; - float *cr1, *cg1, *cb1; - - float *r0_blurred, *g0_blurred, *b0_blurred; - float *r1_blurred, *g1_blurred, *b1_blurred; - - //BlurEx(r0,.. - //BlurEx - //BlurEx - //BlurEx. - OpsinDynamicsImageBlock(r0, g0, b0, r0_blurred, g0_blurred, b0_blurred, kDCTBlockSize); - OpsinDynamicsImageBlock(r1, g1, b1, r1_blurred, g1_blurred, b1_blurred, kDCTBlockSize); - - MaskHighIntensityChangeBlock(r0, g0, b0, r1, g1, b1, cr0, cg0, cb0, cr1, cg1, cb1, 8, 8); + float rgb0[3][kDCTBlockSize]; + float rgb1[3][kDCTBlockSize]; { - double b0[3 * kDCTBlockSize]; - double b1[3 * kDCTBlockSize]; - /* - for (int c = 0; c < 3; ++c) { - for (int ix = 0; ix < kDCTBlockSize; ++ix) { - b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; - b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; - } - } - */ - double diff_xyz_dc[3] = { 0.0 }; - double diff_xyz_ac[3] = { 0.0 }; - double diff_xyz_edge_dc[3] = { 0.0 }; - - ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); - - double diff = 0.0; - double diff_edge = 0.0; - /* - for (int c = 0; c < 3; ++c) { - diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; - diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; - diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; - } 
- const double kEdgeWeight = 0.05; - return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); - */ + float rgb0_data[3*kDCTBlockSize]; + ocl_channels rgb0_c = { rgb0_data, &rgb0_data[kDCTBlockSize], &rgb0_data[2 * kDCTBlockSize] }; + for (int i = 0; i < 3*kDCTBlockSize; i++) + { + rgb0_data[i] = orig_image_block[i]; + } + + float image_block[3 * kDCTBlockSize]; + ocl_channels rgb1_c = { image_block, &image_block[kDCTBlockSize], &image_block[2 * kDCTBlockSize] }; + BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b); + + CalcOpsinDynamicsImage(rgb0_c); + CalcOpsinDynamicsImage(rgb1_c); + + floatcopy(rgb0, rgb0_data, 3 * kDCTBlockSize); + floatcopy(rgb1, image_block, 3 * kDCTBlockSize); + + MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2], + rgb1[0], rgb1[1], rgb1[2], + rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2], + rgb1_c.ch[0], rgb1_c.ch[1], rgb1_c.ch[2], + 8, 8); + + } + + // ÕâÀïΪɶҪ°Ñfloatת³Édouble²ÅÄܼÌÐø×ö¼ÆË㣿 + double b0[3 * kDCTBlockSize]; // + double b1[3 * kDCTBlockSize]; + for (int c = 0; c < 3; ++c) { + for (int ix = 0; ix < kDCTBlockSize; ++ix) { + b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; + b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; + } } - return 0; + + double diff_xyz_dc[3] = { 0.0 }; + double diff_xyz_ac[3] = { 0.0 }; + double diff_xyz_edge_dc[3] = { 0.0 }; + ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); + + double diff = 0.0; + double diff_edge = 0.0; + + for (int c = 0; c < 3; ++c) { + diff += diff_xyz_dc[c] * mask_scale_block[c]; + diff += diff_xyz_ac[c] * mask_scale_block[c]; + diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c]; + } + const double kEdgeWeight = 0.05; + return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); } // strong todo __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/, __global coeff_t *block_list/*in*/, __global float *orig_image/*in*/, + __global float *mask_scale/*in*/, __global CoeffData *output_order_list/*out*/) { int block_idx = 
get_global_id(0); __global coeff_t *orig_block = orig_block_list + block_idx * kBlockSize; __global coeff_t *block = block_list + block_idx * kBlockSize; + __global float* orig_image_block = orig_image + block_idx * kBlockSize; DCTScoreData input_order_data[kBlockSize]; CoeffData output_order_data[kBlockSize]; @@ -1594,9 +1631,10 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ IntFloatPairList input_order = { kBlockSize, input_order_data }; IntFloatPairList output_order = { kBlockSize, output_order_data }; - coeff_t processed_block[kBlockSize]; - // memcpy(processed_block, block, sizeof(processed_block); + for (int i = 0; i < kBlockSize; i++) { + processed_block[i] = block[i]; + } while (input_order.size > 0) { @@ -1605,13 +1643,15 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ for (int i = 0; i < min(3, input_order.size); i++) { coeff_t candidate_block[kBlockSize]; - // memcpy(candidate_block, processed_block, sizeof(candidate_block); + for (int i = 0; i < kBlockSize; i++) { + candidate_block[i] = processed_block[i]; + } const int idx = input_order.pData[i].idx; candidate_block[idx] = 0; - float max_err = CompareBlockEx(candidate_block, 0, 0); + float max_err = CompareBlockEx(candidate_block, orig_image_block, mask_scale + block_idx * 3); if (max_err < best_err) { best_err = max_err; @@ -1626,22 +1666,25 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ list_push_back(&output_order, idx, best_err); } // ×¢Òâoutput_orderÕâÀïµÄresize¾ÍÊǰÑβ²¿µÄÖÃλ0 -/* - // TOBEREMOVE:×îÖÕÒÆ³ýerrÊý´óÓÚerrorÏÞÖÆµÄÏî·µ»Ø£¬²¢»¹Ô­¶Ô±ÈͼÏñµ½Ô­Ê¼Öµ¡£ - // Make the block error values monotonic. 
float min_err = 1e10; - for (int i = output_order->size() - 1; i >= 0; --i) { - min_err = std::min(min_err, (*output_order)[i].block_err); - (*output_order)[i].block_err = min_err; + for (int i = output_order.size - 1; i >= 0; --i) { + min_err = min(min_err, output_order.pData[i].err); + output_order.pData[i].err = min_err; } - // Cut off at the block error limit. - size_t num = 0; - while (num < output_order->size() && - (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) { - ++num; - } - output_order->resize(num); -*/ - // memcpy(output_data_list + block_idx * kBlockSize + __global CoeffData *output_block = output_order_list + block_idx * kBlockSize; + + for (int i = 0; i < kBlockSize; i++) + { + if (i > output_order.size) + { + output_block[i].idx = 0; + output_block[i].err = 0; + } + else + { + output_block[i].idx = output_order.pData[i].idx; + output_block[i].err = output_order.pData[i].err; + } + } } From 87b462ac378463f3a372bfb2917835ac190b495d Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 15 May 2017 16:47:07 +0800 Subject: [PATCH 072/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3n=E5=8D=A1=E7=BC=96?= =?UTF-8?q?=E8=AF=91=E5=85=BC=E5=AE=B9=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 7e8aabc6..b33e98ce 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1439,18 +1439,20 @@ typedef struct __IntFloatPairList // return size int list_push_back(IntFloatPairList* list, int i, float f) { - + return 0; } // chrisk todo // remove idx and return size int list_erase(IntFloatPairList* list, int idx) { + return 0; } // chrisk todo int SortInputOrder(DCTScoreData* input_order, int size) { + return 0; /* std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { @@ -1518,11 +1520,6 @@ 
void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, { } -void func(float *r, float *g, float *b, float *r_blurred, float *g_blurred, float *b_blurred) -{ - //BlurEx(r, g, b, r_blurred, g_blurred, -} - typedef union ocl_channels_t { struct @@ -1560,14 +1557,20 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, float rgb1[3][kDCTBlockSize]; { float rgb0_data[3*kDCTBlockSize]; - ocl_channels rgb0_c = { rgb0_data, &rgb0_data[kDCTBlockSize], &rgb0_data[2 * kDCTBlockSize] }; + ocl_channels rgb0_c; + rgb0_c.r = &rgb0_data[0]; + rgb0_c.g = &rgb0_data[kDCTBlockSize]; + rgb0_c.b = &rgb0_data[2 * kDCTBlockSize]; for (int i = 0; i < 3*kDCTBlockSize; i++) { rgb0_data[i] = orig_image_block[i]; } float image_block[3 * kDCTBlockSize]; - ocl_channels rgb1_c = { image_block, &image_block[kDCTBlockSize], &image_block[2 * kDCTBlockSize] }; + ocl_channels rgb1_c; + rgb1_c.r = &image_block[0]; + rgb1_c.g = &image_block[kDCTBlockSize]; + rgb1_c.b = &image_block[2 * kDCTBlockSize]; BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b); CalcOpsinDynamicsImage(rgb0_c); From 092557922bb8dcc865846f4c73ccc309d6d24e32 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Mon, 15 May 2017 19:55:49 +0800 Subject: [PATCH 073/189] Implement part of BlurEx --- clguetzli/clguetzli.cl | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index b33e98ce..f619b76b 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1495,11 +1495,57 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b) // ²Î¿¼clguetzli_comparator.cpp : BlockToImage } +void Convolution(__global float* multipliers, __global float* inp, __global float* result, + size_t xsize, size_t ysize, int xstep, int len, int offset, float border_ratio) +{ + float weight_no_border = 0; + + for (size_t j = 0; j <= 2 * offset; ++j) { + 
weight_no_border += multipliers[j]; + } + for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { + int minx = x < offset ? 0 : x - offset; + int maxx = min(xsize, x + len - offset) - 1; + float weight = 0.0; + for (int j = minx; j <= maxx; ++j) { + weight += multipliers[j - x + offset]; + } + // Interpolate linearly between the no-border scaling and border scaling. + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + for (size_t y = 0; y < ysize; ++y) { + float sum = 0.0; + for (int j = minx; j <= maxx; ++j) { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + result[ox * ysize + y] = (float)(sum * scale); + } + } +} + // ian todo // ¼ÆËã½á¹ûÊä³öµ½output void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) { // ²Î¿¼clBlurEx2µÄʵÏÖ£¬sigma = 1.1£¬Õâʱstep¡¢diff¶¼½«ÌØ»¯Îª¹Ì¶¨Öµ + const double sigma = 1.1; + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772 + const int diff = 2; // when sigma=1.1, diff's value is 2. + const int expn_size = 5; // when sigma=1.1, scaler is 5 + float expn[5] = { exp(scaler * (-diff) * (-diff)), + exp(scaler * (-diff + 1) * (-diff + 1)), + exp(scaler * (-diff + 2) * (-diff + 2)), + exp(scaler * (-diff + 3) * (-diff + 3)), + exp(scaler * (-diff + 4) * (-diff + 4))}; + const int xstep = 1; // when sigma=1.1, xstep is 1. 
+ /* + Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, + border_ratio, + tmp.data()); + Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + border_ratio, output); + */ } From b3455dd3a735614d8667cf66fc8a7b338a7aa8b3 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 16 May 2017 02:53:50 +0800 Subject: [PATCH 074/189] Merge remote-tracking branch 'origin/master' --- clguetzli/clguetzli.cl | 517 +++++++++++++++++++++++++++++-- clguetzli/clguetzli.cpp | 52 ++++ clguetzli/clguetzli.h | 8 + clguetzli/clguetzli_comparator.h | 2 +- clguetzli/ocl.h | 1 + guetzli/processor.cc | 59 +++- 6 files changed, 606 insertions(+), 33 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index f619b76b..6e354874 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1439,20 +1439,41 @@ typedef struct __IntFloatPairList // return size int list_push_back(IntFloatPairList* list, int i, float f) { - return 0; + list->pData[list->size].idx = i; + list->pData[list->size].err = f; + return ++list->size; } // chrisk todo // remove idx and return size int list_erase(IntFloatPairList* list, int idx) { - return 0; + for (int i = idx; i < list->size - 1; i++) + { + list->pData[i].idx = list->pData[i + 1].idx; + list->pData[i].err = list->pData[i + 1].err; + } + return --list->size; } // chrisk todo int SortInputOrder(DCTScoreData* input_order, int size) { - return 0; + int i, j; + DCTScoreData tmp; + for (j = 1; j < size; j++) { + tmp.idx = input_order[j].idx; + tmp.err = input_order[j].err; + i = j - 1; + while (i >= 0 && input_order[i].err > tmp.err) { + input_order[i + 1].idx = input_order[i].idx; + input_order[i + 1].err = input_order[i].err; + i--; + } + input_order[i + 1].idx = tmp.idx; + input_order[i + 1].err = tmp.err; + } + return size; /* std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { @@ -1460,10 +1481,412 @@ int SortInputOrder(DCTScoreData* input_order, 
int size) */ } +__constant static float csf[192] = { + 0.0f, + 1.71014f, + 0.298711f, + 0.233709f, + 0.223126f, + 0.207072f, + 0.192775f, + 0.161201f, + 2.05807f, + 0.222927f, + 0.203406f, + 0.188465f, + 0.184668f, + 0.169993f, + 0.159142f, + 0.130155f, + 0.430518f, + 0.204939f, + 0.206655f, + 0.192231f, + 0.182941f, + 0.169455f, + 0.157599f, + 0.127153f, + 0.234757f, + 0.191098f, + 0.192698f, + 0.17425f, + 0.166503f, + 0.142154f, + 0.126182f, + 0.104196f, + 0.226117f, + 0.185373f, + 0.183825f, + 0.166643f, + 0.159414f, + 0.12636f, + 0.108696f, + 0.0911974f, + 0.207463f, + 0.171517f, + 0.170124f, + 0.141582f, + 0.126213f, + 0.103627f, + 0.0882436f, + 0.0751848f, + 0.196436f, + 0.161947f, + 0.159271f, + 0.126938f, + 0.109125f, + 0.0878027f, + 0.0749842f, + 0.0633859f, + 0.165232f, + 0.132905f, + 0.128679f, + 0.105766f, + 0.0906087f, + 0.0751544f, + 0.0641187f, + 0.0529921f, + 0.0f, + 0.147235f, + 0.11264f, + 0.0757892f, + 0.0493929f, + 0.0280663f, + 0.0075012f, + -0.000945567f, + 0.149251f, + 0.0964806f, + 0.0786224f, + 0.05206f, + 0.0292758f, + 0.00353094f, + -0.00277912f, + -0.00404481f, + 0.115551f, + 0.0793142f, + 0.0623735f, + 0.0405019f, + 0.0152656f, + -0.00145742f, + -0.00370369f, + -0.00375106f, + 0.0791547f, + 0.0537506f, + 0.0413634f, + 0.0193486f, + 0.000609066f, + -0.00510923f, + -0.0046452f, + -0.00385187f, + 0.0544534f, + 0.0334066f, + 0.0153899f, + 0.000539088f, + -0.00356085f, + -0.00535661f, + -0.00429145f, + -0.00343131f, + 0.0356439f, + 0.00865645f, + 0.00165229f, + -0.00425931f, + -0.00507324f, + -0.00459083f, + -0.003703f, + -0.00310327f, + 0.0121926f, + -0.0009259f, + -0.00330991f, + -0.00499378f, + -0.00437381f, + -0.00377427f, + -0.00311731f, + -0.00255125f, + -0.000320593f, + -0.00426043f, + -0.00416549f, + -0.00419364f, + -0.00365418f, + -0.00317499f, + -0.00255932f, + -0.00217917f, + 0.0f, + 0.143471f, + 0.124336f, + 0.0947465f, + 0.0814066f, + 0.0686776f, + 0.0588122f, + 0.0374415f, + 0.146315f, + 0.105334f, + 0.0949415f, + 0.0784241f, + 
0.0689064f, + 0.0588304f, + 0.0495961f, + 0.0202342f, + 0.123818f, + 0.0952654f, + 0.0860556f, + 0.0724158f, + 0.0628307f, + 0.0529965f, + 0.0353941f, + 0.00815821f, + 0.097054f, + 0.080422f, + 0.0731085f, + 0.0636154f, + 0.055606f, + 0.0384127f, + 0.0142879f, + 0.00105195f, + 0.0849312f, + 0.071115f, + 0.0631183f, + 0.0552972f, + 0.0369221f, + 0.00798314f, + 0.000716374f, + -0.00200948f, + 0.0722298f, + 0.0599559f, + 0.054841f, + 0.0387529f, + 0.0107262f, + 0.000355315f, + -0.00244803f, + -0.00335222f, + 0.0635335f, + 0.0514196f, + 0.0406309f, + 0.0125833f, + 0.00151305f, + -0.00140269f, + -0.00362547f, + -0.00337649f, + 0.0472024f, + 0.0198725f, + 0.0113437f, + 0.00266305f, + -0.00137183f, + -0.00354158f, + -0.00341292f, + -0.00290074f +}; + +__constant static float bias[192] = { + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 
0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0 +}; + // chrisk todo // return the count of Non-zero item -int MakeInputOrder(__global coeff_t *orig_block, DCTScoreData *input_order, int size) +int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int size) { + /* ÓÐһЩÎÊÌ⣬ÏÈ×¢Ê͵ô + for (int c = 0; c < 3; ++c) { + if (!(comp_mask & (1 << c))) continue; + for (int k = 1; k < size; ++k) { + int idx = c * size + k; + if (block[idx] != 0) { + float score = abs(orig_block[idx]) * csf[idx] + bias[idx]; + list_push_back(input_order, idx, score); + } + } + } + */ /* static const double kWeight[3] = { 1.0, 0.22, 0.20 }; #include "guetzli/order.inc" @@ -1495,8 +1918,12 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b) // ²Î¿¼clguetzli_comparator.cpp : BlockToImage } -void Convolution(__global float* multipliers, __global float* inp, __global float* result, - size_t xsize, size_t ysize, int xstep, int len, int offset, float border_ratio) +void Convolution(size_t xsize, size_t ysize, + int xstep, int len, int offset, + float* multipliers, + float* inp, + float border_ratio, + float* result) { float weight_no_border = 0; @@ -1539,13 +1966,15 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, exp(scaler * (-diff + 3) * (-diff + 3)), exp(scaler * (-diff + 4) * (-diff + 4))}; const int xstep = 1; // when sigma=1.1, xstep is 1. 
- /* - Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, - border_ratio, - tmp.data()); - Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + const int ystep = xstep; + + int dxsize = (xsize + xstep - 1) / xstep; + int dysize = (ysize + ystep - 1) / ystep; + + float *tmp = 0; // TODO:need a tmp and + Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp); + Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp, border_ratio, output); - */ } @@ -1554,7 +1983,29 @@ void OpsinDynamicsImageBlock(float *r, float *g, float *b, float *r_blurred, float *g_blurred, float *b_blurred, int size) { - + for (size_t i = 0; i < size; ++i) { + double sensitivity[3]; + { + // Calculate sensitivity[3] based on the smoothed image gamma derivative. + double pre_rgb[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre_rgb, pre_mixed); + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + } + double cur_rgb[3] = { r[i], g[i], b[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + r[i] = (float)(x); + g[i] = (float)(y); + b[i] = (float)(z); + } } // chrisk todo @@ -1597,6 +2048,9 @@ void CalcOpsinDynamicsImage(ocl_channels rgb) } // strong todo +// candidate_block [R....R][G....G][B....B] +// orig_image_block [RR..RRGG..GGBB..BB] +// mask_scale[RGB] float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, __global float* mask_scale_block) { float rgb0[3][kDCTBlockSize]; @@ -1661,6 +2115,12 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, } // strong todo +// orig_block_list [R....R][G....G][B....B] +// 
block_list [R....R][G....G][B....B] +// orig_image [RR..RRGG..GGBB..BB] +// mask_scale[RGB] +// output_orlder_list [3 * kBlockSize] + __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/, __global coeff_t *block_list/*in*/, __global float *orig_image/*in*/, @@ -1668,20 +2128,21 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ __global CoeffData *output_order_list/*out*/) { int block_idx = get_global_id(0); +#define kComputeBlockSize (kBlockSize * 3) - __global coeff_t *orig_block = orig_block_list + block_idx * kBlockSize; - __global coeff_t *block = block_list + block_idx * kBlockSize; - __global float* orig_image_block = orig_image + block_idx * kBlockSize; + __global coeff_t *orig_block = orig_block_list + block_idx * kComputeBlockSize; + __global coeff_t *block = block_list + block_idx * kComputeBlockSize; + __global float* orig_image_block = orig_image + block_idx * kComputeBlockSize; - DCTScoreData input_order_data[kBlockSize]; - CoeffData output_order_data[kBlockSize]; + DCTScoreData input_order_data[kComputeBlockSize]; + CoeffData output_order_data[kComputeBlockSize]; - MakeInputOrder(orig_block, input_order_data, kBlockSize); - IntFloatPairList input_order = { kBlockSize, input_order_data }; - IntFloatPairList output_order = { kBlockSize, output_order_data }; + int count = MakeInputOrder(block, orig_block, input_order_data, kComputeBlockSize); + IntFloatPairList input_order = { count, input_order_data }; + IntFloatPairList output_order = { 0, output_order_data }; - coeff_t processed_block[kBlockSize]; - for (int i = 0; i < kBlockSize; i++) { + coeff_t processed_block[kComputeBlockSize]; + for (int i = 0; i < kComputeBlockSize; i++) { processed_block[i] = block[i]; } @@ -1691,8 +2152,8 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ int best_i = 0; for (int i = 0; i < min(3, input_order.size); i++) { - coeff_t candidate_block[kBlockSize]; - for (int i = 0; i < 
kBlockSize; i++) { + coeff_t candidate_block[kComputeBlockSize]; + for (int i = 0; i < kComputeBlockSize; i++) { candidate_block[i] = processed_block[i]; } @@ -1721,11 +2182,11 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ output_order.pData[i].err = min_err; } - __global CoeffData *output_block = output_order_list + block_idx * kBlockSize; + __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; - for (int i = 0; i < kBlockSize; i++) + for (int i = 0; i < kComputeBlockSize; i++) { - if (i > output_order.size) + if (i >= output_order.size) { output_block[i].idx = 0; output_block[i].err = 0; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 32d0d77b..b8c8ad2a 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -63,6 +63,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err); ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err); ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err); return ocl; } @@ -1192,3 +1193,54 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clReleaseMemObject(mem_result); } + +void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list, + float *orig_iamge, float* mask_scale, CoeffData *output_order_list, + int size) +{ + using namespace guetzli; + + int item_count = 3 * kDCTBlockSize * size; + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + + cl_mem mem_orig_block_list = ocl.allocMem(sizeof(coeff_t) * item_count); + cl_mem mem_block_list = ocl.allocMem(sizeof(coeff_t) * item_count); + cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * item_count); + cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * 
size); + cl_mem mem_output_order_list = ocl.allocMem(sizeof(CoeffData) * item_count); + + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_block_list); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_block_list); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_image); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mask_scale); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_output_order_list); + + size_t globalWorkSize[1] = { size }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); + } + + CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_list, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + memcpy(output_order_list, result, sizeof(CoeffData) * item_count); + + clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_list, result, sizeof(CoeffData) * item_count, NULL, NULL); + clFinish(ocl.commandQueue); + + clReleaseMemObject(mem_orig_block_list); + clReleaseMemObject(mem_block_list); + clReleaseMemObject(mem_orig_image); + clReleaseMemObject(mem_mask_scale); + clReleaseMemObject(mem_output_order_list); + +} \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index aa595ab5..178dce27 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,10 +1,16 @@ #pragma once #include "CL\cl.h" +#include "guetzli\jpeg_data.h" #include "ocl.h" extern bool g_useOpenCL; extern bool g_checkOpenCL; +struct CoeffData { + int idx; + float block_err; +}; + void 
clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, @@ -13,6 +19,8 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t step, float* result); +void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list, float *orig_iamge, float* mask_scale, CoeffData *output_order_list, int size); + void clMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h index 353eff59..97f23fb9 100644 --- a/clguetzli/clguetzli_comparator.h +++ b/clguetzli/clguetzli_comparator.h @@ -18,7 +18,7 @@ namespace guetzli { double CompareBlockEx(coeff_t* candidate_block); private: int getCurrentBlockIdx(void); - protected: + public: std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount }; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index b9ada586..ae5ceeeb 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -63,6 +63,7 @@ enum KernelName { KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, KERNEL_EDGEDETECTORLOWFREQ, + KERNEL_COMPUTEBLOCKZERONGORDER, KERNEL_COUNT, }; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index c4d43bef..eaf9f75b 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -23,6 +23,7 @@ #include "guetzli/butteraugli_comparator.h" #include "clguetzli\clguetzli_comparator.h" +#include "clguetzli\clguetzli.h" #include "guetzli/comparator.h" #include "guetzli/debug_print.h" #include "guetzli/fast_log.h" @@ -38,11 +39,12 @@ namespace guetzli { namespace { static const size_t kBlockSize = 3 * kDCTBlockSize; - +/* struct CoeffData { int idx; float block_err; }; +*/ struct QuantData { int q[3][kDCTBlockSize]; size_t jpg_size; @@ -601,10 +603,59 @@ 
void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } + std::vector output_order(num_blocks * kBlockSize); + ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; + + if (g_useOpenCL) + { + clComputeBlockZeroingOrder(orig_block_batch.data(), + block_batch.data(), + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + output_order.data(), + num_blocks); + } + else + { + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; + coeff_t *block = &block_batch[block_ix * kBlockSize]; + + std::vector block_order; + + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order); + + CoeffData * p = &output_order[block_ix * kBlockSize]; + for (int i = 0; i < block_order.size(); i++) + { + p[i].idx = block_order[i].idx; + p[i].block_err = block_order[i].block_err; + } + } + } + } + std::vector candidate_coeff_offsets(num_blocks + 1); std::vector candidate_coeffs; std::vector candidate_coeff_errors; + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + CoeffData * p = &output_order[block_ix * kBlockSize]; + + candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); + for (int i = 0; i < kBlockSize; i++) + { + if (p[i].block_err > 0 && p[i].block_err <= comparator_->BlockErrorLimit()) + { + candidate_coeffs.push_back(p[i].idx); + candidate_coeff_errors.push_back(p[i].block_err); + } + } + } + } +/* // step 2 ¶Ô±Èÿ¸öblock½á¹û for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { @@ -623,7 +674,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } } - +*/ // comparator_->FinishBlockComparisons(); // 
TOBEREMOVE:Çå³ý²ÎÊý candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); @@ -666,9 +717,9 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const } std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); - if (input_order.size() > 10) + if (input_order.size() > 64) { - int i = 0; + g_compareBlock++; } coeff_t processed_block[kBlockSize]; memcpy(processed_block, block, sizeof(processed_block)); From 5e53802a76a1dcd6d4120aa764184c4bdac9f1c3 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Tue, 16 May 2017 09:44:34 +0800 Subject: [PATCH 075/189] Fix BlurEx --- clguetzli/clguetzli.cl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 6e354874..f64f9331 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1966,12 +1966,11 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, exp(scaler * (-diff + 3) * (-diff + 3)), exp(scaler * (-diff + 4) * (-diff + 4))}; const int xstep = 1; // when sigma=1.1, xstep is 1. 
- const int ystep = xstep; + const int ystep = xstep; - int dxsize = (xsize + xstep - 1) / xstep; - int dysize = (ysize + ystep - 1) / ystep; + int dxsize = (xsize + xstep - 1) / xstep; - float *tmp = 0; // TODO:need a tmp and + float tmp[8*8] = { 0 }; Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp); Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp, border_ratio, output); From e69365c400b8acac05f1fb3382419d48b57a81d0 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 16 May 2017 09:48:50 +0800 Subject: [PATCH 076/189] fix data type of coeff_t --- clguetzli/clguetzli.cl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index f64f9331..8f5601a5 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1427,7 +1427,7 @@ typedef struct __IntFloatPair float err; }IntFloatPair, DCTScoreData, CoeffData; -typedef int16 coeff_t; +typedef short coeff_t; typedef struct __IntFloatPairList { @@ -1875,7 +1875,7 @@ __constant static float bias[192] = { // return the count of Non-zero item int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int size) { - /* ÓÐһЩÎÊÌ⣬ÏÈ×¢Ê͵ô + int comp_mask = 7; for (int c = 0; c < 3; ++c) { if (!(comp_mask & (1 << c))) continue; for (int k = 1; k < size; ++k) { @@ -1886,7 +1886,6 @@ int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTSco } } } - */ /* static const double kWeight[3] = { 1.0, 0.22, 0.20 }; #include "guetzli/order.inc" From 8d82c8e640261e5d90e7f5db63153b9116bdca9b Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 16 May 2017 10:03:02 +0800 Subject: [PATCH 077/189] modify MakeInputOrder --- clguetzli/clguetzli.cl | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 8f5601a5..6271b6bd 100644 --- a/clguetzli/clguetzli.cl +++ 
b/clguetzli/clguetzli.cl @@ -1474,11 +1474,6 @@ int SortInputOrder(DCTScoreData* input_order, int size) input_order[i + 1].err = tmp.err; } return size; -/* - std::sort(input_order.begin(), input_order.end(), - [](const std::pair& a, const std::pair& b) { - return a.second < b.second; }); -*/ } __constant static float csf[192] = { @@ -1873,41 +1868,18 @@ __constant static float bias[192] = { // chrisk todo // return the count of Non-zero item -int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int size) +int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int block_size) { - int comp_mask = 7; + int size = 0; for (int c = 0; c < 3; ++c) { - if (!(comp_mask & (1 << c))) continue; - for (int k = 1; k < size; ++k) { - int idx = c * size + k; + for (int k = 1; k < block_size; ++k) { + int idx = c * block_size + k; if (block[idx] != 0) { float score = abs(orig_block[idx]) * csf[idx] + bias[idx]; - list_push_back(input_order, idx, score); + size = list_push_back(input_order, idx, score); } } } -/* - static const double kWeight[3] = { 1.0, 0.22, 0.20 }; -#include "guetzli/order.inc" - std::vector > input_order; - for (int c = 0; c < 3; ++c) { - if (!(comp_mask & (1 << c))) continue; - for (int k = 1; k < kDCTBlockSize; ++k) { - int idx = c * kDCTBlockSize + k; - if (block[idx] != 0) { - float score; - if (params_.new_zeroing_model) { - score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; - } - else { - score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * - kWeight[c] / oldCsf[k]); - } - input_order.push_back(std::make_pair(idx, score)); - } - } - } -*/ return SortInputOrder(input_order, size); } @@ -2135,7 +2107,7 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ DCTScoreData input_order_data[kComputeBlockSize]; CoeffData output_order_data[kComputeBlockSize]; - int count = MakeInputOrder(block, orig_block, 
input_order_data, kComputeBlockSize); + int count = MakeInputOrder(block, orig_block, input_order_data, kBlockSize); IntFloatPairList input_order = { count, input_order_data }; IntFloatPairList output_order = { 0, output_order_data }; From fecac92d6e1db4d2cf7116bd1f52951263d5ead8 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 16 May 2017 10:42:19 +0800 Subject: [PATCH 078/189] Add BlockToImage --- clguetzli/clguetzli.cl | 416 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 414 insertions(+), 2 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 6271b6bd..3a242fdb 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1883,10 +1883,422 @@ int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTSco return SortInputOrder(input_order, size); } +__constant static int kIDCTMatrix[kDCTBlockSize] = { + 8192, 11363, 10703, 9633, 8192, 6437, 4433, 2260, + 8192, 9633, 4433, -2259, -8192, -11362, -10704, -6436, + 8192, 6437, -4433, -11362, -8192, 2261, 10704, 9633, + 8192, 2260, -10703, -6436, 8192, 9633, -4433, -11363, + 8192, -2260, -10703, 6436, 8192, -9633, -4433, 11363, + 8192, -6437, -4433, 11362, -8192, -2261, 10704, -9633, + 8192, -9633, 4433, 2259, -8192, 11362, -10704, 6436, + 8192, -11363, 10703, -9633, 8192, -6437, 4433, -2260, +}; + +// Computes out[x] = sum{kIDCTMatrix[8*x+u]*in[u*stride]; for u in [0..7]} +void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { + int tmp0, tmp1, tmp2, tmp3, tmp4; + + tmp1 = kIDCTMatrix[0] * in[0]; + out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = tmp1; + + tmp0 = in[stride]; + tmp1 = kIDCTMatrix[1] * tmp0; + tmp2 = kIDCTMatrix[9] * tmp0; + tmp3 = kIDCTMatrix[17] * tmp0; + tmp4 = kIDCTMatrix[25] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[2 * stride]; + tmp1 = kIDCTMatrix[2] * tmp0; + tmp2 = 
kIDCTMatrix[10] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] -= tmp2; + out[3] -= tmp1; + out[4] -= tmp1; + out[5] -= tmp2; + out[6] += tmp2; + out[7] += tmp1; + + tmp0 = in[3 * stride]; + tmp1 = kIDCTMatrix[3] * tmp0; + tmp2 = kIDCTMatrix[11] * tmp0; + tmp3 = kIDCTMatrix[19] * tmp0; + tmp4 = kIDCTMatrix[27] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[4 * stride]; + tmp1 = kIDCTMatrix[4] * tmp0; + out[0] += tmp1; + out[1] -= tmp1; + out[2] -= tmp1; + out[3] += tmp1; + out[4] += tmp1; + out[5] -= tmp1; + out[6] -= tmp1; + out[7] += tmp1; + + tmp0 = in[5 * stride]; + tmp1 = kIDCTMatrix[5] * tmp0; + tmp2 = kIDCTMatrix[13] * tmp0; + tmp3 = kIDCTMatrix[21] * tmp0; + tmp4 = kIDCTMatrix[29] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[6 * stride]; + tmp1 = kIDCTMatrix[6] * tmp0; + tmp2 = kIDCTMatrix[14] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] -= tmp2; + out[3] -= tmp1; + out[4] -= tmp1; + out[5] -= tmp2; + out[6] += tmp2; + out[7] += tmp1; + + tmp0 = in[7 * stride]; + tmp1 = kIDCTMatrix[7] * tmp0; + tmp2 = kIDCTMatrix[15] * tmp0; + tmp3 = kIDCTMatrix[23] * tmp0; + tmp4 = kIDCTMatrix[31] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; +} + +void CoeffToIDCT(coeff_t *block, uchar * out) +{ + coeff_t colidcts[kDCTBlockSize]; + const int kColScale = 11; + const int kColRound = 1 << (kColScale - 1); + for (int x = 0; x < 8; ++x) + { + int colbuf[8] = { 0 }; + Compute1dIDCT(&block[x], 8, colbuf); + for (int y = 0; y < 8; ++y) + { + colidcts[8 * y + x] = (colbuf[y] + kColRound) >> kColScale; + } + } + const int kRowScale = 18; + const int kRowRound = 257 << (kRowScale - 1); // includes offset by 128 + for (int y = 0; 
y < 8; ++y) + { + const int rowidx = 8 * y; + int rowbuf[8] = { 0 }; + Compute1dIDCT(&colidcts[rowidx], 1, rowbuf); + for (int x = 0; x < 8; ++x) { + out[rowidx + x] = max(0, min(255, (rowbuf[x] + kRowRound) >> kRowScale)); + } + } +} + +void IDCTToImage(uchar *idct, ushort *pixels_) +{ + const int block_x = 0; + const int block_y = 0; + const int width_ = 8; + const int height_ = 8; + + for (int iy = 0; iy < 8; ++iy) + { + for (int ix = 0; ix < 8; ++ix) + { + int x = 8 * block_x + ix; + int y = 8 * block_y + iy; + if (x >= width_ || y >= height_) continue; + int p = y * width_ + x; + pixels_[p] = idct[8 * iy + ix] << 4; + } + } +} + +void ImageToYUV(ushort *pixels_, uchar *out) +{ + const int stride = 3; + + for (int y = 0; y < 8; ++y) + { + for (int x = 0; x < 8; ++x) + { + int px = y * 8 + x; + *out = (uchar) ((pixels_[px] + 8 - (x & 1)) >> 4); + out += stride; + } + } +} + +__constant static int kCrToRedTable[256] = { + -179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164, + -163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147, + -146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130, + -129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114, + -112, -111, -109, -108, -107, -105, -104, -102, -101, -100, -98, -97, + -95, -94, -93, -91, -90, -88, -87, -86, -84, -83, -81, -80, + -79, -77, -76, -74, -73, -72, -70, -69, -67, -66, -64, -63, + -62, -60, -59, -57, -56, -55, -53, -52, -50, -49, -48, -46, + -45, -43, -42, -41, -39, -38, -36, -35, -34, -32, -31, -29, + -28, -27, -25, -24, -22, -21, -20, -18, -17, -15, -14, -13, + -11, -10, -8, -7, -6, -4, -3, -1, 0, 1, 3, 4, + 6, 7, 8, 10, 11, 13, 14, 15, 17, 18, 20, 21, + 22, 24, 25, 27, 28, 29, 31, 32, 34, 35, 36, 38, + 39, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 55, + 56, 57, 59, 60, 62, 63, 64, 66, 67, 69, 70, 72, + 73, 74, 76, 77, 79, 80, 81, 83, 84, 86, 87, 88, + 90, 91, 93, 94, 95, 97, 98, 100, 101, 102, 104, 105, + 107, 108, 109, 111, 112, 114, 
115, 116, 118, 119, 121, 122, + 123, 125, 126, 128, 129, 130, 132, 133, 135, 136, 137, 139, + 140, 142, 143, 144, 146, 147, 149, 150, 151, 153, 154, 156, + 157, 158, 160, 161, 163, 164, 165, 167, 168, 170, 171, 172, + 174, 175, 177, 178 +}; + +__constant static int kCbToBlueTable[256] = { + -227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207, + -206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186, + -184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165, + -163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144, + -142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122, + -120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101, + -99, -97, -96, -94, -92, -90, -89, -87, -85, -83, -82, -80, + -78, -76, -74, -73, -71, -69, -67, -66, -64, -62, -60, -58, + -57, -55, -53, -51, -50, -48, -46, -44, -43, -41, -39, -37, + -35, -34, -32, -30, -28, -27, -25, -23, -21, -19, -18, -16, + -14, -12, -11, -9, -7, -5, -4, -2, 0, 2, 4, 5, + 7, 9, 11, 12, 14, 16, 18, 19, 21, 23, 25, 27, + 28, 30, 32, 34, 35, 37, 39, 41, 43, 44, 46, 48, + 50, 51, 53, 55, 57, 58, 60, 62, 64, 66, 67, 69, + 71, 73, 74, 76, 78, 80, 82, 83, 85, 87, 89, 90, + 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, + 113, 115, 117, 119, 120, 122, 124, 126, 128, 129, 131, 133, + 135, 136, 138, 140, 142, 144, 145, 147, 149, 151, 152, 154, + 156, 158, 159, 161, 163, 165, 167, 168, 170, 172, 174, 175, + 177, 179, 181, 183, 184, 186, 188, 190, 191, 193, 195, 197, + 198, 200, 202, 204, 206, 207, 209, 211, 213, 214, 216, 218, + 220, 222, 223, 225, +}; + +__constant static int kCrToGreenTable[256] = { + 5990656, 5943854, 5897052, 5850250, 5803448, 5756646, 5709844, 5663042, + 5616240, 5569438, 5522636, 5475834, 5429032, 5382230, 5335428, 5288626, + 5241824, 5195022, 5148220, 5101418, 5054616, 5007814, 4961012, 4914210, + 4867408, 4820606, 4773804, 4727002, 4680200, 4633398, 4586596, 4539794, + 4492992, 4446190, 4399388, 
4352586, 4305784, 4258982, 4212180, 4165378, + 4118576, 4071774, 4024972, 3978170, 3931368, 3884566, 3837764, 3790962, + 3744160, 3697358, 3650556, 3603754, 3556952, 3510150, 3463348, 3416546, + 3369744, 3322942, 3276140, 3229338, 3182536, 3135734, 3088932, 3042130, + 2995328, 2948526, 2901724, 2854922, 2808120, 2761318, 2714516, 2667714, + 2620912, 2574110, 2527308, 2480506, 2433704, 2386902, 2340100, 2293298, + 2246496, 2199694, 2152892, 2106090, 2059288, 2012486, 1965684, 1918882, + 1872080, 1825278, 1778476, 1731674, 1684872, 1638070, 1591268, 1544466, + 1497664, 1450862, 1404060, 1357258, 1310456, 1263654, 1216852, 1170050, + 1123248, 1076446, 1029644, 982842, 936040, 889238, 842436, 795634, + 748832, 702030, 655228, 608426, 561624, 514822, 468020, 421218, + 374416, 327614, 280812, 234010, 187208, 140406, 93604, 46802, + 0, -46802, -93604, -140406, -187208, -234010, -280812, -327614, + -374416, -421218, -468020, -514822, -561624, -608426, -655228, -702030, + -748832, -795634, -842436, -889238, -936040, -982842, -1029644, -1076446, + -1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862, + -1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278, + -1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694, + -2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110, + -2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526, + -2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942, + -3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358, + -3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774, + -4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190, + -4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606, + -4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022, + -5241824, -5288626, -5335428, -5382230, 
-5429032, -5475834, -5522636, -5569438, + -5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854, +}; + +__constant static int kCbToGreenTable[256] = { + 2919680, 2897126, 2874572, 2852018, 2829464, 2806910, 2784356, 2761802, + 2739248, 2716694, 2694140, 2671586, 2649032, 2626478, 2603924, 2581370, + 2558816, 2536262, 2513708, 2491154, 2468600, 2446046, 2423492, 2400938, + 2378384, 2355830, 2333276, 2310722, 2288168, 2265614, 2243060, 2220506, + 2197952, 2175398, 2152844, 2130290, 2107736, 2085182, 2062628, 2040074, + 2017520, 1994966, 1972412, 1949858, 1927304, 1904750, 1882196, 1859642, + 1837088, 1814534, 1791980, 1769426, 1746872, 1724318, 1701764, 1679210, + 1656656, 1634102, 1611548, 1588994, 1566440, 1543886, 1521332, 1498778, + 1476224, 1453670, 1431116, 1408562, 1386008, 1363454, 1340900, 1318346, + 1295792, 1273238, 1250684, 1228130, 1205576, 1183022, 1160468, 1137914, + 1115360, 1092806, 1070252, 1047698, 1025144, 1002590, 980036, 957482, + 934928, 912374, 889820, 867266, 844712, 822158, 799604, 777050, + 754496, 731942, 709388, 686834, 664280, 641726, 619172, 596618, + 574064, 551510, 528956, 506402, 483848, 461294, 438740, 416186, + 393632, 371078, 348524, 325970, 303416, 280862, 258308, 235754, + 213200, 190646, 168092, 145538, 122984, 100430, 77876, 55322, + 32768, 10214, -12340, -34894, -57448, -80002, -102556, -125110, + -147664, -170218, -192772, -215326, -237880, -260434, -282988, -305542, + -328096, -350650, -373204, -395758, -418312, -440866, -463420, -485974, + -508528, -531082, -553636, -576190, -598744, -621298, -643852, -666406, + -688960, -711514, -734068, -756622, -779176, -801730, -824284, -846838, + -869392, -891946, -914500, -937054, -959608, -982162, -1004716, -1027270, + -1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702, + -1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134, + -1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566, 
+ -1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998, + -1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430, + -1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862, + -2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294, + -2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726, + -2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158, + -2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590, +}; + +__constant static uchar kRangeLimitLut[4 * 256] = {}; + +void YUVToRGB(uchar *pixelBlock) +{ + __constant uchar* kRangeLimit = kRangeLimitLut + 384; + for (int i = 0; i < 64; i++) + { + uchar *pixel = &pixelBlock[i * 3]; + + int y = pixel[0]; + int cb = pixel[1]; + int cr = pixel[2]; + pixel[0] = kRangeLimit[y + kCrToRedTable[cr]]; + pixel[1] = kRangeLimit[y + ((kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16)]; + pixel[2] = kRangeLimit[y + kCbToBlueTable[cb]]; + } +} + // chrisk todo -void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b) +void BlockToImage(coeff_t *block, float *r, float *g, float *b) { - // ²Î¿¼clguetzli_comparator.cpp : BlockToImage + uchar idct[8 * 8 * 3]; + CoeffToIDCT(&block[0], &idct[0]); + CoeffToIDCT(&block[8 * 8], &idct[8 * 8]); + CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]); + + ushort pixels[8 * 8 * 3]; + + IDCTToImage(&idct[0], &pixels[0]); + IDCTToImage(&idct[8 * 8], &pixels[8 * 8]); + IDCTToImage(&idct[8 * 8 * 2], &pixels[8 * 8 * 2]); + + uchar yuv[8 * 8 * 3]; + + ImageToYUV(&pixels[0], &yuv[0]); + ImageToYUV(&pixels[8 * 8], &yuv[1]); + ImageToYUV(&pixels[8 * 8 * 2], &yuv[2]); + + YUVToRGB(yuv); + + // Srgb8ToLinearTable begin + double lut[256]; + int i = 0; + for (; i < 11; ++i) + { + lut[i] = i / 12.92; + } + for (; i < 256; ++i) + { + lut[i] = 255.0 * pow(((i / 255.0) + 0.055) / 1.055, 2.4); + } + // Srgb8ToLinearTable end + + for (int 
i = 0; i < 8 * 8; i++) + { + r[i] = lut[yuv[3 * i]]; + g[i] = lut[yuv[3 * i + 1]]; + b[i] = lut[yuv[3 * i + 2]]; + } } void Convolution(size_t xsize, size_t ysize, From e2b38304ef7ac89d9d57b6f6d6a571b40a983c38 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 16 May 2017 11:43:22 +0800 Subject: [PATCH 079/189] Add MaskHighIntensityChangeBlock --- clguetzli/clguetzli.cl | 51 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 3a242fdb..810e00eb 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -2397,6 +2397,57 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, float *c1_x, float *c1_y, float *c1_b, int xsize, int ysize) { + for (int x = 0; x < xsize; ++x) + { + for (int y = 0; y < ysize; ++y) + { + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5, + (c0_y[ix] + c1_y[ix]) * 0.5, + (c0_b[ix] + c1_b[ix]) * 0.5, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) + { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) + { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. 
+ xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); + } + } } typedef union ocl_channels_t From b76def649158e1334e5fdf32fb41e413ac628dd9 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 16 May 2017 16:06:11 +0800 Subject: [PATCH 080/189] =?UTF-8?q?SelectFrequencyMaskingBatch=20=E8=AE=A1?= =?UTF-8?q?=E7=AE=97=E6=B5=81=E7=A8=8B=E4=BF=AE=E6=AD=A3=EF=BC=8C=E7=BB=88?= =?UTF-8?q?=E4=BA=8E=E5=8F=AF=E4=BB=A5=E6=AD=A3=E5=B8=B8=E8=B7=91=E8=B5=B7?= =?UTF-8?q?=E6=9D=A5=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 26 ++++++------ clguetzli/clguetzli.cpp | 20 +++++---- clguetzli/clguetzli.h | 2 +- clguetzli/clguetzli_comparator.cpp | 2 +- clguetzli/ocl.cpp | 16 ++++++- clguetzli/ocl.h | 2 +- guetzli/butteraugli_comparator.cc | 6 --- guetzli/butteraugli_comparator.h | 3 -- guetzli/guetzli.cc | 4 -- guetzli/processor.cc | 67 ++++++++++++++++-------------- 10 files changed, 78 insertions(+), 70 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 810e00eb..e4565a90 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1868,7 +1868,7 @@ __constant static float bias[192] = { // chrisk todo // return the count of Non-zero item -int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int block_size) +int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, IntFloatPairList *input_order, int block_size) { int size = 0; for (int c = 0; c < 3; ++c) { @@ -1880,7 +1880,7 @@ int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTSco } 
} } - return SortInputOrder(input_order, size); + return SortInputOrder(input_order->pData, size); } __constant static int kIDCTMatrix[kDCTBlockSize] = { @@ -2558,6 +2558,7 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ __global coeff_t *block_list/*in*/, __global float *orig_image/*in*/, __global float *mask_scale/*in*/, + float BlockErrorLimit, __global CoeffData *output_order_list/*out*/) { int block_idx = get_global_id(0); @@ -2570,10 +2571,11 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ DCTScoreData input_order_data[kComputeBlockSize]; CoeffData output_order_data[kComputeBlockSize]; - int count = MakeInputOrder(block, orig_block, input_order_data, kBlockSize); - IntFloatPairList input_order = { count, input_order_data }; + IntFloatPairList input_order = { 0, input_order_data }; IntFloatPairList output_order = { 0, output_order_data }; + int count = MakeInputOrder(block, orig_block, &input_order, kBlockSize); + coeff_t processed_block[kComputeBlockSize]; for (int i = 0; i < kComputeBlockSize; i++) { processed_block[i] = block[i]; @@ -2617,17 +2619,15 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; - for (int i = 0; i < kComputeBlockSize; i++) + int out_count = 0; + for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) { - if (i >= output_order.size) - { - output_block[i].idx = 0; - output_block[i].err = 0; - } - else + // ¹ýÂ˽ϴóµÄerr£¬Õⲿ·Ö½øÈëºó¶Ë¼ÆËãûÓÐÒâÒå + if (output_order.pData[i].err <= BlockErrorLimit) { - output_block[i].idx = output_order.pData[i].idx; - output_block[i].err = output_order.pData[i].err; + output_block[out_count].idx = output_order.pData[i].idx; + output_block[out_count].err = output_order.pData[i].err; + out_count++; } } } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index b8c8ad2a..4c8b7b25 100644 --- 
a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1194,9 +1194,9 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clReleaseMemObject(mem_result); } -void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list, - float *orig_iamge, float* mask_scale, CoeffData *output_order_list, - int size) +void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, + float *orig_image, float* mask_scale, CoeffData *output_order_batch, + int size, float BlockErrorLimit) { using namespace guetzli; @@ -1205,18 +1205,20 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coef cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_orig_block_list = ocl.allocMem(sizeof(coeff_t) * item_count); - cl_mem mem_block_list = ocl.allocMem(sizeof(coeff_t) * item_count); - cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * item_count); - cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * size); + cl_mem mem_orig_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, orig_block_batch); + cl_mem mem_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, block_batch); + cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * item_count, orig_image); + cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * size, mask_scale); cl_mem mem_output_order_list = ocl.allocMem(sizeof(CoeffData) * item_count); + cl_float clBlockErrorLimit = BlockErrorLimit; cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_block_list); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_block_list); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_image); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mask_scale); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_output_order_list); + clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit); + clSetKernelArg(kernel, 5, 
sizeof(cl_mem), &mem_output_order_list); size_t globalWorkSize[1] = { size }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -1232,7 +1234,7 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coef CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_list, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); - memcpy(output_order_list, result, sizeof(CoeffData) * item_count); + memcpy(output_order_batch, result, sizeof(CoeffData) * item_count); clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_list, result, sizeof(CoeffData) * item_count, NULL, NULL); clFinish(ocl.commandQueue); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 178dce27..83bbae09 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -19,7 +19,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t step, float* result); -void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list, float *orig_iamge, float* mask_scale, CoeffData *output_order_list, int size); +void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_iamge, float* mask_scale, CoeffData *output_order_batch, int size, float BlockErrorLimit); void clMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 3babe180..eb442e21 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -368,7 +368,7 @@ namespace guetzli block_y_ = block_y; factor_x_ = factor_x; factor_y_ = factor_y; - return; + ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); } diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 
05f5470f..2f114e02 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -189,7 +189,7 @@ void* ocl_args_d_t::allocC(size_t s) return outputC; } -cl_mem ocl_args_d_t::allocMem(size_t s) +cl_mem ocl_args_d_t::allocMem(size_t s, void *init) { cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; cl_int err = 0; @@ -198,6 +198,20 @@ cl_mem ocl_args_d_t::allocMem(size_t s) { LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err)); } + if (mem && init) + { + err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: allocMem() clEnqueueWriteBuffer return %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(this->commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: allocMem() clFinish return %s.\n", TranslateOpenCLError(err)); + } + } + return mem; } diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index ae5ceeeb..59b4582d 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -95,7 +95,7 @@ struct ocl_args_d_t void* allocB(size_t s); void* allocC(size_t s); - cl_mem allocMem(size_t s); + cl_mem allocMem(size_t s, void *init = NULL); ocl_channels allocMemChannels(size_t s); void releaseMemChannels(ocl_channels rgb); diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index 9034d68e..1748b80d 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -22,9 +22,6 @@ #include "guetzli/gamma_correct.h" #include "guetzli/score.h" -int g_switchBlock = 0; -int g_compareBlock = 0; - namespace guetzli { ButteraugliComparator::ButteraugliComparator(const int width, const int height, @@ -97,8 +94,6 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y, ::butteraugli::OpsinDynamicsImage(8, 8, per_block_pregamma_[bx]); } } - - g_switchBlock++; } double ButteraugliComparator::CompareBlock(const OutputImage& img, @@ -114,7 +109,6 @@ double ButteraugliComparator::CompareBlock(const OutputImage& 
img, std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c); ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); - g_compareBlock++; std::vector > rgb0 = rgb0_c; std::vector > rgb1 = rgb1_c; diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index 0136f2bb..bc247afe 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -26,9 +26,6 @@ #include "guetzli/output_image.h" #include "guetzli/stats.h" -extern int g_switchBlock; -extern int g_compareBlock; - namespace guetzli { constexpr int kButteraugliStep = 3; diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 32103a74..3f91cddd 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -233,9 +233,6 @@ void Usage() { } // namespace -extern int g_switchBlock; -extern int g_compareBlock; - int main(int argc, char** argv) { std::set_terminate(TerminateHandler); @@ -337,6 +334,5 @@ int main(int argc, char** argv) { WriteFileOrDie(argv[opt_idx + 1], out_data); - fprintf(stderr, "%d %d", g_switchBlock, g_compareBlock); return 0; } diff --git a/guetzli/processor.cc b/guetzli/processor.cc index eaf9f75b..11abfa4c 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -603,30 +603,39 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } - std::vector output_order(num_blocks * kBlockSize); + // step 2 ¼ÆËãËùÓÐblockµÄϵÊýÆ«²î + std::vector output_order_gpu; + std::vector output_order_cpu; + CoeffData * output_order = NULL; ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; - if (g_useOpenCL) + if (g_useOpenCL || g_checkOpenCL) { + output_order_gpu.resize(num_blocks * kBlockSize); + output_order = output_order_gpu.data(); clComputeBlockZeroingOrder(orig_block_batch.data(), block_batch.data(), comp->imgOpsinDynamicsBlockList.data(), comp->imgMaskXyzScaleBlockList.data(), - output_order.data(), - num_blocks); + output_order_gpu.data(), + num_blocks, + 
comparator_->BlockErrorLimit()); + + } - else + if (!g_useOpenCL || g_checkOpenCL) { + output_order_cpu.resize(num_blocks * kBlockSize); + output_order = output_order_cpu.data(); for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; coeff_t *block = &block_batch[block_ix * kBlockSize]; std::vector block_order; - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order); - CoeffData * p = &output_order[block_ix * kBlockSize]; + CoeffData * p = &output_order_cpu[block_ix * kBlockSize]; for (int i = 0; i < block_order.size(); i++) { p[i].idx = block_order[i].idx; @@ -635,6 +644,23 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } } + if (g_checkOpenCL) + { + int count = 0; + int check_size = output_order_gpu.size(); + for (int i = 0; i < check_size; i++) + { + if (output_order_cpu[i].idx != output_order_gpu[i].idx || + fabs(output_order_cpu[i].block_err - output_order_gpu[i].block_err) > 0.001) + { + count++; + } + } + if (count > 0) + { + LogError("CHK %s(%d) %d:%d\r\n", __FUNCTION__, __LINE__, count, check_size); + } + } std::vector candidate_coeff_offsets(num_blocks + 1); std::vector candidate_coeffs; @@ -655,28 +681,9 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } } -/* - // step 2 ¶Ô±Èÿ¸öblock½á¹û - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; - coeff_t *block = &block_batch[block_ix * kBlockSize]; - std::vector block_order; - - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order); - - // ÒÔÏ´¦ÀíÒÀȻûÓÐbatch»¯£¬ÓÃÓÚÏȼø¶¨ÆäËû¼ÆËã½á¹û - candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); - for (size_t i = 0; i < 
block_order.size(); ++i) { - candidate_coeffs.push_back(block_order[i].idx); - candidate_coeff_errors.push_back(block_order[i].block_err); - } - } - } -*/ // - comparator_->FinishBlockComparisons(); // TOBEREMOVE:Çå³ý²ÎÊý + comparator_->FinishBlockComparisons(); candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early, @@ -717,10 +724,6 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const } std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); - if (input_order.size() > 64) - { - g_compareBlock++; - } coeff_t processed_block[kBlockSize]; memcpy(processed_block, block, sizeof(processed_block)); @@ -1122,6 +1125,8 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f; SelectFrequencyMasking(jpg, &img, 1, ymul, false); SelectFrequencyMasking(jpg, &img, 6, 1.0, true); +// SelectFrequencyMaskingBatch(jpg, &img, ymul, false); +// SelectFrequencyMaskingBatch(jpg, &img, 1.0, true); } } From caa4fbbcbb9a6d7822483c28ea57eb88e1bd29af Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 17 May 2017 03:08:51 +0800 Subject: [PATCH 081/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- clguetzli/clbutter_comparator.cpp | 1 + clguetzli/clbutter_comparator.h | 2 +- clguetzli/clguetzli.cpp | 2 +- clguetzli/clguetzli.h | 8 +- clguetzli/clguetzli_comparator.cpp | 343 +++++------------- clguetzli/clguetzli_comparator.h | 5 +- guetzli/butteraugli_comparator.cc | 2 +- guetzli/butteraugli_comparator.h | 3 +- guetzli/comparator.h | 2 +- guetzli/guetzli.cc | 5 +- guetzli/processor.cc | 79 ++-- guetzli/processor.h | 5 + .../butteraugli/butteraugli/butteraugli.cc | 55 +-- 13 files changed, 161 insertions(+), 351 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 
1da9a2cd..734e2c33 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -96,6 +96,7 @@ namespace butteraugli if (g_checkOpenCL) { + temp.resize(res_xsize_ * res_ysize_); tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), block_diff_dc.data(), diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h index eb2e4e32..19ca163f 100644 --- a/clguetzli/clbutter_comparator.h +++ b/clguetzli/clbutter_comparator.h @@ -62,7 +62,7 @@ namespace butteraugli { size_t len, size_t offset, const float* __restrict__ multipliers, const float* __restrict__ inp, - float border_ratio, + double border_ratio, float* __restrict__ result); void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio); diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 4c8b7b25..d2f01f3e 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1195,7 +1195,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, } void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, - float *orig_image, float* mask_scale, CoeffData *output_order_batch, + float *orig_image, float* mask_scale, guetzli::CoeffData *output_order_batch, int size, float BlockErrorLimit) { using namespace guetzli; diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 83bbae09..457de4a0 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,16 +1,12 @@ #pragma once #include "CL\cl.h" #include "guetzli\jpeg_data.h" +#include "guetzli\processor.h" #include "ocl.h" extern bool g_useOpenCL; extern bool g_checkOpenCL; -struct CoeffData { - int idx; - float block_err; -}; - void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, @@ -19,7 
+15,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t step, float* result); -void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_iamge, float* mask_scale, CoeffData *output_order_batch, int size, float BlockErrorLimit); +void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_iamge, float* mask_scale, guetzli::CoeffData *output_order_batch, int size, float BlockErrorLimit); void clMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index eb442e21..22b1965f 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -2,222 +2,19 @@ #include #include "clguetzli_comparator.h" #include "guetzli\idct.h" +#include "guetzli\color_transform.h" +#include "guetzli\gamma_correct.h" +#include "clguetzli\ocl.h" +#include "clguetzli\clguetzli.h" +using namespace guetzli; -typedef int16_t coeff_t; - -const double* NewSrgb8ToLinearTable() { - double* table = new double[256]; - int i = 0; - for (; i < 11; ++i) { - table[i] = i / 12.92; - } - for (; i < 256; ++i) { - table[i] = 255.0 * std::pow(((i / 255.0) + 0.055) / 1.055, 2.4); - } - return table; -} - -const double* Srgb8ToLinearTable() { - static const double* const kSrgb8ToLinearTable = NewSrgb8ToLinearTable(); - return kSrgb8ToLinearTable; -} - -static const int kCrToRedTable[256] = { - -179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164, - -163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147, - -146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130, - -129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114, - -112, -111, -109, -108, -107, -105, -104, -102, -101, -100, -98, -97, - -95, -94, -93, -91, -90, -88, -87, -86, -84, -83, -81, -80, 
- -79, -77, -76, -74, -73, -72, -70, -69, -67, -66, -64, -63, - -62, -60, -59, -57, -56, -55, -53, -52, -50, -49, -48, -46, - -45, -43, -42, -41, -39, -38, -36, -35, -34, -32, -31, -29, - -28, -27, -25, -24, -22, -21, -20, -18, -17, -15, -14, -13, - -11, -10, -8, -7, -6, -4, -3, -1, 0, 1, 3, 4, - 6, 7, 8, 10, 11, 13, 14, 15, 17, 18, 20, 21, - 22, 24, 25, 27, 28, 29, 31, 32, 34, 35, 36, 38, - 39, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 55, - 56, 57, 59, 60, 62, 63, 64, 66, 67, 69, 70, 72, - 73, 74, 76, 77, 79, 80, 81, 83, 84, 86, 87, 88, - 90, 91, 93, 94, 95, 97, 98, 100, 101, 102, 104, 105, - 107, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, - 123, 125, 126, 128, 129, 130, 132, 133, 135, 136, 137, 139, - 140, 142, 143, 144, 146, 147, 149, 150, 151, 153, 154, 156, - 157, 158, 160, 161, 163, 164, 165, 167, 168, 170, 171, 172, - 174, 175, 177, 178 -}; - -static const int kCbToBlueTable[256] = { - -227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207, - -206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186, - -184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165, - -163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144, - -142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122, - -120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101, - -99, -97, -96, -94, -92, -90, -89, -87, -85, -83, -82, -80, - -78, -76, -74, -73, -71, -69, -67, -66, -64, -62, -60, -58, - -57, -55, -53, -51, -50, -48, -46, -44, -43, -41, -39, -37, - -35, -34, -32, -30, -28, -27, -25, -23, -21, -19, -18, -16, - -14, -12, -11, -9, -7, -5, -4, -2, 0, 2, 4, 5, - 7, 9, 11, 12, 14, 16, 18, 19, 21, 23, 25, 27, - 28, 30, 32, 34, 35, 37, 39, 41, 43, 44, 46, 48, - 50, 51, 53, 55, 57, 58, 60, 62, 64, 66, 67, 69, - 71, 73, 74, 76, 78, 80, 82, 83, 85, 87, 89, 90, - 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, - 113, 115, 117, 119, 120, 122, 124, 126, 128, 129, 131, 133, - 135, 136, 138, 140, 142, 
144, 145, 147, 149, 151, 152, 154, - 156, 158, 159, 161, 163, 165, 167, 168, 170, 172, 174, 175, - 177, 179, 181, 183, 184, 186, 188, 190, 191, 193, 195, 197, - 198, 200, 202, 204, 206, 207, 209, 211, 213, 214, 216, 218, - 220, 222, 223, 225, -}; - -static const int kCrToGreenTable[256] = { - 5990656, 5943854, 5897052, 5850250, 5803448, 5756646, 5709844, 5663042, - 5616240, 5569438, 5522636, 5475834, 5429032, 5382230, 5335428, 5288626, - 5241824, 5195022, 5148220, 5101418, 5054616, 5007814, 4961012, 4914210, - 4867408, 4820606, 4773804, 4727002, 4680200, 4633398, 4586596, 4539794, - 4492992, 4446190, 4399388, 4352586, 4305784, 4258982, 4212180, 4165378, - 4118576, 4071774, 4024972, 3978170, 3931368, 3884566, 3837764, 3790962, - 3744160, 3697358, 3650556, 3603754, 3556952, 3510150, 3463348, 3416546, - 3369744, 3322942, 3276140, 3229338, 3182536, 3135734, 3088932, 3042130, - 2995328, 2948526, 2901724, 2854922, 2808120, 2761318, 2714516, 2667714, - 2620912, 2574110, 2527308, 2480506, 2433704, 2386902, 2340100, 2293298, - 2246496, 2199694, 2152892, 2106090, 2059288, 2012486, 1965684, 1918882, - 1872080, 1825278, 1778476, 1731674, 1684872, 1638070, 1591268, 1544466, - 1497664, 1450862, 1404060, 1357258, 1310456, 1263654, 1216852, 1170050, - 1123248, 1076446, 1029644, 982842, 936040, 889238, 842436, 795634, - 748832, 702030, 655228, 608426, 561624, 514822, 468020, 421218, - 374416, 327614, 280812, 234010, 187208, 140406, 93604, 46802, - 0, -46802, -93604, -140406, -187208, -234010, -280812, -327614, - -374416, -421218, -468020, -514822, -561624, -608426, -655228, -702030, - -748832, -795634, -842436, -889238, -936040, -982842, -1029644, -1076446, - -1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862, - -1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278, - -1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694, - -2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110, - 
-2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526, - -2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942, - -3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358, - -3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774, - -4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190, - -4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606, - -4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022, - -5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438, - -5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854, -}; - -static const int kCbToGreenTable[256] = { - 2919680, 2897126, 2874572, 2852018, 2829464, 2806910, 2784356, 2761802, - 2739248, 2716694, 2694140, 2671586, 2649032, 2626478, 2603924, 2581370, - 2558816, 2536262, 2513708, 2491154, 2468600, 2446046, 2423492, 2400938, - 2378384, 2355830, 2333276, 2310722, 2288168, 2265614, 2243060, 2220506, - 2197952, 2175398, 2152844, 2130290, 2107736, 2085182, 2062628, 2040074, - 2017520, 1994966, 1972412, 1949858, 1927304, 1904750, 1882196, 1859642, - 1837088, 1814534, 1791980, 1769426, 1746872, 1724318, 1701764, 1679210, - 1656656, 1634102, 1611548, 1588994, 1566440, 1543886, 1521332, 1498778, - 1476224, 1453670, 1431116, 1408562, 1386008, 1363454, 1340900, 1318346, - 1295792, 1273238, 1250684, 1228130, 1205576, 1183022, 1160468, 1137914, - 1115360, 1092806, 1070252, 1047698, 1025144, 1002590, 980036, 957482, - 934928, 912374, 889820, 867266, 844712, 822158, 799604, 777050, - 754496, 731942, 709388, 686834, 664280, 641726, 619172, 596618, - 574064, 551510, 528956, 506402, 483848, 461294, 438740, 416186, - 393632, 371078, 348524, 325970, 303416, 280862, 258308, 235754, - 213200, 190646, 168092, 145538, 122984, 100430, 77876, 55322, - 32768, 10214, -12340, -34894, -57448, -80002, -102556, -125110, - 
-147664, -170218, -192772, -215326, -237880, -260434, -282988, -305542, - -328096, -350650, -373204, -395758, -418312, -440866, -463420, -485974, - -508528, -531082, -553636, -576190, -598744, -621298, -643852, -666406, - -688960, -711514, -734068, -756622, -779176, -801730, -824284, -846838, - -869392, -891946, -914500, -937054, -959608, -982162, -1004716, -1027270, - -1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702, - -1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134, - -1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566, - -1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998, - -1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430, - -1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862, - -2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294, - -2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726, - -2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158, - -2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590, -}; - -static const uint8_t kRangeLimitLut[4 * 256] = {}; - -static const uint8_t* kRangeLimit = kRangeLimitLut + 384; - -void CoeffToIDCT(coeff_t *block, uint8_t *idct) +void CoeffToIDCT(const coeff_t *block, uint8_t *idct) { guetzli::ComputeBlockIDCT(block, idct); } -void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_) +void IDCTToPixel(const uint8_t idct[8 * 8], uint16_t *pixels_) { const int block_x = 0; const int block_y = 0; @@ -236,7 +33,7 @@ void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_) } // out = [YUVYUV....YUVYUV] -void ImageToYUV(uint16_t *pixels_, uint8_t *out) +void PixelToYUV(uint16_t *pixels_, uint8_t *out) { const int stride = 3; @@ -266,34 +63,55 @@ void YUVToRGB(uint8_t* pixelBlock) } // block = [R....R][G....G][B.....] 
-void BlockToImage(coeff_t *block, float* r, float* g, float* b) +void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside_x, int inside_y) { - uint8_t idct[8 * 8 * 3]; - CoeffToIDCT(&block[0], &idct[0]); - CoeffToIDCT(&block[8 * 8], &idct[8 * 8]); - CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]); + uint8_t idct[3][8 * 8]; + CoeffToIDCT(&block[0], idct[0]); + CoeffToIDCT(&block[8 * 8], idct[1]); + CoeffToIDCT(&block[8 * 8 * 2], idct[2]); - uint16_t pixels[8 * 8 * 3]; + uint16_t pixels[3][8 * 8]; - IDCTToImage(&idct[0], &pixels[0]); - IDCTToImage(&idct[8*8], &pixels[8*8]); - IDCTToImage(&idct[8*8*2], &pixels[8*8*2]); + IDCTToPixel(idct[0], pixels[0]); + IDCTToPixel(idct[1], pixels[1]); + IDCTToPixel(idct[2], pixels[2]); uint8_t yuv[8 * 8 * 3]; - ImageToYUV(&pixels[0], &yuv[0]); - ImageToYUV(&pixels[8*8], &yuv[1]); - ImageToYUV(&pixels[8*8*2], &yuv[2]); + PixelToYUV(pixels[0], &yuv[0]); + PixelToYUV(pixels[1], &yuv[1]); + PixelToYUV(pixels[2], &yuv[2]); YUVToRGB(yuv); const double* lut = Srgb8ToLinearTable(); + for (int i = 0; i < 8 * 8; i++) { r[i] = lut[yuv[3 * i]]; g[i] = lut[yuv[3 * i + 1]]; b[i] = lut[yuv[3 * i + 2]]; } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < 8; x++) + { + int idx = y * 8 + (inside_x - 1); + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } + for (int y = inside_y; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = (inside_y - 1) * 8 + x; + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } } namespace guetzli @@ -351,9 +169,7 @@ namespace guetzli imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin]; } } - - - } + } void ButteraugliComparatorEx::FinishBlockComparisons() { ButteraugliComparator::FinishBlockComparisons(); @@ -362,35 +178,75 @@ namespace guetzli imgMaskXyzScaleBlockList.clear(); } - void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) - 
{ + void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) + { block_x_ = block_x; block_y_ = block_y; factor_x_ = factor_x; factor_y_ = factor_y; - ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); - } + ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); + } - double ButteraugliComparatorEx::CompareBlockEx(coeff_t* candidate_block) - { + double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const + { + double err = CompareBlockEx(img, off_x, off_y, candidate_block); + if (g_checkOpenCL) + { + double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block); + if (err1 != err) + { + LogError("Error: CompareBlock misstake.\n"); + } + } + + return err; + } + + double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const + { int block_ix = getCurrentBlockIdx(); - float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + const float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; - // Õâ¸öÄڴ濽±´´ýÓÅ»¯£¬µ«²»ÊÇÏÖÔÚ + // Õâ¿éÊÇԭʼͼÏñ std::vector< std::vector > rgb0_c; rgb0_c.resize(3); for (int i = 0; i < 3; i++) { rgb0_c[i].resize(kDCTBlockSize); - memcpy(rgb0_c[i].data(), block_opsin + i*kDCTBlockSize, kDCTBlockSize * sizeof(float)); + memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float)); } - // - std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); - BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data()); + // imgÊÇÈ«¾ÖÓÅ»¯ºóµÄͼÏñ£¬ÎÒÃÇͨ¹ýcoeff_tÊý¾Ý·´Ëã³öÀ´rgb + int border_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8; + int border_y = block_y_ * 8 + 8 > height_ ? 
height_ - block_y_ * 8 : 8; + std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); + BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), border_x, border_y); +/* + { + // ¿ÉÄÜ»¹ÓÐÎÊÌ⣬ÎÒÃÇ×öÒ»¸öУÑé + int block_x = block_x_ * factor_x_ + off_x; + int block_y = block_y_ * factor_y_ + off_y; + int xmin = 8 * block_x; + int ymin = 8 * block_y; + + std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); + img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); + for (int i = 0; i < 3; i++) + { + for (int k = 0; k < 64; k++) + { + if (fabs(rgb1_c[i][k] - rgb1_c2[i][k]) > 0.001) + { + LogError("Error: CompareBlock misstake.\n"); + } + } + } + } +*/ + // ÏÂÃæÊǼÆË㹤×÷ ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c); ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); @@ -424,15 +280,10 @@ namespace guetzli } - int ButteraugliComparatorEx::getCurrentBlockIdx(void) + int ButteraugliComparatorEx::getCurrentBlockIdx(void) const { - const int width = width_; - const int height = height_; - const int factor_x = 1; - const int factor_y = 1; - - const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); - const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int block_width = (width_ + 8 * factor_x_ - 1) / (8 * factor_x_); + const int block_height = (height_ + 8 * factor_y_ - 1) / (8 * factor_y_); return block_y_ * block_width + block_x_; } diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h index 97f23fb9..7f3a768c 100644 --- a/clguetzli/clguetzli_comparator.h +++ b/clguetzli/clguetzli_comparator.h @@ -15,9 +15,10 @@ namespace guetzli { void FinishBlockComparisons() override; void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override; - double CompareBlockEx(coeff_t* candidate_block); + double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const override; + double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* 
candidate_block) const; private: - int getCurrentBlockIdx(void); + int getCurrentBlockIdx(void) const; public: std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index 1748b80d..accb905f 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -97,7 +97,7 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y, } double ButteraugliComparator::CompareBlock(const OutputImage& img, - int off_x, int off_y) const { + int off_x, int off_y, const coeff_t* candidate_block) const { int block_x = block_x_ * factor_x_ + off_x; int block_y = block_y_ * factor_y_ + off_y; int xmin = 8 * block_x; diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index bc247afe..572a9689 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -30,7 +30,6 @@ namespace guetzli { constexpr int kButteraugliStep = 3; - class ButteraugliComparator : public Comparator { public: ButteraugliComparator(const int width, const int height, @@ -46,7 +45,7 @@ class ButteraugliComparator : public Comparator { int factor_x, int factor_y) override; double CompareBlock(const OutputImage& img, - int off_x, int off_y) const override; + int off_x, int off_y, const coeff_t* candidate_block) const override; double ScoreOutputSize(int size) const override; diff --git a/guetzli/comparator.h b/guetzli/comparator.h index 00c56977..db76ac77 100644 --- a/guetzli/comparator.h +++ b/guetzli/comparator.h @@ -51,7 +51,7 @@ class Comparator { // the resulting per-block distance. The interpretation of the returned // distance depends on the comparator used. 
virtual double CompareBlock(const OutputImage& img, - int off_x, int off_y) const = 0; + int off_x, int off_y, const coeff_t* candidate_block) const = 0; // Returns the combined score of the output image in the last Compare() call // (or the baseline image, if Compare() was not called yet), based on output diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 3f91cddd..5982bc1c 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -226,8 +226,8 @@ void Usage() { " Default value is %d.\n" " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. Default limit is %d MB.\n" - " --nomemlimit - Do not limit memory usage.\n" - " --opencl - Use OpenCL\n", kDefaultJPEGQuality, kDefaultMemlimitMB); + " --opencl - Use OpenCL\n" + " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } @@ -333,6 +333,5 @@ int main(int argc, char** argv) { } WriteFileOrDie(argv[opt_idx + 1], out_data); - return 0; } diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 11abfa4c..e3bd4be4 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -22,8 +22,6 @@ #include #include "guetzli/butteraugli_comparator.h" -#include "clguetzli\clguetzli_comparator.h" -#include "clguetzli\clguetzli.h" #include "guetzli/comparator.h" #include "guetzli/debug_print.h" #include "guetzli/fast_log.h" @@ -33,18 +31,15 @@ #include "guetzli/jpeg_data_writer.h" #include "guetzli/output_image.h" #include "guetzli/quantize.h" +#include "clguetzli\clguetzli_comparator.h" +#include "clguetzli\clguetzli.h" namespace guetzli { namespace { static const size_t kBlockSize = 3 * kDCTBlockSize; -/* -struct CoeffData { - int idx; - float block_err; -}; -*/ + struct QuantData { int q[3][kDCTBlockSize]; size_t jpg_size; @@ -381,14 +376,12 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, } - // REQUIRES: block[c*64...(c*64+63)] is all zero if (comp_mask & (1<* output_order) { - static const 
uint8_t oldCsf[kDCTBlockSize] = { 10, 10, 20, 40, 60, 70, 80, 90, 10, 20, 30, 60, 70, 80, 90, 90, @@ -421,64 +414,52 @@ void Processor::ComputeBlockZeroingOrder( std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); - - - coeff_t processed_block[kBlockSize]; - memcpy(processed_block, block, sizeof(processed_block)); - - comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); - - + coeff_t processed_block[kBlockSize]; + memcpy(processed_block, block, sizeof(processed_block)); + comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); while (!input_order.empty()) { float best_err = 1e17f; int best_i = 0; - for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, input_order.size()); ++i) - { + for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, + input_order.size()); + ++i) { coeff_t candidate_block[kBlockSize]; memcpy(candidate_block, processed_block, sizeof(candidate_block)); - const int idx = input_order[i].first; - - candidate_block[idx] = 0; // TOBEREMOVE:¶Ô±ÈblockµÄÅÅÐòµÃ·ÖǰiµÍµÄÖÃ0(i¸ù¾Ýinput_orderÊý¾Ý±ä»¯¶ø±ä»¯)£¬²¢ÏÈÉèÖûضԱÈͼÏñµÄÈý¸ö·ÖÁ¿¶ÔÓ¦blockÖÐÈ¥£¬ºóÐøÔÙ×ö¶Ô±È²ÉÓᣠ- + candidate_block[idx] = 0; for (int c = 0; c < 3; ++c) { if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock(block_x, block_y, &candidate_block[c * kDCTBlockSize]); + img->component(c).SetCoeffBlock( + block_x, block_y, &candidate_block[c * kDCTBlockSize]); } } - float max_err = 0; - for (int iy = 0; iy < factor_y; ++iy) { for (int ix = 0; ix < factor_x; ++ix) { int block_xx = block_x * factor_x + ix; int block_yy = block_y * factor_y + iy; if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { - float err = static_cast(comparator_->CompareBlock(*img, ix, iy)); // TOBEREMOVE:ºÍԭͼµÄ¶ÔÓ¦block±È½Ï£¬·µ»Ø´íÎóÖµ + float err = static_cast(comparator_->CompareBlock(*img, ix, iy, candidate_block)); max_err = std::max(max_err, err); } } } - - if (max_err < best_err) { // 
TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi + if (max_err < best_err) { best_err = max_err; best_i = i; } } - int idx = input_order[best_i].first; processed_block[idx] = 0; input_order.erase(input_order.begin() + best_i); - - output_order->push_back({idx, best_err}); // TOBEREMOVE:½«ÉÏÃæ¼ÆËã³öÀ´µÄ×îС´íÎóµÄidx£¬¶ÔÓ¦µ½¶Ô±ÈblockÖеĶÔӦλÖÃÕæÕýµÄÖÃΪ0,ÒÆ³ýinput_orderÏ¼´Ñ¡È¡µ±Ç°Öµ£¬·ÅÈëoutput_order,²¢ÕýʽµÄÉèÖõ½¶Ô±ÈͼÏñÖÐÈ¥¡£ + output_order->push_back({idx, best_err}); for (int c = 0; c < 3; ++c) { if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock(block_x, block_y, &processed_block[c * kDCTBlockSize]); + img->component(c).SetCoeffBlock( + block_x, block_y, &processed_block[c * kDCTBlockSize]); } } } - - // TOBEREMOVE:×îÖÕÒÆ³ýerrÊý´óÓÚerrorÏÞÖÆµÄÏî·µ»Ø£¬²¢»¹Ô­¶Ô±ÈͼÏñµ½Ô­Ê¼Öµ¡£ // Make the block error values monotonic. float min_err = 1e10; for (int i = output_order->size() - 1; i >= 0; --i) { @@ -495,7 +476,8 @@ void Processor::ComputeBlockZeroingOrder( // Restore *img to the same state as it was at the start of this function. 
for (int c = 0; c < 3; ++c) { if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); + img->component(c).SetCoeffBlock( + block_x, block_y, &block[c * kDCTBlockSize]); } } } @@ -741,7 +723,7 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const candidate_block[idx] = 0; // TOBEREMOVE:¶Ô±ÈblockµÄÅÅÐòµÃ·ÖǰiµÍµÄÖÃ0(i¸ù¾Ýinput_orderÊý¾Ý±ä»¯¶ø±ä»¯)£¬²¢ÏÈÉèÖûضԱÈͼÏñµÄÈý¸ö·ÖÁ¿¶ÔÓ¦blockÖÐÈ¥£¬ºóÐøÔÙ×ö¶Ô±È²ÉÓᣠ- float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(candidate_block); + float max_err = 0;/// ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block); if (max_err < best_err) { // TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi best_err = max_err; best_i = i; @@ -777,6 +759,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, bool stop_early) { const int width = img->width(); const int height = img->height(); + const int ncomp = jpg.components.size(); const int last_c = Log2FloorNonZero(comp_mask); if (static_cast(last_c) >= jpg.components.size()) return; const int factor_x = img->component(last_c).factor_x(); @@ -792,7 +775,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, candidate_coeff_errors.reserve(60 * num_blocks); std::vector block_order; block_order.reserve(3 * kDCTBlockSize); - comparator_->StartBlockComparisons(); // TOBEREMOVE:³õʼ»¯Ò»Ð©²ÎÊý + comparator_->StartBlockComparisons(); for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { coeff_t block[kBlockSize] = { 0 }; @@ -802,25 +785,25 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, assert(img->component(c).factor_x() == factor_x); assert(img->component(c).factor_y() == factor_y); img->component(c).GetCoeffBlock(block_x, block_y, - &block[c * kDCTBlockSize]); // TOBEREMOVE:È¡³ö¶Ô±ÈͼÏñblockϵÊý + &block[c * 
kDCTBlockSize]); const JPEGComponent& comp = jpg.components[c]; int jpg_block_ix = block_y * comp.width_in_blocks + block_x; memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], - kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:È¡³öԭʼͼÏñblockϵÊý + kDCTBlockSize * sizeof(orig_block[0])); } } block_order.clear(); ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, - factor_y, comp_mask, img, &block_order); // TOBEREMOVE:´«ÈëԭʼblockºÍ¶Ô±ÈͼÏñblock¼ÆËãzeroing order·ÅÈëblock_order + factor_y, comp_mask, img, &block_order); candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); - for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE:°Ñ½á¹û¸³Öµµ½ºòѡϵÊý + for (size_t i = 0; i < block_order.size(); ++i) { candidate_coeffs.push_back(block_order[i].idx); candidate_coeff_errors.push_back(block_order[i].block_err); } } } - comparator_->FinishBlockComparisons(); // TOBEREMOVE:Çå³ý²ÎÊý + comparator_->FinishBlockComparisons(); candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early, @@ -908,8 +891,7 @@ void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, global_order.push_back(std::make_pair(block_ix, val)); } blocks_to_change += (last_index < num_candidates ? 1 : 0); - } - else { + } else { for (int i = last_index - 1; i >= 0; --i) { float val = ((max_err - candidate_errors[i]) / block_weight[block_ix]); @@ -1119,14 +1101,11 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, img.ApplyGlobalQuantization(best_q); if (!downsample) { - //SelectFrequencyMasking(jpg, &img, 7, 1.0, false); - SelectFrequencyMaskingBatch(jpg, &img, 1.0, false); + SelectFrequencyMasking(jpg, &img, 7, 1.0, false); } else { const float ymul = jpg.components.size() == 1 ? 
1.0f : 0.97f; SelectFrequencyMasking(jpg, &img, 1, ymul, false); SelectFrequencyMasking(jpg, &img, 6, 1.0, true); -// SelectFrequencyMaskingBatch(jpg, &img, ymul, false); -// SelectFrequencyMaskingBatch(jpg, &img, 1.0, true); } } diff --git a/guetzli/processor.h b/guetzli/processor.h index c2beb7e0..b36b184e 100644 --- a/guetzli/processor.h +++ b/guetzli/processor.h @@ -26,6 +26,11 @@ namespace guetzli { +struct CoeffData { + int idx; + float block_err; +}; + struct Params { float butteraugli_target = 1.0; bool clear_metadata = true; diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 73b78a05..288bee78 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -30,7 +30,6 @@ // * Blur - to hold the smoothing code #include "butteraugli/butteraugli.h" -#include "clguetzli\clbutter_comparator.h" #include #include @@ -41,6 +40,7 @@ #include #include +#include "clguetzli\clbutter_comparator.h" #include "clguetzli\clguetzli.h" #include "clguetzli\clguetzli_test.h" @@ -64,30 +64,29 @@ inline double DotProduct(const float u[3], const double v[3]) { // Computes a horizontal convolution and transposes the result. void _Convolution(size_t xsize, size_t ysize, - size_t xstep, - size_t len, size_t offset, - const float* __restrict__ multipliers, - const float* __restrict__ inp, - float border_ratio, - float* __restrict__ result) { + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + double border_ratio, + float* __restrict__ result) { PROFILER_FUNC; - float weight_no_border = 0; - + double weight_no_border = 0; for (size_t j = 0; j <= 2 * offset; ++j) { weight_no_border += multipliers[j]; } for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { int minx = x < offset ? 
0 : x - offset; int maxx = std::min(xsize, x + len - offset) - 1; - float weight = 0.0; + double weight = 0.0; for (int j = minx; j <= maxx; ++j) { weight += multipliers[j - x + offset]; } // Interpolate linearly between the no-border scaling and border scaling. weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - float scale = 1.0 / weight; + double scale = 1.0 / weight; for (size_t y = 0; y < ysize; ++y) { - float sum = 0.0; + double sum = 0.0; for (int j = minx; j <= maxx; ++j) { sum += inp[y * xsize + j] * multipliers[j - x + offset]; } @@ -744,7 +743,6 @@ const double *GetOpsinAbsorbance() { return &kMix[0]; } -// mixÊÇÒ»¸ö[4x4]¾ØÕó£¬Óëin[,,,1]½øÐвæ³Ë void OpsinAbsorbance(const double in[3], double out[3]) { const double *mix = GetOpsinAbsorbance(); out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3]; @@ -874,24 +872,6 @@ inline void ClenshawRecursion<0>(const double x, const double *coefficients, *b1 = x_b1 - (*b2) + coefficients[0]; } -void ClenshawRecursion_fun(const double x, const double *coefficients, - double *b1, double *b2, int n) -{ - if (n == 0) { - const double x_b1 = x * (*b1); - // The final iteration differs - no 2 * x_b1 here. - *b1 = x_b1 - (*b2) + coefficients[0]; - return; - } - - const double x_b1 = x * (*b1); - const double t = (x_b1 + x_b1) - (*b2) + coefficients[n]; - *b2 = *b1; - *b1 = t; - - ClenshawRecursion_fun(x, coefficients, b1, b2, n - 1); -} - // Rational polynomial := dividing two polynomial evaluations. These are easier // to find than minimax polynomials. 
struct RationalPolynomial { @@ -900,9 +880,7 @@ struct RationalPolynomial { const double (&coefficients)[N]) { double b1 = 0.0; double b2 = 0.0; - ClenshawRecursion(x, coefficients, &b1, &b2); - return b1; } @@ -1052,7 +1030,6 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( const std::vector> &xyb0_arg, std::vector> &xyb1, std::vector &result) { - if (xsize_ < 8 || ysize_ < 8) return; auto xyb0 = xyb0_arg; { @@ -1154,7 +1131,6 @@ void ButteraugliComparator::EdgeDetectorLowFreq( const std::vector > &xyb0, const std::vector > &xyb1, std::vector* block_diff_ac) { - PROFILER_FUNC; static const double kSigma = 14; static const double kMul = 10; @@ -1214,12 +1190,10 @@ void ButteraugliComparator::CombineChannels( const std::vector& block_diff_ac, const std::vector& edge_detector_map, std::vector* result) { - PROFILER_FUNC; result->resize(res_xsize_ * res_ysize_); - for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { - for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) { + for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { size_t res_ix = (res_y * res_xsize_ + res_x) / step_; double mask[3]; double dc_mask[3]; @@ -1330,9 +1304,14 @@ double MaskDcB(double delta) { return InterpolateClampNegative(lut.data(), lut.size(), delta); } +// Replaces values[x + y * xsize] with the minimum of the values in the +// square_size square with coordinates +// x - offset .. x + square_size - offset - 1, +// y - offset .. y + square_size - offset - 1. void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { + PROFILER_FUNC; // offset is not negative and smaller than square_size. 
assert(offset < square_size); std::vector tmp(xsize * ysize); From 8c63e20b589df03a46fa541fcf4b79acca91394b Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 17 May 2017 10:15:54 +0800 Subject: [PATCH 082/189] =?UTF-8?q?=E5=AF=B9=E4=BA=8E8x8=E7=9A=84=E5=9D=97?= =?UTF-8?q?=EF=BC=8C=E6=9A=82=E6=97=B6=E4=B8=8D=E5=81=9Acheck=EF=BC=8C?= =?UTF-8?q?=E5=90=A6=E5=88=99=E9=80=9F=E5=BA=A6=E5=A4=AA=E6=85=A2=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 46 +++++++++++++++--------------- clguetzli/clguetzli_comparator.cpp | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 734e2c33..c61c8578 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -33,7 +33,7 @@ namespace butteraugli { ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) { tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), @@ -49,7 +49,7 @@ namespace butteraugli { ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) { tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), @@ -63,14 +63,14 @@ namespace butteraugli std::vector* block_diff_ac) { std::vector orign_ac; - if (g_checkOpenCL) + if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) { orign_ac = *block_diff_ac; } ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) { tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), @@ -87,14 +87,14 @@ namespace butteraugli std::vector* result) { 
std::vector temp; - if (g_checkOpenCL) + if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) { temp = *result; } ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) { temp.resize(res_xsize_ * res_ysize_); tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), @@ -107,7 +107,7 @@ namespace butteraugli void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { std::vector img; - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { img.resize(xsize * ysize); memcpy(img.data(), values, xsize * ysize * sizeof(float)); @@ -116,7 +116,7 @@ namespace butteraugli _MinSquareVal(square_size, offset, xsize, ysize, values); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); } @@ -125,14 +125,14 @@ namespace butteraugli void Average5x5(int xsize, int ysize, std::vector* diffs) { std::vector diffs_org; - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { diffs_org = *diffs; } _Average5x5(xsize, ysize, diffs); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclAverage5x5(xsize, ysize, diffs_org, *diffs); } @@ -142,7 +142,7 @@ namespace butteraugli { _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); } @@ -154,7 +154,7 @@ namespace butteraugli std::vector > *mask, std::vector > *mask_dc) { - if (g_useOpenCL) + if (g_useOpenCL && xsize > 100 && ysize > 100) { mask->resize(3); mask_dc->resize(3); @@ -173,7 +173,7 @@ namespace butteraugli _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), 
xyb1[1].data(), xyb1[2].data(), @@ -188,14 +188,14 @@ namespace butteraugli std::vector* diffmap) { std::vector diffmap_org; - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { diffmap_org = *diffmap; } _CalculateDiffmap(xsize, ysize, step, diffmap); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); } @@ -210,7 +210,7 @@ namespace butteraugli { _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), c1[0].data(), c1[1].data(), c1[2].data(), @@ -223,14 +223,14 @@ namespace butteraugli void ScaleImage(double scale, std::vector *result) { std::vector result_org; - if (g_checkOpenCL) + if (g_checkOpenCL && result->size() > 64) { result_org = *result; } _ScaleImage(scale, result); - if (g_checkOpenCL) + if (g_checkOpenCL && result->size() > 64) { tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); } @@ -246,7 +246,7 @@ namespace butteraugli { _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); } @@ -256,7 +256,7 @@ namespace butteraugli double border_ratio) { std::vector orignChannel; - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { orignChannel.resize(xsize * ysize); memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); @@ -264,7 +264,7 @@ namespace butteraugli _Blur(xsize, ysize, channel, sigma, border_ratio); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); } @@ -284,14 +284,14 @@ namespace butteraugli else { std::vector< std::vector> orig_rgb; - 
if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { orig_rgb = rgb; } _OpsinDynamicsImage(xsize, ysize, rgb); - if (g_checkOpenCL) + if (g_checkOpenCL && xsize > 8 && ysize > 8) { tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize, rgb[0].data(), rgb[1].data(), rgb[2].data()); diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 22b1965f..2b705d86 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -196,7 +196,7 @@ namespace guetzli double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block); if (err1 != err) { - LogError("Error: CompareBlock misstake.\n"); + LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__); } } From 6b8bebfeb2cceaa1109dc0d63d42d9379f947267 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 17 May 2017 20:45:02 +0800 Subject: [PATCH 083/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- clguetzli/clguetzli.cl | 337 +++++++++++++++++++---- clguetzli/clguetzli.cpp | 45 ++-- clguetzli/clguetzli.h | 8 +- clguetzli/clguetzli_comparator.cpp | 419 ++++++++++++++++++++++++++--- clguetzli/clguetzli_comparator.h | 9 +- clguetzli/ocl.cpp | 2 +- clguetzli/ocl.h | 2 +- guetzli/butteraugli_comparator.cc | 4 +- guetzli/butteraugli_comparator.h | 2 +- guetzli/comparator.h | 2 +- guetzli/processor.cc | 58 ++-- 11 files changed, 746 insertions(+), 142 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index e4565a90..ab595dde 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1993,7 +1993,7 @@ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { out[7] -= tmp1; } -void CoeffToIDCT(coeff_t *block, uchar * out) +void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) { coeff_t colidcts[kDCTBlockSize]; const int kColScale = 11; @@ -2020,7 +2020,7 @@ void CoeffToIDCT(coeff_t *block, uchar * out) } } -void 
IDCTToImage(uchar *idct, ushort *pixels_) +void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8]) { const int block_x = 0; const int block_y = 0; @@ -2040,7 +2040,7 @@ void IDCTToImage(uchar *idct, ushort *pixels_) } } -void ImageToYUV(ushort *pixels_, uchar *out) +void PixelToYUV(const ushort pixels_[8*8], uchar out[8*8]) { const int stride = 3; @@ -2242,7 +2242,7 @@ __constant static uchar kRangeLimitLut[4 * 256] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }; -void YUVToRGB(uchar *pixelBlock) +void YUVToRGB(__private uchar pixelBlock[3*8*8]) { __constant uchar* kRangeLimit = kRangeLimitLut + 384; for (int i = 0; i < 64; i++) @@ -2258,46 +2258,290 @@ void YUVToRGB(uchar *pixelBlock) } } +__constant static double kSrgb8ToLinearTable[256] = { + 0.000000, + 0.077399, + 0.154799, + 0.232198, + 0.309598, + 0.386997, + 0.464396, + 0.541796, + 0.619195, + 0.696594, + 0.773994, + 0.853367, + 0.937509, + 1.026303, + 1.119818, + 1.218123, + 1.321287, + 1.429375, + 1.542452, + 1.660583, + 1.783830, + 1.912253, + 2.045914, + 2.184872, + 2.329185, + 2.478910, + 2.634105, + 2.794824, + 2.961123, + 3.133055, + 3.310673, + 3.494031, + 3.683180, + 3.878171, + 4.079055, + 4.285881, + 4.498698, + 4.717556, + 4.942502, + 5.173584, + 5.410848, + 5.654341, + 5.904108, + 6.160196, + 6.422649, + 6.691512, + 6.966827, + 7.248640, + 7.536993, + 7.831928, + 8.133488, + 8.441715, + 8.756651, + 9.078335, + 9.406810, + 9.742115, + 10.084290, + 10.433375, + 10.789410, + 11.152432, + 11.522482, + 11.899597, + 12.283815, + 12.675174, + 13.073712, + 13.479465, + 13.892470, + 14.312765, + 14.740385, + 15.175366, + 15.617744, + 16.067555, + 16.524833, + 16.989614, + 17.461933, + 17.941824, + 18.429322, + 18.924460, + 19.427272, + 19.937793, + 20.456054, + 20.982090, + 21.515934, + 22.057618, + 22.607175, + 23.164636, + 23.730036, + 24.303404, + 24.884774, + 25.474176, + 26.071642, + 26.677203, + 27.290891, + 27.912736, + 28.542769, + 29.181020, + 
29.827520, + 30.482299, + 31.145387, + 31.816813, + 32.496609, + 33.184802, + 33.881422, + 34.586499, + 35.300062, + 36.022139, + 36.752760, + 37.491953, + 38.239746, + 38.996169, + 39.761248, + 40.535013, + 41.317491, + 42.108710, + 42.908697, + 43.717481, + 44.535088, + 45.361546, + 46.196882, + 47.041124, + 47.894297, + 48.756429, + 49.627547, + 50.507676, + 51.396845, + 52.295078, + 53.202402, + 54.118843, + 55.044428, + 55.979181, + 56.923129, + 57.876298, + 58.838712, + 59.810398, + 60.791381, + 61.781686, + 62.781338, + 63.790363, + 64.808784, + 65.836627, + 66.873918, + 67.920679, + 68.976937, + 70.042715, + 71.118037, + 72.202929, + 73.297414, + 74.401516, + 75.515259, + 76.638668, + 77.771765, + 78.914575, + 80.067122, + 81.229428, + 82.401518, + 83.583415, + 84.775142, + 85.976722, + 87.188178, + 88.409534, + 89.640813, + 90.882037, + 92.133229, + 93.394412, + 94.665609, + 95.946841, + 97.238133, + 98.539506, + 99.850982, + 101.172584, + 102.504334, + 103.846254, + 105.198366, + 106.560693, + 107.933256, + 109.316077, + 110.709177, + 112.112579, + 113.526305, + 114.950375, + 116.384811, + 117.829635, + 119.284868, + 120.750532, + 122.226647, + 123.713235, + 125.210317, + 126.717914, + 128.236047, + 129.764737, + 131.304005, + 132.853871, + 134.414357, + 135.985483, + 137.567270, + 139.159738, + 140.762907, + 142.376799, + 144.001434, + 145.636832, + 147.283012, + 148.939997, + 150.607804, + 152.286456, + 153.975971, + 155.676371, + 157.387673, + 159.109900, + 160.843070, + 162.587203, + 164.342319, + 166.108438, + 167.885578, + 169.673761, + 171.473005, + 173.283330, + 175.104755, + 176.937299, + 178.780982, + 180.635824, + 182.501843, + 184.379058, + 186.267489, + 188.167154, + 190.078073, + 192.000265, + 193.933749, + 195.878543, + 197.834666, + 199.802137, + 201.780975, + 203.771198, + 205.772826, + 207.785876, + 209.810367, + 211.846319, + 213.893748, + 215.952674, + 218.023115, + 220.105089, + 222.198615, + 224.303711, + 226.420395, + 228.548685, + 
230.688599, + 232.840156, + 235.003373, + 237.178269, + 239.364861, + 241.563167, + 243.773205, + 245.994993, + 248.228549, + 250.473890, + 252.731035, + 255.000000, +}; + // chrisk todo -void BlockToImage(coeff_t *block, float *r, float *g, float *b) +void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8]) { - uchar idct[8 * 8 * 3]; + uchar idct[3][8 * 8]; CoeffToIDCT(&block[0], &idct[0]); - CoeffToIDCT(&block[8 * 8], &idct[8 * 8]); - CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]); - - ushort pixels[8 * 8 * 3]; + CoeffToIDCT(&block[8 * 8], &idct[1]); + CoeffToIDCT(&block[8 * 8 * 2], &idct[2]); - IDCTToImage(&idct[0], &pixels[0]); - IDCTToImage(&idct[8 * 8], &pixels[8 * 8]); - IDCTToImage(&idct[8 * 8 * 2], &pixels[8 * 8 * 2]); + ushort pixels[3][8 * 8]; + IDCTToPixel(&idct[0], &pixels[0]); + IDCTToPixel(&idct[1], &pixels[1]); + IDCTToPixel(&idct[2], &pixels[2]); uchar yuv[8 * 8 * 3]; - - ImageToYUV(&pixels[0], &yuv[0]); - ImageToYUV(&pixels[8 * 8], &yuv[1]); - ImageToYUV(&pixels[8 * 8 * 2], &yuv[2]); + PixelToYUV(&pixels[0], &yuv[0]); + PixelToYUV(&pixels[1], &yuv[1]); + PixelToYUV(&pixels[2], &yuv[2]); YUVToRGB(yuv); - // Srgb8ToLinearTable begin - double lut[256]; - int i = 0; - for (; i < 11; ++i) - { - lut[i] = i / 12.92; - } - for (; i < 256; ++i) - { - lut[i] = 255.0 * pow(((i / 255.0) + 0.055) / 1.055, 2.4); - } - // Srgb8ToLinearTable end - for (int i = 0; i < 8 * 8; i++) { - r[i] = lut[yuv[3 * i]]; - g[i] = lut[yuv[3 * i + 1]]; - b[i] = lut[yuv[3 * i + 2]]; + r[i] = kSrgb8ToLinearTable[yuv[3 * i]]; + g[i] = kSrgb8ToLinearTable[yuv[3 * i + 1]]; + b[i] = kSrgb8ToLinearTable[yuv[3 * i + 2]]; } } @@ -2514,7 +2758,7 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2], rgb1[0], rgb1[1], rgb1[2], - rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2], + rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2], rgb1_c.ch[0], rgb1_c.ch[1], rgb1_c.ch[2], 8, 8); @@ 
-2548,25 +2792,20 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, } // strong todo -// orig_block_list [R....R][G....G][B....B] -// block_list [R....R][G....G][B....B] -// orig_image [RR..RRGG..GGBB..BB] -// mask_scale[RGB] -// output_orlder_list [3 * kBlockSize] - -__kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/, - __global coeff_t *block_list/*in*/, - __global float *orig_image/*in*/, - __global float *mask_scale/*in*/, +// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é +__kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch, // ԭʼͼÏñϵÊý + __global const float *orig_image_batch, // ԭʼͼÏñpregammaºó + __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý + __global const coeff_t *mayout_batch, // Êä³ö±¸Ñ¡Í¼µÄϵÊý float BlockErrorLimit, __global CoeffData *output_order_list/*out*/) { int block_idx = get_global_id(0); #define kComputeBlockSize (kBlockSize * 3) - __global coeff_t *orig_block = orig_block_list + block_idx * kComputeBlockSize; - __global coeff_t *block = block_list + block_idx * kComputeBlockSize; - __global float* orig_image_block = orig_image + block_idx * kComputeBlockSize; + __global coeff_t *orig_block = orig_batch + block_idx * kComputeBlockSize; + __global coeff_t *mayout_block = mayout_batch + block_idx * kComputeBlockSize; + __global float *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize; DCTScoreData input_order_data[kComputeBlockSize]; CoeffData output_order_data[kComputeBlockSize]; @@ -2574,11 +2813,11 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/ IntFloatPairList input_order = { 0, input_order_data }; IntFloatPairList output_order = { 0, output_order_data }; - int count = MakeInputOrder(block, orig_block, &input_order, kBlockSize); + int count = MakeInputOrder(mayout_block, orig_block, &input_order, kBlockSize); coeff_t processed_block[kComputeBlockSize]; for (int i = 0; i < kComputeBlockSize; i++) { - 
processed_block[i] = block[i]; + processed_block[i] = mayout_block[i]; } while (input_order.size > 0) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index d2f01f3e..5ab406e7 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1194,9 +1194,14 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clReleaseMemObject(mem_result); } -void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, - float *orig_image, float* mask_scale, guetzli::CoeffData *output_order_batch, - int size, float BlockErrorLimit) +// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é +void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch, // ԭʼͼÏñϵÊý + const float *orig_image_batch, // ԭʼͼÏñpregammaºó + const float* orig_mask_scale_batch, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý + const guetzli::coeff_t *mayout_batch, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + int size, // + float BlockErrorLimit, + guetzli::CoeffData *output_order_batch) // { using namespace guetzli; @@ -1205,20 +1210,20 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coe cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_orig_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, orig_block_batch); - cl_mem mem_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, block_batch); - cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * item_count, orig_image); - cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * size, mask_scale); - cl_mem mem_output_order_list = ocl.allocMem(sizeof(CoeffData) * item_count); + cl_mem mem_orig_batch = ocl.allocMem(sizeof(coeff_t) * item_count, orig_batch); + cl_mem mem_orig_image_batch = ocl.allocMem(sizeof(float) * item_count, orig_image_batch); + cl_mem mem_mask_scale_batch = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch); + cl_mem mem_mayout_batch = ocl.allocMem(sizeof(coeff_t) * item_count, mayout_batch); + cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * 
item_count); cl_float clBlockErrorLimit = BlockErrorLimit; cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_block_list); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_block_list); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_image); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mask_scale); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mayout_batch); clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_list); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_batch); size_t globalWorkSize[1] = { size }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -1232,17 +1237,17 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coe LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); } - CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_list, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err); + CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); memcpy(output_order_batch, result, sizeof(CoeffData) * item_count); - clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_list, result, sizeof(CoeffData) * item_count, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, sizeof(CoeffData) * item_count, NULL, NULL); clFinish(ocl.commandQueue); - clReleaseMemObject(mem_orig_block_list); 
- clReleaseMemObject(mem_block_list); - clReleaseMemObject(mem_orig_image); - clReleaseMemObject(mem_mask_scale); - clReleaseMemObject(mem_output_order_list); + clReleaseMemObject(mem_orig_batch); + clReleaseMemObject(mem_orig_image_batch); + clReleaseMemObject(mem_mask_scale_batch); + clReleaseMemObject(mem_mayout_batch); + clReleaseMemObject(mem_output_order_batch); } \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 457de4a0..459110de 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -15,7 +15,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t step, float* result); -void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_iamge, float* mask_scale, guetzli::CoeffData *output_order_batch, int size, float BlockErrorLimit); +void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch, + const float *orig_image_batch, + const float* orig_mask_scale_batch, + const guetzli::coeff_t *mayout_batch, + int size, + float BlockErrorLimit, + guetzli::CoeffData *output_order_batch); void clMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 2b705d86..deaf4cdb 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -9,12 +9,12 @@ using namespace guetzli; -void CoeffToIDCT(const coeff_t *block, uint8_t *idct) +void CoeffToIDCT(const coeff_t block[8*8], uint8_t idct[8*8]) { guetzli::ComputeBlockIDCT(block, idct); } -void IDCTToPixel(const uint8_t idct[8 * 8], uint16_t *pixels_) +void IDCTToPixel8x8(const uint8_t idct[8 * 8], uint16_t pixels_[8*8]) { const int block_x = 0; const int block_y = 0; @@ -32,14 +32,85 @@ void IDCTToPixel(const uint8_t idct[8 * 8], uint16_t *pixels_) } } +void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t 
pixels_[16*16]) +{ + const int block_x = 0; + const int block_y = 0; + const int width_ = 16; + const int height_ = 16; + + // Fill in the 10x10 pixel area in the subsampled image that will be the + // basis of the upsampling. This area is enough to hold the 3x3 kernel of + // the fancy upsampler around each pixel. + static const int kSubsampledEdgeSize = 10; + uint16_t subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize]; + for (int j = 0; j < kSubsampledEdgeSize; ++j) { + // The order we fill in the rows is: + // 8 rows intersecting the block, row below, row above + const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2); + for (int i = 0; i < kSubsampledEdgeSize; ++i) { + // The order we fill in each row is: + // 8 pixels within the block, left edge, right edge + const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) + + (i < 9 ? i + 1 : 0)); + const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2); + if (x0 < 0) { + subsampled[ix] = subsampled[ix + 1]; + } + else if (y0 < 0) { + subsampled[ix] = subsampled[ix + kSubsampledEdgeSize]; + } + else if (x0 >= width_) { + subsampled[ix] = subsampled[ix - 1]; + } + else if (y0 >= height_) { + subsampled[ix] = subsampled[ix - kSubsampledEdgeSize]; + } + else if (i < 8 && j < 8) { + subsampled[ix] = idct[j * 8 + i] << 4; + } + else { + // Reconstruct the subsampled pixels around the edge of the current + // block by computing the inverse of the fancy upsampler. + const int y1 = std::max(y0 - 1, 0); + const int x1 = std::max(x0 - 1, 0); + subsampled[ix] = (pixels_[y0 * width_ + x0] * 9 + + pixels_[y1 * width_ + x1] + + pixels_[y0 * width_ + x1] * -3 + + pixels_[y1 * width_ + x0] * -3) >> 2; + } + } + } + + // Determine area to update. + int xmin = std::max(block_x * 16 - 1, 0); + int xmax = std::min(block_x * 16 + 16, width_ - 1); + int ymin = std::max(block_y * 16 - 1, 0); + int ymax = std::min(block_y * 16 + 16, height_ - 1); + + // Apply the fancy upsampler on the subsampled block. 
+ for (int y = ymin; y <= ymax; ++y) { + const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize; + const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize; + uint16_t* rowptr = &pixels_[y * width_]; + for (int x = xmin; x <= xmax; ++x) { + const int x0 = (x & ~1) / 2 - block_x * 8 + 1; + const int dx = (x & 1) * 2 - 1; + const int ix = x0 + y0; + rowptr[x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 + + subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4; + } + } +} + // out = [YUVYUV....YUVYUV] -void PixelToYUV(uint16_t *pixels_, uint8_t *out) +void PixelToYUV(uint16_t pixels_[8*8], uint8_t out[8*8], int xsize = 8, int ysize = 8) { const int stride = 3; - for (int y = 0; y < 8; ++y) { - for (int x = 0; x < 8; ++x) { - int px = y * 8 + x; + for (int y = 0; y < xsize; ++y) { + for (int x = 0; x < ysize; ++x) { + int px = y * xsize + x; *out = static_cast((pixels_[px] + 8 - (x & 1)) >> 4); out += stride; } @@ -47,9 +118,9 @@ void PixelToYUV(uint16_t *pixels_, uint8_t *out) } // pixel = [YUVYUV...YUVYUV] to [RGBRGB...RGBRGB] -void YUVToRGB(uint8_t* pixelBlock) +void YUVToRGB(uint8_t pixelBlock[3*8*8], int size = 8 * 8) { - for (int i = 0; i < 64; i++) + for (int i = 0; i < size; i++) { uint8_t *pixel = &pixelBlock[i*3]; @@ -62,8 +133,42 @@ void YUVToRGB(uint8_t* pixelBlock) } } +void YUVToImage(uint8_t yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize = 8, int ysize = 8, int inside_x = 8, int inside_y = 8) +{ + YUVToRGB(yuv, xsize * ysize); + + const double* lut = Srgb8ToLinearTable(); + + for (int i = 0; i < xsize * ysize; i++) + { + r[i] = lut[yuv[3 * i]]; + g[i] = lut[yuv[3 * i + 1]]; + b[i] = lut[yuv[3 * i + 2]]; + } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < xsize; x++) + { + int idx = y * 8 + (inside_x - 1); + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } + for (int y = inside_y; y < ysize; y++) + { + for (int x = 0; x < xsize; x++) + { + int idx = (inside_y - 1) * 8 + x; 
+ r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } +} + // block = [R....R][G....G][B.....] -void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside_x, int inside_y) +void BlockToImage(const coeff_t block[8*8*3], float* r, float* g, float* b, int inside_x, int inside_y) { uint8_t idct[3][8 * 8]; CoeffToIDCT(&block[0], idct[0]); @@ -71,13 +176,11 @@ void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside CoeffToIDCT(&block[8 * 8 * 2], idct[2]); uint16_t pixels[3][8 * 8]; - - IDCTToPixel(idct[0], pixels[0]); - IDCTToPixel(idct[1], pixels[1]); - IDCTToPixel(idct[2], pixels[2]); + IDCTToPixel8x8(idct[0], pixels[0]); + IDCTToPixel8x8(idct[1], pixels[1]); + IDCTToPixel8x8(idct[2], pixels[2]); uint8_t yuv[8 * 8 * 3]; - PixelToYUV(pixels[0], &yuv[0]); PixelToYUV(pixels[1], &yuv[1]); PixelToYUV(pixels[2], &yuv[2]); @@ -114,6 +217,87 @@ void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside } } +void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv) +{ + uint8_t idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + uint16_t pixels[16 * 16]; + IDCTToPixel16x16(idct, pixels); + + PixelToYUV(pixels, yuv, 16, 16); +} + +void CoeffToYUV8x8(const coeff_t block[8 * 8], uint8_t *yuv) +{ + uint8_t idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + uint16_t pixels[8 * 8]; + IDCTToPixel8x8(idct, pixels); + + PixelToYUV(pixels, yuv); +} + +void Copy8x8To16x16(const uint8_t yuv8x8[3 * 8 * 8], uint8_t yuv16x16[3 * 16 * 16], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + yuv16x16[idx16 * 3] = yuv8x8[idx * 3]; + } + } +} + +void Copy16x16To8x8(const uint8_t yuv16x16[3 * 16 * 16], uint8_t yuv8x8[3 * 8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 
16 + (x + off_x * 8); + yuv8x8[idx * 3] = yuv16x16[idx16 * 3]; + } + } +} + +void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + r[idx] = rgb16x16[0][idx16]; + g[idx] = rgb16x16[1][idx16]; + b[idx] = rgb16x16[2][idx16]; + } + } +} + +typedef struct __channel_info_t +{ + int factor; + int block_width; + int block_height; +}channel_info; + +void ComputeBlockFacor(const coeff_t* candidate_block, + const coeff_t * mayout_coeff[3], + const channel_info mayout_channel[3], + const coeff_t * orig_coeff[3], + const int comp_mask, + int factor +) +{ + +} + namespace guetzli { ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, @@ -188,12 +372,12 @@ namespace guetzli ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); } - double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const + double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { - double err = CompareBlockEx(img, off_x, off_y, candidate_block); + double err = CompareBlockEx2(img, off_x, off_y, candidate_block, comp_mask); if (g_checkOpenCL) { - double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block); + double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); if (err1 != err) { LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__); @@ -247,38 +431,176 @@ namespace guetzli } */ // ÏÂÃæÊǼÆË㹤×÷ + return ComputeImage8x8Block(rgb0_c, rgb1_c, getCurrentBlock8x8Idx(off_x, off_y)); + } + + int ButteraugliComparatorEx::GetOrigBlock(std::vector< std::vector > &rgb0_c, int off_x, int off_y) const + { + int block_xx = 
block_x_ * factor_x_ + off_x; + int block_yy = block_y_ * factor_y_ + off_y; + if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1; + + const int block8_width = (width_ + 8 - 1) / 8; + + int block_ix = block_yy * block8_width + block_xx; + + rgb0_c.resize(3); + const float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + for (int i = 0; i < 3; i++) + { + rgb0_c[i].resize(kDCTBlockSize); + memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float)); + } + + return block_ix; + } + + double ButteraugliComparatorEx::CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const + { + const int block_x = block_x_; + const int block_y = block_y_; + const int factor = factor_x_; + + const coeff_t *candidate_channel[3]; + channel_info mayout_channel[3]; + const coeff_t *mayout_coeff[3]; + for (int c = 0; c < 3; c++) + { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + mayout_coeff[c] = img.component(c).coeffs(); + mayout_channel[c].block_height = img.component(c).height_in_blocks(); + mayout_channel[c].block_width = img.component(c).width_in_blocks(); + mayout_channel[c].factor = img.component(c).factor_x(); + } + + uint8_t yuv16x16[3 * 16 * 16]; // factor 2 mode output image + uint8_t yuv8x8[3 * 8 * 8]; // factor 1 mode output image + + // ²»¹Ücomp_maskÈçºÎ£¬×ª»»ÎªRGB×ÜÊÇÐèÒªµÄ + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + if (factor == 1) { // channel_factor == factor ˵Ã÷Òª½éÈëÔËË㣬²ÉÓÃcandidateÖеÄϵÊý + //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x; + const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + } + else { + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + int block_8x8idx = block_yy * 
mayout_channel[c].block_width + block_xx; + const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + + // copy YUV8x8 to YUV1616 corner + Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); + } + } + } + } + else { + if (factor == 1) { + int block_xx = block_x / mayout_channel[c].factor; + int block_yy = block_y / mayout_channel[c].factor; + int ix = block_x % mayout_channel[c].factor;; + int iy = block_y % mayout_channel[c].factor; + + int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; + const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8; + CoeffToYUV16x16(coeff_block, &yuv16x16[c]); + + // copy YUV16x16 corner to YUV8x8 + Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); + } + else { + //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x; + const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8; + CoeffToYUV16x16(coeff_block, &yuv16x16[c]); + } + } + } + + if (factor == 1) + { + int block_ix = getCurrentBlockIdx(); + const float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + std::vector< std::vector > rgb0_c; + int block_8x8idx = GetOrigBlock(rgb0_c, 0, 0); + + std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); + YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data()); + + double err = 0;// ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); + + double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); + + return err; + } + else + { + float rgb16x16[3][16 * 16]; + YUVToImage(yuv8x8, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, 16, 16); + + float max_err = 0; + // for (int iy = 0; iy < factor; ++iy) + { + //for (int ix = 0; ix < factor; ++ix) + { + int ix = off_x; + int iy = off_y; + std::vector< std::vector > rgb0_c; + int block_8x8idx = GetOrigBlock(rgb0_c, ix, iy); + if (block_8x8idx < 0) return max_err;// continue; + + 
std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); + Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), ix, iy); + + float err = ComputeImage8x8Block(rgb0_c, rgb1_c, getCurrentBlock8x8Idx(0, 0)); + max_err = std::max(max_err, err); + } + } + return max_err; + } + } + + double ButteraugliComparatorEx::ComputeImage8x8Block(std::vector > &rgb0_c, + std::vector > &rgb1_c, + int block_8x8idx) const + { ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c); - ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); + ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); - std::vector > rgb0 = rgb0_c; - std::vector > rgb1 = rgb1_c; + std::vector > rgb0 = rgb0_c; + std::vector > rgb1 = rgb1_c; - ::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1); + ::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1); - double b0[3 * kDCTBlockSize]; - double b1[3 * kDCTBlockSize]; - for (int c = 0; c < 3; ++c) { - for (int ix = 0; ix < kDCTBlockSize; ++ix) { - b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; - b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; - } - } - double diff_xyz_dc[3] = { 0.0 }; - double diff_xyz_ac[3] = { 0.0 }; - double diff_xyz_edge_dc[3] = { 0.0 }; - ::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); - - double diff = 0.0; - double diff_edge = 0.0; - for (int c = 0; c < 3; ++c) { - diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; - diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; - diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c]; - } + double b0[3 * kDCTBlockSize]; + double b1[3 * kDCTBlockSize]; + for (int c = 0; c < 3; ++c) { + for (int ix = 0; ix < kDCTBlockSize; ++ix) { + b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; + b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; + } + } + double diff_xyz_dc[3] = { 0.0 }; + double diff_xyz_ac[3] = { 0.0 }; + double diff_xyz_edge_dc[3] = { 0.0 }; + 
::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); + + double diff = 0.0; + double diff_edge = 0.0; + for (int c = 0; c < 3; ++c) { + diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c]; + diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c]; + diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c]; + } const double kEdgeWeight = 0.05; - return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); - } - + return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); + } int ButteraugliComparatorEx::getCurrentBlockIdx(void) const { @@ -287,4 +609,13 @@ namespace guetzli return block_y_ * block_width + block_x_; } + + int ButteraugliComparatorEx::getCurrentBlock8x8Idx(int off_x, int off_y) const + { + int block_xx = block_x_ * factor_x_ + off_x; + int block_yy = block_y_ * factor_y_ + off_y; + + const int block8_width = (width_ + 8 - 1) / 8; + return block_yy * block8_width + block_xx; + } } diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h index 7f3a768c..840254a7 100644 --- a/clguetzli/clguetzli_comparator.h +++ b/clguetzli/clguetzli_comparator.h @@ -15,10 +15,17 @@ namespace guetzli { void FinishBlockComparisons() override; void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override; - double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const override; + double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const; + double CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const; private: + int GetOrigBlock(std::vector< std::vector > &rgb0_c, int off_x, int off_y) const; + double ComputeImage8x8Block(std::vector > 
&rgb0_c, + std::vector > &rgb1_c, + int block_8x8idx) const; + int getCurrentBlockIdx(void) const; + int getCurrentBlock8x8Idx(int off_x, int off_y) const; public: std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 2f114e02..7be57d49 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -189,7 +189,7 @@ void* ocl_args_d_t::allocC(size_t s) return outputC; } -cl_mem ocl_args_d_t::allocMem(size_t s, void *init) +cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) { cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; cl_int err = 0; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 59b4582d..bcc8ef9c 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -95,7 +95,7 @@ struct ocl_args_d_t void* allocB(size_t s); void* allocC(size_t s); - cl_mem allocMem(size_t s, void *init = NULL); + cl_mem allocMem(size_t s, const void *init = NULL); ocl_channels allocMemChannels(size_t s); void releaseMemChannels(ocl_channels rgb); diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index accb905f..124aea8d 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -97,7 +97,9 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y, } double ButteraugliComparator::CompareBlock(const OutputImage& img, - int off_x, int off_y, const coeff_t* candidate_block) const { + int off_x, int off_y, + const coeff_t* candidate_block, + const int comp_mask) const { int block_x = block_x_ * factor_x_ + off_x; int block_y = block_y_ * factor_y_ + off_y; int xmin = 8 * block_x; diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index 572a9689..5418c0d2 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -45,7 +45,7 @@ class ButteraugliComparator : public Comparator { int factor_x, int factor_y) override; 
double CompareBlock(const OutputImage& img, - int off_x, int off_y, const coeff_t* candidate_block) const override; + int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; double ScoreOutputSize(int size) const override; diff --git a/guetzli/comparator.h b/guetzli/comparator.h index db76ac77..061f9603 100644 --- a/guetzli/comparator.h +++ b/guetzli/comparator.h @@ -51,7 +51,7 @@ class Comparator { // the resulting per-block distance. The interpretation of the returned // distance depends on the comparator used. virtual double CompareBlock(const OutputImage& img, - int off_x, int off_y, const coeff_t* candidate_block) const = 0; + int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const = 0; // Returns the combined score of the output image in the last Compare() call // (or the baseline image, if Compare() was not called yet), based on output diff --git a/guetzli/processor.cc b/guetzli/processor.cc index e3bd4be4..e4b616e4 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -53,7 +53,9 @@ class Processor { private: - void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early); + void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, + const uint8_t comp_mask, const double target_mul, + bool stop_early); void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, @@ -352,6 +354,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, const float target_mul_low = 0.95f; QuantData best = TryQuantMatrix(jpg_in, target_mul_high, best_q, img); +/* for (;;) { int q_next[3][kDCTBlockSize]; if (!qgen.GetNext(q_next)) { @@ -367,7 +370,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, } } } - +*/ memcpy(&best_q[0][0], &best.q[0][0], kBlockSize * sizeof(best_q[0][0])); GUETZLI_LOG(stats_, "\n%s selected quantization 
matrix:\n", downsample ? "YUV420" : "YUV444"); @@ -439,7 +442,7 @@ void Processor::ComputeBlockZeroingOrder( int block_xx = block_x * factor_x + ix; int block_yy = block_y * factor_y + iy; if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { - float err = static_cast(comparator_->CompareBlock(*img, ix, iy, candidate_block)); + float err = static_cast(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask)); max_err = std::max(max_err, err); } } @@ -552,8 +555,21 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace -void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early) +void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, + const double target_mul, bool stop_early) { + const int ncomp = jpg.components.size(); + if (ncomp != 3) return; + + std::vector block_[3]; + for (int c = 0; c < 3; c++) + { + int block_height = img->component(c).width_in_blocks(); + int block_width = img->component(c).height_in_blocks(); + + block_[c].resize(block_height * block_width); + } + // we only support factor_x == factor_y == 1 const int width = img->width(); const int height = img->height(); @@ -564,19 +580,19 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); const int num_blocks = block_width * block_height; - comparator_->StartBlockComparisons(); // TOBEREMOVE:³õʼ»¯Ò»Ð©²ÎÊý - std::vector orig_block_batch(num_blocks * kBlockSize); // [block_r block_g block_b] - std::vector block_batch(num_blocks * kBlockSize); // [block_r block_g block_b] + comparator_->StartBlockComparisons(); // ³õʼ»¯Ò»Ð©²ÎÊý£¬Ö÷ÒªÊǶÔԭͼ½øÐÐһЩ´¦Àí + std::vector orig_batch(num_blocks * kBlockSize); // [block_r block_g block_b] + std::vector mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b] // step 1 »ñÈ¡ËùÓÐblock list for (int block_y = 
0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; - coeff_t *block = &block_batch[block_ix * kBlockSize]; + coeff_t *orig_block = &orig_batch[block_ix * kBlockSize]; + coeff_t *mayout_block = &mayout_batch[block_ix * kBlockSize]; for (int c = 0; c < 3; ++c) { - img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); + img->component(c).GetCoeffBlock(block_x, block_y, &mayout_block[c * kDCTBlockSize]); const JPEGComponent& comp = jpg.components[c]; int jpg_block_ix = block_y * comp.width_in_blocks + block_x; @@ -595,15 +611,13 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im { output_order_gpu.resize(num_blocks * kBlockSize); output_order = output_order_gpu.data(); - clComputeBlockZeroingOrder(orig_block_batch.data(), - block_batch.data(), - comp->imgOpsinDynamicsBlockList.data(), - comp->imgMaskXyzScaleBlockList.data(), - output_order_gpu.data(), - num_blocks, - comparator_->BlockErrorLimit()); - - + clComputeBlockZeroingOrder(orig_batch.data(), + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + mayout_batch.data(), + num_blocks, + comparator_->BlockErrorLimit(), + output_order_gpu.data()); } if (!g_useOpenCL || g_checkOpenCL) { @@ -611,8 +625,8 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im output_order = output_order_cpu.data(); for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize]; - coeff_t *block = &block_batch[block_ix * kBlockSize]; + coeff_t *orig_block = &orig_batch[block_ix * kBlockSize]; + coeff_t *block = &mayout_batch[block_ix * kBlockSize]; std::vector block_order; ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, 
&block_order); @@ -1101,7 +1115,7 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, img.ApplyGlobalQuantization(best_q); if (!downsample) { - SelectFrequencyMasking(jpg, &img, 7, 1.0, false); + SelectFrequencyMasking(jpg, &img, 7, 1.0, false); } else { const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f; SelectFrequencyMasking(jpg, &img, 1, ymul, false); From 6482c6784c651d651e8e355834348f0a770efc07 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 18 May 2017 20:03:49 +0800 Subject: [PATCH 084/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AE=BF=E9=97=AE?= =?UTF-8?q?=E6=8E=A5=E5=8F=A3=EF=BC=8C=E4=B8=BB=E8=A6=81=E7=94=A8=E4=BA=8E?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli/output_image.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/guetzli/output_image.h b/guetzli/output_image.h index 1018eeac..9c9f935a 100644 --- a/guetzli/output_image.h +++ b/guetzli/output_image.h @@ -37,6 +37,8 @@ class OutputImageComponent { int width_in_blocks() const { return width_in_blocks_; } int height_in_blocks() const { return height_in_blocks_; } const coeff_t* coeffs() const { return &coeffs_[0]; } + const uint16_t* pixels() const { return &pixels_[0]; } + size_t pixels_size() const { return pixels_.size(); } const int* quant() const { return &quant_[0]; } bool IsAllZero() const; From c5a08a1b84e7f471bc6fb656ffb64e645174b32c Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 18 May 2017 20:06:01 +0800 Subject: [PATCH 085/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=A0=A1=E9=AA=8C?= =?UTF-8?q?=E5=8E=9F=E5=9B=BE=E6=95=B0=E6=8D=AE=E5=8F=98=E5=8C=96=E7=9A=84?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=EF=BC=8Cto=20be=20delete?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli/processor.cc | 98 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 10 deletions(-) diff --git 
a/guetzli/processor.cc b/guetzli/processor.cc index e4b616e4..ffaf025d 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -59,7 +59,7 @@ class Processor { void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, - bool stop_early); + bool stop_early, const OutputImage &img2); void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, @@ -72,7 +72,7 @@ class Processor { void ComputeBlockZeroingOrder( const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, const int factor_x, - const int factor_y, const uint8_t comp_mask, OutputImage* img, + const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2, std::vector* output_order); void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], @@ -354,7 +354,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, const float target_mul_low = 0.95f; QuantData best = TryQuantMatrix(jpg_in, target_mul_high, best_q, img); -/* + for (;;) { int q_next[3][kDCTBlockSize]; if (!qgen.GetNext(q_next)) { @@ -370,7 +370,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, } } } -*/ + memcpy(&best_q[0][0], &best.q[0][0], kBlockSize * sizeof(best_q[0][0])); GUETZLI_LOG(stats_, "\n%s selected quantization matrix:\n", downsample ? 
"YUV420" : "YUV444"); @@ -383,7 +383,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, void Processor::ComputeBlockZeroingOrder( const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, const int factor_x, - const int factor_y, const uint8_t comp_mask, OutputImage* img, + const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage &img2, std::vector* output_order) { static const uint8_t oldCsf[kDCTBlockSize] = { 10, 10, 20, 40, 60, 70, 80, 90, @@ -420,6 +420,19 @@ void Processor::ComputeBlockZeroingOrder( coeff_t processed_block[kBlockSize]; memcpy(processed_block, block, sizeof(processed_block)); comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); + + bool bCheck = false; + uint8_t orig_rgb[3][16 * 16] = { 0 }; + if (bCheck) + { + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c) && factor_x == 2) { + if ((block_x + 1) * factor_x * 8 > img->width()) continue; + img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, orig_rgb[c], 1); + } + } + } + while (!input_order.empty()) { float best_err = 1e17f; int best_i = 0; @@ -451,6 +464,36 @@ void Processor::ComputeBlockZeroingOrder( best_err = max_err; best_i = i; } + + if (bCheck) + { + // ÿ´Î¶¼Òª»Ö¸´Ò»Ï¿´¿´ + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); + } + } + // ¿´¿´ÏàÁÙ¿éÊDz»Êǻָ´ÁË + uint8_t last_rgb[3][16 * 16] = { 0 }; + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c) && factor_x == 2) { + if ((block_x + 1) * factor_x * 8 > img->width()) continue; + img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, last_rgb[c], 1); + } + } + int count = 0; + for (int c = 0; c < 3; c++) { + for (int k = 0; factor_x == 2 && k < 16 * 16; k++) { + if (last_rgb[c][k] != orig_rgb[c][k]) { + count++; + } + } + } + if (count > 0) + { + 
LogError("misstake in processing %d:%d block=%d:%d\r\n", count, 16 * 16, block_x, block_y); + } + } } int idx = input_order[best_i].first; processed_block[idx] = 0; @@ -483,6 +526,23 @@ void Processor::ComputeBlockZeroingOrder( block_x, block_y, &block[c * kDCTBlockSize]); } } + + if (bCheck) + { + // ȫͼ¼ì²éһϠ+ for (int c = 0; c < 3; c++) + { + int size = img->component(c).pixels_size(); + if (!(comp_mask & (1 << c))) continue; + for (int k = 0; k < size && factor_x == 2; k++) + { + if (img2.component(c).pixels()[k] != img->component(c).pixels()[k]) + { + LogError("misstake in restore\r\n"); + } + } + } + } } namespace { @@ -770,7 +830,8 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, - bool stop_early) { + bool stop_early, + const OutputImage& img2) { const int width = img->width(); const int height = img->height(); const int ncomp = jpg.components.size(); @@ -809,7 +870,8 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, } block_order.clear(); ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, - factor_y, comp_mask, img, &block_order); + factor_y, comp_mask, img, img2, &block_order); + candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); for (size_t i = 0; i < block_order.size(); ++i) { candidate_coeffs.push_back(block_order[i].idx); @@ -1114,12 +1176,28 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, img.CopyFromJpegData(jpg); img.ApplyGlobalQuantization(best_q); + OutputImage img2(jpg.width, jpg.height); + img2.CopyFromJpegData(jpg); + img2.ApplyGlobalQuantization(best_q); + + for (int c = 0; c < 3; c++) + { + int size = img.component(c).pixels_size(); + for (int k = 0; k < size; k++) + { + if (img2.component(c).pixels()[k] != img.component(c).pixels()[k]) + { + 
LogError("fdjsalfjlkadsfdsafjdsfjdlsajdklsjf\r\n"); + } + } + } + if (!downsample) { - SelectFrequencyMasking(jpg, &img, 7, 1.0, false); + SelectFrequencyMasking(jpg, &img, 7, 1.0, false, img2); } else { const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f; - SelectFrequencyMasking(jpg, &img, 1, ymul, false); - SelectFrequencyMasking(jpg, &img, 6, 1.0, true); + SelectFrequencyMasking(jpg, &img, 1, ymul, false, img2); + SelectFrequencyMasking(jpg, &img, 6, 1.0, true, img2); } } From d587e6608b409311163878359fb1dde0035652cb Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 18 May 2017 20:08:37 +0800 Subject: [PATCH 086/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0factor=5Fx=20=3D=20fa?= =?UTF-8?q?ctor=5Fy=20=3D=202=E6=97=B6=E7=9A=84batch=E5=8C=96=E5=8E=9F?= =?UTF-8?q?=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli_comparator.cpp | 175 ++++++++++++++++++----------- 1 file changed, 108 insertions(+), 67 deletions(-) diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index deaf4cdb..672054d5 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -32,13 +32,8 @@ void IDCTToPixel8x8(const uint8_t idct[8 * 8], uint16_t pixels_[8*8]) } } -void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_[16*16]) +void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_out[16*16], const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_) { - const int block_x = 0; - const int block_y = 0; - const int width_ = 16; - const int height_ = 16; - // Fill in the 10x10 pixel area in the subsampled image that will be the // basis of the upsampling. This area is enough to hold the 3x3 kernel of // the fancy upsampler around each pixel. @@ -74,30 +69,32 @@ void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_[16*16]) // block by computing the inverse of the fancy upsampler. 
const int y1 = std::max(y0 - 1, 0); const int x1 = std::max(x0 - 1, 0); - subsampled[ix] = (pixels_[y0 * width_ + x0] * 9 + - pixels_[y1 * width_ + x1] + - pixels_[y0 * width_ + x1] * -3 + - pixels_[y1 * width_ + x0] * -3) >> 2; + subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 + + pixel_orig[y1 * width_ + x1] + + pixel_orig[y0 * width_ + x1] * -3 + + pixel_orig[y1 * width_ + x0] * -3) >> 2; } } } - - // Determine area to update. - int xmin = std::max(block_x * 16 - 1, 0); - int xmax = std::min(block_x * 16 + 16, width_ - 1); - int ymin = std::max(block_y * 16 - 1, 0); - int ymax = std::min(block_y * 16 + 16, height_ - 1); + // Determine area to update. + int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0); + int xmax = std::min(block_x * 16 + 15, width_ - 1); + int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0); + int ymax = std::min(block_y * 16 + 15, height_ - 1); // Apply the fancy upsampler on the subsampled block. for (int y = ymin; y <= ymax; ++y) { const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize; const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize; - uint16_t* rowptr = &pixels_[y * width_]; for (int x = xmin; x <= xmax; ++x) { const int x0 = (x & ~1) / 2 - block_x * 8 + 1; const int dx = (x & 1) * 2 - 1; const int ix = x0 + y0; - rowptr[x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 + + + int out_x = x - xmin; + int out_y = y - ymin; + + pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 + subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4; } } @@ -149,20 +146,20 @@ void YUVToImage(uint8_t yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize { for (int x = inside_x; x < xsize; x++) { - int idx = y * 8 + (inside_x - 1); - r[y * 8 + x] = r[idx]; - g[y * 8 + x] = g[idx]; - b[y * 8 + x] = b[idx]; + int idx = y * xsize + (inside_x - 1); + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; } } for (int y = inside_y; y < ysize; y++) { for (int x = 0; 
x < xsize; x++) { - int idx = (inside_y - 1) * 8 + x; - r[y * 8 + x] = r[idx]; - g[y * 8 + x] = g[idx]; - b[y * 8 + x] = b[idx]; + int idx = (inside_y - 1) * xsize + x; + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; } } } @@ -217,13 +214,13 @@ void BlockToImage(const coeff_t block[8*8*3], float* r, float* g, float* b, int } } -void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv) +void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv, const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_) { uint8_t idct[8 * 8]; CoeffToIDCT(&block[0], &idct[0]); uint16_t pixels[16 * 16]; - IDCTToPixel16x16(idct, pixels); + IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_); PixelToYUV(pixels, yuv, 16, 16); } @@ -285,6 +282,7 @@ typedef struct __channel_info_t int factor; int block_width; int block_height; + const uint16_t *pixel; }channel_info; void ComputeBlockFacor(const coeff_t* candidate_block, @@ -403,10 +401,10 @@ namespace guetzli } // imgÊÇÈ«¾ÖÓÅ»¯ºóµÄͼÏñ£¬ÎÒÃÇͨ¹ýcoeff_tÊý¾Ý·´Ëã³öÀ´rgb - int border_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8; - int border_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8; + int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8; + int inside_y = block_y_ * 8 + 8 > height_ ? 
height_ - block_y_ * 8 : 8; std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); - BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), border_x, border_y); + BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), inside_x, inside_y); /* { // ¿ÉÄÜ»¹ÓÐÎÊÌ⣬ÎÒÃÇ×öÒ»¸öУÑé @@ -471,18 +469,18 @@ namespace guetzli mayout_channel[c].block_height = img.component(c).height_in_blocks(); mayout_channel[c].block_width = img.component(c).width_in_blocks(); mayout_channel[c].factor = img.component(c).factor_x(); + mayout_channel[c].pixel = img.component(c).pixels(); } - uint8_t yuv16x16[3 * 16 * 16]; // factor 2 mode output image - uint8_t yuv8x8[3 * 8 * 8]; // factor 1 mode output image + uint8_t yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uint8_t yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image // ²»¹Ücomp_maskÈçºÎ£¬×ª»»ÎªRGB×ÜÊÇÐèÒªµÄ for (int c = 0; c < 3; c++) { if (mayout_channel[c].factor == 1) { if (factor == 1) { // channel_factor == factor ˵Ã÷Òª½éÈëÔËË㣬²ÉÓÃcandidateÖеÄϵÊý - //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x; - const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8; + const coeff_t * coeff_block = candidate_channel[c]; CoeffToYUV8x8(coeff_block, &yuv8x8[c]); } else { @@ -491,6 +489,12 @@ namespace guetzli int block_xx = block_x * factor + ix; int block_yy = block_y * factor + iy; + if (ix != off_x || iy != off_y) continue; + if (block_xx >= mayout_channel[c].block_width || + block_yy >= mayout_channel[c].block_height) + { + continue; + } int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8; CoeffToYUV8x8(coeff_block, &yuv8x8[c]); @@ -508,61 +512,98 @@ namespace guetzli int ix = block_x % mayout_channel[c].factor;; int iy = block_y % mayout_channel[c].factor; - int block_8x8idx = block_yy * mayout_channel[c].block_width + 
block_xx; - const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8; - CoeffToYUV16x16(coeff_block, &yuv16x16[c]); + int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; + const coeff_t * coeff_block = mayout_coeff[c] + block_16x16idx * 8 * 8; +/* + uint8_t ch[16 * 16] = { 0 }; + img.component(c).ToPixels(block_xx * 8, block_yy * 8, 16, 16, ch, 1); +*/ + CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_xx, block_yy, img.width(), img.height()); // copy YUV16x16 corner to YUV8x8 Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); } else { - //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x; - const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8; - CoeffToYUV16x16(coeff_block, &yuv16x16[c]); + const coeff_t * coeff_block = candidate_channel[c]; + CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_x, block_y, img.width(), img.height()); } } } if (factor == 1) { - int block_ix = getCurrentBlockIdx(); - const float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; std::vector< std::vector > rgb0_c; int block_8x8idx = GetOrigBlock(rgb0_c, 0, 0); +/* + uint8_t yuv[3 * 8 * 8]; - std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); - YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data()); - - double err = 0;// ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); + std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); + { + int block_x = block_x_ * factor_x_ + off_x; + int block_y = block_y_ * factor_y_ + off_y; + int xmin = 8 * block_x; + int ymin = 8 * block_y; - double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); + img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); - return err; + img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3); + img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3); + img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3); + } 
+*/ + int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8; + int inside_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8; + std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); + YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), 8, 8, inside_x, inside_y); +/* + int count = 0; + for (int i = 0; i < 64; i++) + { + if (rgb1_c[0][i] != rgb1_c2[0][i] || + rgb1_c[1][i] != rgb1_c2[1][i] || + rgb1_c[2][i] != rgb1_c2[2][i]) + { + count++; + } + } + if (count > 0) + { + LogError("fdjskafjdlasfj"); + } +*/ + return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); } else { - float rgb16x16[3][16 * 16]; - YUVToImage(yuv8x8, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, 16, 16); - - float max_err = 0; - // for (int iy = 0; iy < factor; ++iy) + int inside_x = block_x_ * 16 + 16 > width_ ? width_ - block_x_ * 16 : 16; + int inside_y = block_y_ * 16 + 16 > height_ ? height_ - block_y_ * 16 : 16; +/* + uint8_t yuv[3 * 8 * 8]; + std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); { - //for (int ix = 0; ix < factor; ++ix) - { - int ix = off_x; - int iy = off_y; - std::vector< std::vector > rgb0_c; - int block_8x8idx = GetOrigBlock(rgb0_c, ix, iy); - if (block_8x8idx < 0) return max_err;// continue; + int block_x = block_x_ * factor_x_ + off_x; + int block_y = block_y_ * factor_y_ + off_y; + int xmin = 8 * block_x; + int ymin = 8 * block_y; - std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); - Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), ix, iy); + img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); - float err = ComputeImage8x8Block(rgb0_c, rgb1_c, getCurrentBlock8x8Idx(0, 0)); - max_err = std::max(max_err, err); - } + img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3); + img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3); + img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3); } - return max_err; + +*/ + float rgb16x16[3][16 * 16]; + YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], 
rgb16x16[2], 16, 16, inside_x, inside_y); + + std::vector< std::vector > rgb0_c; + int block_8x8idx = GetOrigBlock(rgb0_c, off_x, off_y); + + std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); + Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), off_x, off_y); + + return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); } } From 8d281104038aad564d6b356e1209fa362eb6c970 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 19 May 2017 22:09:46 +0800 Subject: [PATCH 087/189] =?UTF-8?q?=E7=BF=BB=E8=AF=91ComputeBlockEx2?= =?UTF-8?q?=E4=B8=BAOpenCL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 636 ++++++++++++++++++++++++++++- clguetzli/clguetzli.cpp | 69 +++- clguetzli/clguetzli.h | 24 ++ clguetzli/clguetzli_comparator.cpp | 25 +- clguetzli/ocl.h | 3 +- 5 files changed, 714 insertions(+), 43 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index ab595dde..55167a07 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -2020,6 +2020,25 @@ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) } } + +void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) +{ + const int block_x = 0; + const int block_y = 0; + const int width_ = 8; + const int height_ = 8; + + for (int iy = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix) { + int x = 8 * block_x + ix; + int y = 8 * block_y + iy; + if (x >= width_ || y >= height_) continue; + int p = y * width_ + x; + pixels_[p] = idct[8 * iy + ix] << 4; + } + } +} +/* void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8]) { const int block_x = 0; @@ -2039,20 +2058,89 @@ void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8]) } } } +*/ + + +void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + // Fill in the 10x10 pixel area in the subsampled 
image that will be the + // basis of the upsampling. This area is enough to hold the 3x3 kernel of + // the fancy upsampler around each pixel. +#define kSubsampledEdgeSize 10 + ushort subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize]; + for (int j = 0; j < kSubsampledEdgeSize; ++j) { + // The order we fill in the rows is: + // 8 rows intersecting the block, row below, row above + const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2); + for (int i = 0; i < kSubsampledEdgeSize; ++i) { + // The order we fill in each row is: + // 8 pixels within the block, left edge, right edge + const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) + + (i < 9 ? i + 1 : 0)); + const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2); + if (x0 < 0) { + subsampled[ix] = subsampled[ix + 1]; + } + else if (y0 < 0) { + subsampled[ix] = subsampled[ix + kSubsampledEdgeSize]; + } + else if (x0 >= width_) { + subsampled[ix] = subsampled[ix - 1]; + } + else if (y0 >= height_) { + subsampled[ix] = subsampled[ix - kSubsampledEdgeSize]; + } + else if (i < 8 && j < 8) { + subsampled[ix] = idct[j * 8 + i] << 4; + } + else { + // Reconstruct the subsampled pixels around the edge of the current + // block by computing the inverse of the fancy upsampler. + const int y1 = max(y0 - 1, 0); + const int x1 = max(x0 - 1, 0); + subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 + + pixel_orig[y1 * width_ + x1] + + pixel_orig[y0 * width_ + x1] * -3 + + pixel_orig[y1 * width_ + x0] * -3) >> 2; + } + } + } + // Determine area to update. + int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0); + int xmax = min(block_x * 16 + 15, width_ - 1); + int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0); + int ymax = min(block_y * 16 + 15, height_ - 1); + + // Apply the fancy upsampler on the subsampled block. 
+ for (int y = ymin; y <= ymax; ++y) { + const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize; + const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize; + for (int x = xmin; x <= xmax; ++x) { + const int x0 = (x & ~1) / 2 - block_x * 8 + 1; + const int dx = (x & 1) * 2 - 1; + const int ix = x0 + y0; + + int out_x = x - xmin; + int out_y = y - ymin; + + pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 + + subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4; + } + } +} -void PixelToYUV(const ushort pixels_[8*8], uchar out[8*8]) +// out = [YUVYUV....YUVYUV] +void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/) { - const int stride = 3; + const int stride = 3; - for (int y = 0; y < 8; ++y) - { - for (int x = 0; x < 8; ++x) - { - int px = y * 8 + x; - *out = (uchar) ((pixels_[px] + 8 - (x & 1)) >> 4); - out += stride; - } - } + for (int y = 0; y < xsize; ++y) { + for (int x = 0; x < ysize; ++x) { + int px = y * xsize + x; + *out = (uchar)((pixels_[px] + 8 - (x & 1)) >> 4); + out += stride; + } + } } __constant static int kCrToRedTable[256] = { @@ -2242,10 +2330,10 @@ __constant static uchar kRangeLimitLut[4 * 256] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }; -void YUVToRGB(__private uchar pixelBlock[3*8*8]) +void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/) { __constant uchar* kRangeLimit = kRangeLimitLut + 384; - for (int i = 0; i < 64; i++) + for (int i = 0; i < size; i++) { uchar *pixel = &pixelBlock[i * 3]; @@ -2517,8 +2605,44 @@ __constant static double kSrgb8ToLinearTable[256] = { 255.000000, }; + +void YUVToImage(uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/) +{ + YUVToRGB(yuv, xsize * ysize); + + const __constant double* lut = kSrgb8ToLinearTable; + + for (int i = 0; i < xsize * ysize; i++) + { + r[i] = lut[yuv[3 * i]]; + 
g[i] = lut[yuv[3 * i + 1]]; + b[i] = lut[yuv[3 * i + 2]]; + } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < xsize; x++) + { + int idx = y * xsize + (inside_x - 1); + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; + } + } + for (int y = inside_y; y < ysize; y++) + { + for (int x = 0; x < xsize; x++) + { + int idx = (inside_y - 1) * xsize + x; + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; + } + } +} + + // chrisk todo -void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8]) +void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) { uchar idct[3][8 * 8]; CoeffToIDCT(&block[0], &idct[0]); @@ -2526,16 +2650,16 @@ void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], fl CoeffToIDCT(&block[8 * 8 * 2], &idct[2]); ushort pixels[3][8 * 8]; - IDCTToPixel(&idct[0], &pixels[0]); - IDCTToPixel(&idct[1], &pixels[1]); - IDCTToPixel(&idct[2], &pixels[2]); + IDCTToPixel8x8(&idct[0], &pixels[0]); + IDCTToPixel8x8(&idct[1], &pixels[1]); + IDCTToPixel8x8(&idct[2], &pixels[2]); uchar yuv[8 * 8 * 3]; - PixelToYUV(&pixels[0], &yuv[0]); - PixelToYUV(&pixels[1], &yuv[1]); - PixelToYUV(&pixels[2], &yuv[2]); + PixelToYUV(&pixels[0], &yuv[0], 8, 8); + PixelToYUV(&pixels[1], &yuv[1], 8, 8); + PixelToYUV(&pixels[2], &yuv[2], 8, 8); - YUVToRGB(yuv); + YUVToRGB(yuv, 8 * 8); for (int i = 0; i < 8 * 8; i++) { @@ -2543,6 +2667,110 @@ void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], fl g[i] = kSrgb8ToLinearTable[yuv[3 * i + 1]]; b[i] = kSrgb8ToLinearTable[yuv[3 * i + 2]]; } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < 8; x++) + { + int idx = y * 8 + (inside_x - 1); + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } + for (int y = inside_y; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int 
idx = (inside_y - 1) * 8 + x; + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } +} + +void CoeffToYUV16x16(const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + uchar idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + uchar pixels[16 * 16]; + IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_); + + PixelToYUV(pixels, yuv, 16, 16); +} + +void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + coeff_t b[8 * 8]; + for (int i = 0; i < 8 * 8; i++) + { + b[i] = block[i]; + } + CoeffToYUV16x16(b, yuv, pixel_orig, block_x, block_y, width_, height_); +} + +void CoeffToYUV8x8(const coeff_t block[8 * 8], uchar *yuv) +{ + uchar idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + ushort pixels[8 * 8]; + IDCTToPixel8x8(idct, pixels); + + PixelToYUV(pixels, yuv, 8, 8); +} + +void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv) +{ + coeff_t b[8 * 8]; + for (int i = 0; i < 8 * 8; i++) + { + b[i] = block[i]; + } + + CoeffToYUV8x8(b, yuv); +} + +void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + yuv16x16[idx16 * 3] = yuv8x8[idx * 3]; + } + } +} + +void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + yuv8x8[idx * 3] = yuv16x16[idx16 * 3]; + } + } +} + +void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x 
< 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + r[idx] = rgb16x16[0][idx16]; + g[idx] = rgb16x16[1][idx16]; + b[idx] = rgb16x16[2][idx16]; + } + } } void Convolution(size_t xsize, size_t ysize, @@ -2714,6 +2942,14 @@ void floatcopy(float *dst, float *src, int size) } } +void floatcopy_g(float *dst, __global float *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + void CalcOpsinDynamicsImage(ocl_channels rgb) { float rgb_blurred[3][kDCTBlockSize]; @@ -2724,6 +2960,60 @@ void CalcOpsinDynamicsImage(ocl_channels rgb) OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); } +void CalcOpsinDynamicsImage2(float rgb[3][kDCTBlockSize]) +{ + float rgb_blurred[3][kDCTBlockSize]; + for (int i = 0; i < 3; i++) + { + BlurEx(rgb[i], 8, 8, 1.1, 0, rgb_blurred[i]); + } + OpsinDynamicsImageBlock(rgb[0], rgb[1], rgb[2], rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); +} + +double ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCTBlockSize], __global float* mask_scale_block) +{ + CalcOpsinDynamicsImage2(rgb0_c); + CalcOpsinDynamicsImage2(rgb1_c); + + float rgb0[3][kDCTBlockSize]; + float rgb1[3][kDCTBlockSize]; + + floatcopy(rgb0, rgb0_c, 3 * kDCTBlockSize); + floatcopy(rgb1, rgb1_c, 3 * kDCTBlockSize); + + MaskHighIntensityChangeBlock(rgb0[0], rgb0[1], rgb0[2], + rgb1[0], rgb1[1], rgb1[2], + rgb0_c[0], rgb0_c[1], rgb0_c[2], + rgb1_c[0], rgb1_c[1], rgb1_c[2], + 8, 8); + + // ÕâÀïΪɶҪ°Ñfloatת³Édouble²ÅÄܼÌÐø×ö¼ÆË㣿 + double b0[3 * kDCTBlockSize]; // + double b1[3 * kDCTBlockSize]; + for (int c = 0; c < 3; ++c) { + for (int ix = 0; ix < kDCTBlockSize; ++ix) { + b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; + b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; + } + } + + double diff_xyz_dc[3] = { 0.0 }; + double diff_xyz_ac[3] = { 0.0 }; + double diff_xyz_edge_dc[3] = { 0.0 }; + ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, 
diff_xyz_edge_dc); + + double diff = 0.0; + double diff_edge = 0.0; + + for (int c = 0; c < 3; ++c) { + diff += diff_xyz_dc[c] * mask_scale_block[c]; + diff += diff_xyz_ac[c] * mask_scale_block[c]; + diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c]; + } + const double kEdgeWeight = 0.05; + return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); +} + // strong todo // candidate_block [R....R][G....G][B....B] // orig_image_block [RR..RRGG..GGBB..BB] @@ -2748,7 +3038,7 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, rgb1_c.r = &image_block[0]; rgb1_c.g = &image_block[kDCTBlockSize]; rgb1_c.b = &image_block[2 * kDCTBlockSize]; - BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b); + BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b, 8, 8); CalcOpsinDynamicsImage(rgb0_c); CalcOpsinDynamicsImage(rgb1_c); @@ -2870,3 +3160,305 @@ __kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch, } } } + +typedef struct __channel_info_t +{ + int factor; + int block_width; + int block_height; + __global const coeff_t *coeff; + __global const ushort *pixel; +}channel_info; + +// return the count of Non-zero item +int MakeInputOrderEx(coeff_t *block, coeff_t *orig_block, IntFloatPairList *input_order, int block_size) +{ + int size = 0; + for (int c = 0; c < 3; ++c) { + for (int k = 1; k < block_size; ++k) { + int idx = c * block_size + k; + if (block[idx] != 0) { + float score = abs(orig_block[idx]) * csf[idx] + bias[idx]; + size = list_push_back(input_order, idx, score); + } + } + } + return SortInputOrder(input_order->pData, size); +} + +int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], + __global float *orig_image_batch, + int width_, + int height_, + int block_x, int block_y, + int factor, + int off_x, int off_y) +{ + int block_xx = block_x * factor + off_x; + int block_yy = block_y * factor + off_y; + if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1; + + const int block8_width = 
(width_ + 8 - 1) / 8; + + int block_ix = block_yy * block8_width + block_xx; + + __global const float* block_opsin = &orig_image_batch[block_ix * 3 * kDCTBlockSize]; + for (int i = 0; i < 3; i++) { + for (int k = 0; k < kDCTBlockSize; k++) { + rgb0_c[i][k] = block_opsin[i * kDCTBlockSize + k]; + } + } + + return block_ix; +} + +double CompareBlockFactor(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height, + const int factor) +{ + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + if (factor == 1) { + const coeff_t *coeff_block = candidate_channel[c]; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + } + else { + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + ///if (ix != off_x || iy != off_y) continue; + if (block_xx >= mayout_channel[c].block_width || + block_yy >= mayout_channel[c].block_height) + { + continue; + } + int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; + CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]); + + // copy YUV8x8 to YUV1616 corner + Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); + } + } + } + } + else { + if (factor == 1) { + int block_xx = block_x / mayout_channel[c].factor; + int block_yy = block_y / mayout_channel[c].factor; + int ix = block_x % mayout_channel[c].factor;; + int iy = block_y % mayout_channel[c].factor; + + int block_16x16idx = block_yy * 
mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; + + CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_xx, block_yy, + image_width, + image_height); + + // copy YUV16x16 corner to YUV8x8 + Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); + } + else { + const coeff_t * coeff_block = candidate_channel[c]; + CoeffToYUV16x16(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_x, block_y, + image_width, + image_height); + } + } + } + + if (factor == 1) + { + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0); + + int inside_x = block_x * 8 + 8 > image_width ? image_width - block_x * 8 : 8; + int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8; + float rgb1_c[3][kDCTBlockSize]; + + YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y); + + return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + } + else + { + int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16; + int inside_y = block_y * 16 + 16 > image_height ? 
image_height - block_y * 16 : 16; + + float rgb16x16[3][16 * 16]; + YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y); + + double max_err = 0; + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + if (block_xx * 8 >= image_width || + block_yy * 8 >= image_height) + { + continue; + } + + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy); + + float rgb1_c[3][kDCTBlockSize]; + Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy); + double err = ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); + max_err = max(max_err, err); + } + } + return max_err; + } +} + +// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é +__kernel void clComputeBlockZeroingOrderFactor( + __global const coeff_t *orig_batch_0, // ԭʼͼÏñϵÊý + __global const coeff_t *orig_batch_1, // ԭʼͼÏñϵÊý + __global const coeff_t *orig_batch_2, // ԭʼͼÏñϵÊý + __global const float *orig_image_batch, // ԭʼͼÏñpregamma + __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý + int image_width, + int image_height, + __global const coeff_t *mayout_batch_0, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + __global const coeff_t *mayout_batch_1, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + __global const coeff_t *mayout_batch_2, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + __global const ushort *mayout_pixel_0, + __global const ushort *mayout_pixel_1, + __global const ushort *mayout_pixel_2, + channel_info mayout_channel_0, + channel_info mayout_channel_1, + channel_info mayout_channel_2, + int factor, // µ±Ç°²ÎÓëÔËËãµÄfactor + int comp_mask, // µ±Ç°²ÎÓëÔËËãµÄchannel + float BlockErrorLimit, + __global CoeffData *output_order_list/*out*/) +{ + const int block_x = get_global_id(0); + const int block_y = get_global_id(1); +#define kComputeBlockSize (kBlockSize * 3) + + channel_info orig_channel[3]; + orig_channel[0].coeff = orig_batch_0; + 
orig_channel[1].coeff = orig_batch_1; + orig_channel[2].coeff = orig_batch_2; + + channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 }; + mayout_channel[0].coeff = mayout_batch_0; + mayout_channel[1].coeff = mayout_batch_1; + mayout_channel[2].coeff = mayout_batch_2; + mayout_channel[0].pixel = mayout_pixel_0; + mayout_channel[1].pixel = mayout_pixel_1; + mayout_channel[2].pixel = mayout_pixel_2; + + int block_idx = 0; // ¸ù¾ÝÏÂÃæmaskÃüÖеÄchannelÀ´¼ÆËãindx + + coeff_t mayout_block[kComputeBlockSize] = { 0 }; + coeff_t orig_block[kComputeBlockSize] = { 0 }; + for (int c = 0; c < 3; c++) { + if (comp_mask & (1< 0) + { + float best_err = 1e17f; + int best_i = 0; + for (int i = 0; i < min(3, input_order.size); i++) + { + coeff_t candidate_block[kComputeBlockSize]; + for (int i = 0; i < kComputeBlockSize; i++) { + candidate_block[i] = processed_block[i]; + } + + const int idx = input_order.pData[i].idx; + + candidate_block[idx] = 0; + + float max_err = CompareBlockFactor(mayout_channel, + candidate_block, + block_x, + block_y, + orig_image_batch, + mask_scale, + image_width, + image_height, + factor); + if (max_err < best_err) + { + best_err = max_err; + best_i = i; + } + } + + int idx = input_order.pData[best_i].idx; + processed_block[idx] = 0; + list_erase(&input_order, best_i); + + list_push_back(&output_order, idx, best_err); + } + + // ×¢Òâoutput_orderÕâÀïµÄresize¾ÍÊǰÑβ²¿µÄÖÃλ0 + float min_err = 1e10; + for (int i = output_order.size - 1; i >= 0; --i) { + min_err = min(min_err, output_order.pData[i].err); + output_order.pData[i].err = min_err; + } + + __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; + + int out_count = 0; + for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) + { + // ¹ýÂ˽ϴóµÄerr£¬Õⲿ·Ö½øÈëºó¶Ë¼ÆËãûÓÐÒâÒå + if (output_order.pData[i].err <= BlockErrorLimit) + { + output_block[out_count].idx = output_order.pData[i].idx; + output_block[out_count].err = 
output_order.pData[i].err; + out_count++; + } + } +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 5ab406e7..67a76300 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -63,7 +63,8 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err); ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err); ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err); - ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderFactor", &err); return ocl; } @@ -1217,7 +1218,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count); cl_float clBlockErrorLimit = BlockErrorLimit; - cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER]; + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch); @@ -1249,5 +1250,69 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clReleaseMemObject(mem_mask_scale_batch); clReleaseMemObject(mem_mayout_batch); clReleaseMemObject(mem_output_order_batch); +} + +void clComputeBlockZeroingOrderFactor( + const guetzli::coeff_t *orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + int image_width, + int image_height, + const channel_info *mayout_channel[3], + int factor, + int comp_mask, + int block_width, 
+ int block_height, + float BlockErrorLimit, + guetzli::CoeffData *output_order_batch) +{ + return; +/* + using namespace guetzli; + + int item_count = 3 * kDCTBlockSize * size; + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + + cl_mem mem_orig_batch = ocl.allocMem(sizeof(coeff_t) * item_count, orig_batch); + cl_mem mem_orig_image_batch = ocl.allocMem(sizeof(float) * item_count, orig_image_batch); + cl_mem mem_mask_scale_batch = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch); + cl_mem mem_mayout_batch = ocl.allocMem(sizeof(coeff_t) * item_count, mayout_batch); + cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count); + cl_float clBlockErrorLimit = BlockErrorLimit; + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mayout_batch); + clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_batch); + + size_t globalWorkSize[1] = { size }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); + } + + CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + memcpy(output_order_batch, result, sizeof(CoeffData) * item_count); + + 
clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, sizeof(CoeffData) * item_count, NULL, NULL); + clFinish(ocl.commandQueue); + + clReleaseMemObject(mem_orig_batch); + clReleaseMemObject(mem_orig_image_batch); + clReleaseMemObject(mem_mask_scale_batch); + clReleaseMemObject(mem_mayout_batch); + clReleaseMemObject(mem_output_order_batch); +*/ } \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 459110de..1cd97727 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -23,6 +23,30 @@ void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch, float BlockErrorLimit, guetzli::CoeffData *output_order_batch); +typedef struct __channel_info_t +{ + int factor; + int block_width; + int block_height; + const guetzli::coeff_t *coeff; + const uint16_t *pixel; +}channel_info; + +void clComputeBlockZeroingOrderFactor( + const guetzli::coeff_t *orig_batch[3], + const float *orig_image_batch, + const float *mask_scale, + int image_width, + int image_height, + const guetzli::coeff_t *mayout_batch[3], + const channel_info *channel[3], + int factor, + int comp_mask, + int block_width, + int block_height, + float BlockErrorLimit, + guetzli::CoeffData *output_order_batch); + void clMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 672054d5..1fa6a886 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -276,26 +276,16 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float } } } - +/* typedef struct __channel_info_t { int factor; int block_width; int block_height; + const coeff_t *coeff; const uint16_t *pixel; }channel_info; - -void ComputeBlockFacor(const coeff_t* candidate_block, - const coeff_t * mayout_coeff[3], - const channel_info mayout_channel[3], - const 
coeff_t * orig_coeff[3], - const int comp_mask, - int factor -) -{ - -} - +*/ namespace guetzli { ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, @@ -461,15 +451,14 @@ namespace guetzli const coeff_t *candidate_channel[3]; channel_info mayout_channel[3]; - const coeff_t *mayout_coeff[3]; for (int c = 0; c < 3; c++) { candidate_channel[c] = &candidate_block[c * 8 * 8]; - mayout_coeff[c] = img.component(c).coeffs(); mayout_channel[c].block_height = img.component(c).height_in_blocks(); mayout_channel[c].block_width = img.component(c).width_in_blocks(); mayout_channel[c].factor = img.component(c).factor_x(); - mayout_channel[c].pixel = img.component(c).pixels(); + mayout_channel[c].pixel = img.component(c).pixels(); + mayout_channel[c].coeff = img.component(c).coeffs(); } uint8_t yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image @@ -496,7 +485,7 @@ namespace guetzli continue; } int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; - const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8; + const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; CoeffToYUV8x8(coeff_block, &yuv8x8[c]); // copy YUV8x8 to YUV1616 corner @@ -513,7 +502,7 @@ namespace guetzli int iy = block_y % mayout_channel[c].factor; int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; - const coeff_t * coeff_block = mayout_coeff[c] + block_16x16idx * 8 * 8; + const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; /* uint8_t ch[16 * 16] = { 0 }; img.component(c).ToPixels(block_xx * 8, block_yy * 8, 16, 16, ch, 1); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index bcc8ef9c..94bb88b8 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -63,7 +63,8 @@ enum KernelName { KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, KERNEL_EDGEDETECTORLOWFREQ, - KERNEL_COMPUTEBLOCKZERONGORDER, + KERNEL_COMPUTEBLOCKZEROINGORDER, + KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR, KERNEL_COUNT, 
}; From 08db770c98a00e964b09fb0cd61ba7940254b791 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 20 May 2017 10:36:27 +0800 Subject: [PATCH 088/189] =?UTF-8?q?clComputeBlockZeroingOrderFactor?= =?UTF-8?q?=E8=B0=83=E8=AF=95=20=20=20=20=20Factor=3D=3D1=E6=97=B6?= =?UTF-8?q?=E9=AA=8C=E8=AF=81=E9=80=9A=E8=BF=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 26 +++++++--- clguetzli/clguetzli.cpp | 106 +++++++++++++++++++++++++++------------- clguetzli/clguetzli.h | 17 +++---- guetzli/processor.cc | 71 ++++++++++++++++++--------- 4 files changed, 146 insertions(+), 74 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 55167a07..42b9c489 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1464,6 +1464,7 @@ int SortInputOrder(DCTScoreData* input_order, int size) for (j = 1; j < size; j++) { tmp.idx = input_order[j].idx; tmp.err = input_order[j].err; + i = j - 1; while (i >= 0 && input_order[i].err > tmp.err) { input_order[i + 1].idx = input_order[i].idx; @@ -2942,7 +2943,7 @@ void floatcopy(float *dst, float *src, int size) } } -void floatcopy_g(float *dst, __global float *src, int size) +void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size) { for (int i = 0; i < size; i++) { @@ -3171,8 +3172,9 @@ typedef struct __channel_info_t }channel_info; // return the count of Non-zero item -int MakeInputOrderEx(coeff_t *block, coeff_t *orig_block, IntFloatPairList *input_order, int block_size) +int MakeInputOrderEx(coeff_t block[3*8*8], coeff_t orig_block[3*8*8], IntFloatPairList *input_order) { + const int block_size = 64; int size = 0; for (int c = 0; c < 3; ++c) { for (int k = 1; k < block_size; ++k) { @@ -3183,6 +3185,7 @@ int MakeInputOrderEx(coeff_t *block, coeff_t *orig_block, IntFloatPairList *inpu } } } + return SortInputOrder(input_order->pData, size); } @@ -3375,28 +3378,37 @@ __kernel void clComputeBlockZeroingOrderFactor( int block_idx = 
0; // ¸ù¾ÝÏÂÃæmaskÃüÖеÄchannelÀ´¼ÆËãindx - coeff_t mayout_block[kComputeBlockSize] = { 0 }; - coeff_t orig_block[kComputeBlockSize] = { 0 }; + coeff_t mayout_block[kComputeBlockSize] = { 1,20,160,78 }; + coeff_t orig_block[kComputeBlockSize] = { 2,190,78,78 }; + for (int c = 0; c < 3; c++) { if (comp_mask & (1< block_[3]; - for (int c = 0; c < 3; c++) - { - int block_height = img->component(c).width_in_blocks(); - int block_width = img->component(c).height_in_blocks(); - - block_[c].resize(block_height * block_width); - } - - // we only support factor_x == factor_y == 1 const int width = img->width(); const int height = img->height(); - const int factor_x = 1; - const int factor_y = 1; - + const int ncomp = jpg.components.size(); + const int last_c = Log2FloorNonZero(comp_mask); + if (static_cast(last_c) >= jpg.components.size()) return; + const int factor_x = img->component(last_c).factor_x(); + const int factor_y = img->component(last_c).factor_y(); const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); const int num_blocks = block_width * block_height; + comparator_->StartBlockComparisons(); // ³õʼ»¯Ò»Ð©²ÎÊý£¬Ö÷ÒªÊǶÔԭͼ½øÐÐһЩ´¦Àí - std::vector orig_batch(num_blocks * kBlockSize); // [block_r block_g block_b] - std::vector mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b] +// std::vector orig_batch(num_blocks * kBlockSize); // [block_r block_g block_b] +// std::vector mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b] +/* // step 1 »ñÈ¡ËùÓÐblock list for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { @@ -660,7 +651,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } } - +*/ // step 2 ¼ÆËãËùÓÐblockµÄϵÊýÆ«²î std::vector output_order_gpu; std::vector output_order_cpu; @@ -669,8 +660,40 @@ void 
Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im if (g_useOpenCL || g_checkOpenCL) { + channel_info orig_channel[3]; + channel_info mayout_channel[3]; + + for (int c = 0; c < 3; c++) + { + mayout_channel[c].factor = img->component(c).factor_x(); + mayout_channel[c].block_width = img->component(c).width_in_blocks(); + mayout_channel[c].block_height = img->component(c).height_in_blocks(); + mayout_channel[c].coeff = img->component(c).coeffs(); + mayout_channel[c].pixel = img->component(c).pixels(); + + orig_channel[c].factor = jpg.components[c].v_samp_factor; + orig_channel[c].block_width = jpg.components[c].width_in_blocks; + orig_channel[c].block_height = jpg.components[c].height_in_blocks; + orig_channel[c].coeff = jpg.components[c].coeffs.data(); + } output_order_gpu.resize(num_blocks * kBlockSize); output_order = output_order_gpu.data(); + + clComputeBlockZeroingOrderFactor(orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit(), + output_order); + +/* + output_order_gpu.resize(num_blocks * kBlockSize); + output_order = output_order_gpu.data(); + clComputeBlockZeroingOrder(orig_batch.data(), comp->imgOpsinDynamicsBlockList.data(), comp->imgMaskXyzScaleBlockList.data(), @@ -678,7 +701,10 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im num_blocks, comparator_->BlockErrorLimit(), output_order_gpu.data()); +*/ + } +/* if (!g_useOpenCL || g_checkOpenCL) { output_order_cpu.resize(num_blocks * kBlockSize); @@ -700,6 +726,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } } +*/ if (g_checkOpenCL) { int count = 0; @@ -1193,11 +1220,11 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, } if (!downsample) { - SelectFrequencyMasking(jpg, &img, 7, 1.0, false, img2); + SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, 
false); } else { const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f; - SelectFrequencyMasking(jpg, &img, 1, ymul, false, img2); - SelectFrequencyMasking(jpg, &img, 6, 1.0, true, img2); + SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false); + SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true); } } From d931558afce21e32e57c2707fbcdd6f3459fedaa Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 20 May 2017 10:48:21 +0800 Subject: [PATCH 089/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 42b9c489..d2293e17 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -2951,6 +2951,14 @@ void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size) } } +void coeffcopy(coeff_t *dst, coeff_t *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + void CalcOpsinDynamicsImage(ocl_channels rgb) { float rgb_blurred[3][kDCTBlockSize]; @@ -3378,8 +3386,8 @@ __kernel void clComputeBlockZeroingOrderFactor( int block_idx = 0; // ¸ù¾ÝÏÂÃæmaskÃüÖеÄchannelÀ´¼ÆËãindx - coeff_t mayout_block[kComputeBlockSize] = { 1,20,160,78 }; - coeff_t orig_block[kComputeBlockSize] = { 2,190,78,78 }; + coeff_t mayout_block[kComputeBlockSize] = { 0 }; + coeff_t orig_block[kComputeBlockSize] = { 0 }; for (int c = 0; c < 3; c++) { if (comp_mask & (1< 0) { @@ -3422,12 +3420,9 @@ __kernel void clComputeBlockZeroingOrderFactor( for (int i = 0; i < min(3, input_order.size); i++) { coeff_t candidate_block[kComputeBlockSize]; - for (int i = 0; i < kComputeBlockSize; i++) { - candidate_block[i] = processed_block[i]; - } + coeffcopy(candidate_block, processed_block, kComputeBlockSize); const int idx = input_order.pData[i].idx; - candidate_block[idx] = 0; float max_err = 
CompareBlockFactor(mayout_channel, From 0bda30e6dcc7d8cf8c218c2615bad51a2bc39593 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 20 May 2017 14:09:16 +0800 Subject: [PATCH 090/189] =?UTF-8?q?factor=202=E6=94=AF=E6=8C=81=E5=AE=8C?= =?UTF-8?q?=E6=88=90=20=E5=8F=91=E7=8E=B0=E4=B8=80=E4=B8=AAbug=EF=BC=8C?= =?UTF-8?q?=E5=A6=82=E6=9E=9C=E6=9C=80=E5=90=8E=E4=B8=80=E4=B8=AA=E5=9D=97?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E5=87=BA=E6=9D=A5err=E7=B3=BB=E6=95=B0?= =?UTF-8?q?=E4=B8=AA=E6=95=B0=E4=B8=BA0=E7=9A=84=E8=AF=9D=EF=BC=8CBackEnd?= =?UTF-8?q?=E5=A4=84=E7=90=86=E4=BC=9Acrash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 64 ++++++++++++------------- clguetzli/clguetzli_comparator.cpp | 3 +- guetzli/processor.cc | 77 +++++++++++------------------- 3 files changed, 62 insertions(+), 82 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index d2293e17..32d0864d 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1869,7 +1869,7 @@ __constant static float bias[192] = { // chrisk todo // return the count of Non-zero item -int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, IntFloatPairList *input_order, int block_size) +int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size) { int size = 0; for (int c = 0; c < 3; ++c) { @@ -2607,7 +2607,7 @@ __constant static double kSrgb8ToLinearTable[256] = { }; -void YUVToImage(uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/) +void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/) { YUVToRGB(yuv, xsize * ysize); @@ -2646,19 +2646,19 @@ void YUVToImage(uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* void BlockToImage(__private coeff_t 
block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) { uchar idct[3][8 * 8]; - CoeffToIDCT(&block[0], &idct[0]); - CoeffToIDCT(&block[8 * 8], &idct[1]); - CoeffToIDCT(&block[8 * 8 * 2], &idct[2]); + CoeffToIDCT(&block[0], idct[0]); + CoeffToIDCT(&block[8 * 8], idct[1]); + CoeffToIDCT(&block[8 * 8 * 2], idct[2]); ushort pixels[3][8 * 8]; - IDCTToPixel8x8(&idct[0], &pixels[0]); - IDCTToPixel8x8(&idct[1], &pixels[1]); - IDCTToPixel8x8(&idct[2], &pixels[2]); + IDCTToPixel8x8(idct[0], pixels[0]); + IDCTToPixel8x8(idct[1], pixels[1]); + IDCTToPixel8x8(idct[2], pixels[2]); uchar yuv[8 * 8 * 3]; - PixelToYUV(&pixels[0], &yuv[0], 8, 8); - PixelToYUV(&pixels[1], &yuv[1], 8, 8); - PixelToYUV(&pixels[2], &yuv[2], 8, 8); + PixelToYUV(pixels[0], &yuv[0], 8, 8); + PixelToYUV(pixels[1], &yuv[1], 8, 8); + PixelToYUV(pixels[2], &yuv[2], 8, 8); YUVToRGB(yuv, 8 * 8); @@ -2690,12 +2690,12 @@ void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], fl } } -void CoeffToYUV16x16(const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) { uchar idct[8 * 8]; CoeffToIDCT(&block[0], &idct[0]); - uchar pixels[16 * 16]; + ushort pixels[16 * 16]; IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_); PixelToYUV(pixels, yuv, 16, 16); @@ -2711,7 +2711,7 @@ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global CoeffToYUV16x16(b, yuv, pixel_orig, block_x, block_y, width_, height_); } -void CoeffToYUV8x8(const coeff_t block[8 * 8], uchar *yuv) +void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv) { uchar idct[8 * 8]; CoeffToIDCT(&block[0], &idct[0]); @@ -2834,8 +2834,8 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, // ian todo 
-void OpsinDynamicsImageBlock(float *r, float *g, float *b, - float *r_blurred, float *g_blurred, float *b_blurred, +void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b, + __private float *r_blurred, __private float *g_blurred, __private float *b_blurred, int size) { for (size_t i = 0; i < size; ++i) { @@ -2935,7 +2935,7 @@ typedef union ocl_channels_t float *ch[3]; }ocl_channels; -void floatcopy(float *dst, float *src, int size) +void floatcopy(float *dst, const float *src, int size) { for (int i = 0; i < size; i++) { @@ -2943,7 +2943,7 @@ void floatcopy(float *dst, float *src, int size) } } -void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size) +void coeffcopy_g(coeff_t *dst, const __global coeff_t *src, int size) { for (int i = 0; i < size; i++) { @@ -2951,7 +2951,7 @@ void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size) } } -void coeffcopy(coeff_t *dst, coeff_t *src, int size) +void coeffcopy(coeff_t *dst, const coeff_t *src, int size) { for (int i = 0; i < size; i++) { @@ -2969,7 +2969,7 @@ void CalcOpsinDynamicsImage(ocl_channels rgb) OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); } -void CalcOpsinDynamicsImage2(float rgb[3][kDCTBlockSize]) +void CalcOpsinDynamicsImage2(__private float rgb[3][kDCTBlockSize]) { float rgb_blurred[3][kDCTBlockSize]; for (int i = 0; i < 3; i++) @@ -2979,7 +2979,7 @@ void CalcOpsinDynamicsImage2(float rgb[3][kDCTBlockSize]) OpsinDynamicsImageBlock(rgb[0], rgb[1], rgb[2], rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); } -double ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCTBlockSize], __global float* mask_scale_block) +double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) { CalcOpsinDynamicsImage2(rgb0_c); CalcOpsinDynamicsImage2(rgb1_c); @@ -2987,8 +2987,8 @@ double 
ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCT float rgb0[3][kDCTBlockSize]; float rgb1[3][kDCTBlockSize]; - floatcopy(rgb0, rgb0_c, 3 * kDCTBlockSize); - floatcopy(rgb1, rgb1_c, 3 * kDCTBlockSize); + floatcopy(&rgb0[0][0], &rgb0_c[0][0], 3 * kDCTBlockSize); + floatcopy(&rgb1[0][0], &rgb1_c[0][0], 3 * kDCTBlockSize); MaskHighIntensityChangeBlock(rgb0[0], rgb0[1], rgb0[2], rgb1[0], rgb1[1], rgb1[2], @@ -3027,7 +3027,7 @@ double ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCT // candidate_block [R....R][G....G][B....B] // orig_image_block [RR..RRGG..GGBB..BB] // mask_scale[RGB] -float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, __global float* mask_scale_block) +float CompareBlockEx(coeff_t *candidate_block, __global const float* orig_image_block, __global const float* mask_scale_block) { float rgb0[3][kDCTBlockSize]; float rgb1[3][kDCTBlockSize]; @@ -3052,8 +3052,8 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, CalcOpsinDynamicsImage(rgb0_c); CalcOpsinDynamicsImage(rgb1_c); - floatcopy(rgb0, rgb0_data, 3 * kDCTBlockSize); - floatcopy(rgb1, image_block, 3 * kDCTBlockSize); + floatcopy(&rgb0[0][0], rgb0_data, 3 * kDCTBlockSize); + floatcopy(&rgb1[0][0], image_block, 3 * kDCTBlockSize); MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2], rgb1[0], rgb1[1], rgb1[2], @@ -3102,9 +3102,9 @@ __kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch, int block_idx = get_global_id(0); #define kComputeBlockSize (kBlockSize * 3) - __global coeff_t *orig_block = orig_batch + block_idx * kComputeBlockSize; - __global coeff_t *mayout_block = mayout_batch + block_idx * kComputeBlockSize; - __global float *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize; + __global const coeff_t *orig_block = orig_batch + block_idx * kComputeBlockSize; + __global const coeff_t *mayout_block = mayout_batch + block_idx * kComputeBlockSize; + 
__global const float *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize; DCTScoreData input_order_data[kComputeBlockSize]; CoeffData output_order_data[kComputeBlockSize]; @@ -3180,7 +3180,7 @@ typedef struct __channel_info_t }channel_info; // return the count of Non-zero item -int MakeInputOrderEx(coeff_t block[3*8*8], coeff_t orig_block[3*8*8], IntFloatPairList *input_order) +int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order) { const int block_size = 64; int size = 0; @@ -3198,7 +3198,7 @@ int MakeInputOrderEx(coeff_t block[3*8*8], coeff_t orig_block[3*8*8], IntFloatPa } int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], - __global float *orig_image_batch, + const __global float *orig_image_batch, int width_, int height_, int block_x, int block_y, @@ -3336,7 +3336,7 @@ double CompareBlockFactor(const channel_info mayout_channel[3], float rgb1_c[3][kDCTBlockSize]; Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy); - double err = ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); + double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); max_err = max(max_err, err); } } diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 1fa6a886..08890a46 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -362,7 +362,8 @@ namespace guetzli double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { - double err = CompareBlockEx2(img, off_x, off_y, candidate_block, comp_mask); + double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); + return err; if (g_checkOpenCL) { double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 885f9cfb..88079303 100644 
--- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -55,7 +55,7 @@ class Processor { void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, - bool stop_early); + bool stop_early, const OutputImage &img2); void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, @@ -74,9 +74,10 @@ class Processor { const int block_x, const int block_y, const int factor_x, const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2, std::vector* output_order); - + /* void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, std::vector* output_order); + */ bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, int best_q[3][kDCTBlockSize], @@ -616,7 +617,7 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, - const double target_mul, bool stop_early) + const double target_mul, bool stop_early, const OutputImage &img2) { const int width = img->width(); const int height = img->height(); @@ -632,27 +633,6 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im comparator_->StartBlockComparisons(); // ³õʼ»¯Ò»Ð©²ÎÊý£¬Ö÷ÒªÊǶÔԭͼ½øÐÐһЩ´¦Àí -// std::vector orig_batch(num_blocks * kBlockSize); // [block_r block_g block_b] -// std::vector mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b] -/* - // step 1 »ñÈ¡ËùÓÐblock list - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t *orig_block = &orig_batch[block_ix * kBlockSize]; - coeff_t *mayout_block = &mayout_batch[block_ix * kBlockSize]; - - for (int c = 0; c < 3; ++c) - { - img->component(c).GetCoeffBlock(block_x, block_y, 
&mayout_block[c * kDCTBlockSize]); - - const JPEGComponent& comp = jpg.components[c]; - int jpg_block_ix = block_y * comp.width_in_blocks + block_x; - memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:È¡³öԭʼͼÏñblockϵÊý - } - } - } -*/ - // step 2 ¼ÆËãËùÓÐblockµÄϵÊýÆ«²î std::vector output_order_gpu; std::vector output_order_cpu; CoeffData * output_order = NULL; @@ -690,32 +670,31 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im comp->BlockErrorLimit(), output_order); -/* - output_order_gpu.resize(num_blocks * kBlockSize); - output_order = output_order_gpu.data(); - - clComputeBlockZeroingOrder(orig_batch.data(), - comp->imgOpsinDynamicsBlockList.data(), - comp->imgMaskXyzScaleBlockList.data(), - mayout_batch.data(), - num_blocks, - comparator_->BlockErrorLimit(), - output_order_gpu.data()); -*/ - } -/* if (!g_useOpenCL || g_checkOpenCL) { output_order_cpu.resize(num_blocks * kBlockSize); output_order = output_order_cpu.data(); for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t *orig_block = &orig_batch[block_ix * kBlockSize]; - coeff_t *block = &mayout_batch[block_ix * kBlockSize]; + coeff_t block[kBlockSize] = { 0 }; + coeff_t orig_block[kBlockSize] = { 0 }; + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + assert(img->component(c).factor_x() == factor_x); + assert(img->component(c).factor_y() == factor_y); + img->component(c).GetCoeffBlock(block_x, block_y, + &block[c * kDCTBlockSize]); + const JPEGComponent& comp = jpg.components[c]; + int jpg_block_ix = block_y * comp.width_in_blocks + block_x; + memcpy(&orig_block[c * kDCTBlockSize], + &comp.coeffs[jpg_block_ix * kDCTBlockSize], + kDCTBlockSize * sizeof(orig_block[0])); + } + } std::vector block_order; - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, 
&block_order); + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, img2, &block_order); CoeffData * p = &output_order_cpu[block_ix * kBlockSize]; for (int i = 0; i < block_order.size(); i++) @@ -726,7 +705,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } } } -*/ + if (g_checkOpenCL) { int count = 0; @@ -769,11 +748,11 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im comparator_->FinishBlockComparisons(); candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); - SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early, + SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early, candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); } - +/* void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, std::vector* output_order) { @@ -824,7 +803,7 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const candidate_block[idx] = 0; // TOBEREMOVE:¶Ô±ÈblockµÄÅÅÐòµÃ·ÖǰiµÍµÄÖÃ0(i¸ù¾Ýinput_orderÊý¾Ý±ä»¯¶ø±ä»¯)£¬²¢ÏÈÉèÖûضԱÈͼÏñµÄÈý¸ö·ÖÁ¿¶ÔÓ¦blockÖÐÈ¥£¬ºóÐøÔÙ×ö¶Ô±È²ÉÓᣠ- float max_err = 0;/// ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block); + float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block); if (max_err < best_err) { // TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi best_err = max_err; best_i = i; @@ -853,7 +832,7 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const } output_order->resize(num); } - +*/ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, @@ -1220,11 +1199,11 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, } if (!downsample) { - SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, false); + 
SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, false, img2); } else { const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f; - SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false); - SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true); + SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false, img2); + SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true, img2); } } From 999585d2eeaabb43ebc18838797e19d183e6022c Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 20 May 2017 17:18:05 +0800 Subject: [PATCH 091/189] =?UTF-8?q?=E5=90=88=E5=B9=B6=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E5=A3=B0=E6=98=8E=EF=BC=8C=E5=9C=A8opencl=E4=B8=ADinclude=20Op?= =?UTF-8?q?enCL=E4=BB=A3=E7=A0=81=E5=90=88=E5=B9=B6=E5=85=A5C=E7=AB=AF?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BB=A3=E7=A0=81=E4=BE=9B=E7=BC=96=E8=AF=91?= =?UTF-8?q?=E8=B0=83=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 139 ++++++++++++----------------- clguetzli/clguetzli.cpp | 8 +- clguetzli/clguetzli.h | 15 +--- clguetzli/clguetzli_comparator.cpp | 11 +-- clguetzli/ocl.h | 20 +---- guetzli.vcxproj | 3 +- guetzli.vcxproj.filters | 4 +- 7 files changed, 70 insertions(+), 130 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 32d0864d..7da649e6 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1,10 +1,6 @@ -//#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -//#elif defined(cl_amd_fp64) -//#pragma OPENCL EXTENSION cl_amd_fp64 : enable -//#else -//#error "Double precision floating point not supported by OpenCL implementation." 
-//#endif + +#include "clguetzli\clguetzli.cl.h" #define kBlockEdge 8 #define kBlockSize (kBlockEdge * kBlockEdge) @@ -244,7 +240,7 @@ __kernel void clScaleImage(double scale, __global float *result) result[i] *= scale; } -kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out) +__kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -255,7 +251,7 @@ kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __gl out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; } -kernel void clAddBorder(__global float *out, int s, int s2, __global float *in) +__kernel void clAddBorder(__global float *out, int s, int s2, __global float *in) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -1427,8 +1423,6 @@ typedef struct __IntFloatPair float err; }IntFloatPair, DCTScoreData, CoeffData; -typedef short coeff_t; - typedef struct __IntFloatPairList { int size; @@ -2643,7 +2637,7 @@ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, in // chrisk todo -void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) +void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) { uchar idct[3][8 * 8]; CoeffToIDCT(&block[0], idct[0]); @@ -2870,70 +2864,58 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, float *c1_x, float *c1_y, float *c1_b, int xsize, int ysize) { - for (int x = 0; x < xsize; ++x) - { - for (int y = 0; y < ysize; ++y) - { - size_t ix = y * xsize + x; - const double ave[3] = { - (c0_x[ix] + c1_x[ix]) * 0.5, - (c0_y[ix] + c1_y[ix]) * 0.5, - (c0_b[ix] + c1_b[ix]) * 0.5, - }; - double sqr_max_diff = -1; - { - int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; - int border[4] = { x == 0, x + 1 == 
xsize, y == 0, y + 1 == ysize }; - for (int dir = 0; dir < 4; ++dir) { - if (border[dir]) - { - continue; - } - const int ix2 = ix + offset[dir]; - double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; - diff *= diff; - if (sqr_max_diff < diff) - { - sqr_max_diff = diff; - } - } - } - const double kReductionX = 275.19165240059317; - const double kReductionY = 18599.41286306991; - const double kReductionZ = 410.8995306951065; - const double kChromaBalance = 106.95800948271017; - double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); - - const double mix[3] = { - chroma_scale * kReductionX / (sqr_max_diff + kReductionX), - kReductionY / (sqr_max_diff + kReductionY), - chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), - }; - // Interpolate lineraly between the average color and the actual - // color -- to reduce the importance of this pixel. - xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); - xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); - - xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); - xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); - - xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); - xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); - } - } -} - -typedef union ocl_channels_t -{ - struct + for (int x = 0; x < xsize; ++x) { - float * r; - float * g; - float * b; - }; - - float *ch[3]; -}ocl_channels; + for (int y = 0; y < ysize; ++y) + { + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5, + (c0_y[ix] + c1_y[ix]) * 0.5, + (c0_b[ix] + c1_b[ix]) * 0.5, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) + { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if 
(sqr_max_diff < diff) + { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); + } + } +} void floatcopy(float *dst, const float *src, int size) { @@ -3170,15 +3152,6 @@ __kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch, } } -typedef struct __channel_info_t -{ - int factor; - int block_width; - int block_height; - __global const coeff_t *coeff; - __global const ushort *pixel; -}channel_info; - // return the count of Non-zero item int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order) { diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 4d132717..ec6e70fb 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1211,10 +1211,10 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_orig_batch = ocl.allocMem(sizeof(coeff_t) * item_count, orig_batch); + cl_mem mem_orig_batch = ocl.allocMem(sizeof(::coeff_t) * 
item_count, orig_batch); cl_mem mem_orig_image_batch = ocl.allocMem(sizeof(float) * item_count, orig_image_batch); cl_mem mem_mask_scale_batch = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch); - cl_mem mem_mayout_batch = ocl.allocMem(sizeof(coeff_t) * item_count, mayout_batch); + cl_mem mem_mayout_batch = ocl.allocMem(sizeof(::coeff_t) * item_count, mayout_batch); cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count); cl_float clBlockErrorLimit = BlockErrorLimit; @@ -1280,10 +1280,10 @@ void clComputeBlockZeroingOrderFactor( for (int c = 0; c < 3; c++) { int block_count = orig_channel[c].block_width * orig_channel[c].block_height; - mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; - mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index b7479407..9be1ac10 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -4,6 +4,8 @@ #include "guetzli\processor.h" #include "ocl.h" +#include "clguetzli.cl.h" + extern bool g_useOpenCL; extern bool g_checkOpenCL; @@ -15,23 +17,14 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t step, float* result); -void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch, +void clComputeBlockZeroingOrder(const coeff_t *orig_batch, const float *orig_image_batch, const float* orig_mask_scale_batch, - const guetzli::coeff_t *mayout_batch, + const coeff_t *mayout_batch, int size, 
float BlockErrorLimit, guetzli::CoeffData *output_order_batch); -typedef struct __channel_info_t -{ - int factor; - int block_width; - int block_height; - const guetzli::coeff_t *coeff; - const uint16_t *pixel; -}channel_info; - void clComputeBlockZeroingOrderFactor( const channel_info orig_channel[3], const float *orig_image_batch, diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp index 08890a46..b003ecb8 100644 --- a/clguetzli/clguetzli_comparator.cpp +++ b/clguetzli/clguetzli_comparator.cpp @@ -276,16 +276,7 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float } } } -/* -typedef struct __channel_info_t -{ - int factor; - int block_width; - int block_height; - const coeff_t *coeff; - const uint16_t *pixel; -}channel_info; -*/ + namespace guetzli { ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 94bb88b8..0b8df8b4 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -2,6 +2,7 @@ #include "CL\cl.h" #include "utils.h" +#include "clguetzli.cl.h" // Macros for OpenCL versions #define OPENCL_VERSION_1_2 1.2f @@ -68,25 +69,6 @@ enum KernelName { KERNEL_COUNT, }; -typedef union ocl_channels_t -{ - struct - { - cl_mem r; - cl_mem g; - cl_mem b; - }; - - struct - { - cl_mem x; - cl_mem y; - cl_mem b; - }; - - cl_mem ch[3]; -}ocl_channels; - struct ocl_args_d_t { ocl_args_d_t(); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 3ae4554f..c914d909 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -291,8 +291,8 @@ + - @@ -385,6 +385,7 @@ Document + "C:\Users\strongtu\Documents\Project\git_strong\guetzli\clguetzli";%(Include) diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 9b0a7ad0..98009b47 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -575,10 +575,10 @@ clguetzli - + clguetzli - + clguetzli From 643e8dbfcd263d79d6e0085892050abb1d5c09db Mon Sep 17 00:00:00 2001 From: 
strongtu Date: Sat, 20 May 2017 17:22:35 +0800 Subject: [PATCH 092/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20clEnqueueUnmapMemO?= =?UTF-8?q?bject=20=E5=8F=82=E6=95=B0=E4=BC=A0=E9=80=92bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 10 +++--- clguetzli/clguetzli_test.cpp | 60 ++++++++++++++++++------------------ 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index ec6e70fb..b5d7c4a2 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -385,9 +385,9 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* memcpy(g, result_g, channel_size); memcpy(b, result_b, channel_size); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL); clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); @@ -1182,7 +1182,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, err = clFinish(ocl.commandQueue); memcpy(result, result_r, channel_size); - clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL); clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb1); @@ -1242,7 +1242,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, err = clFinish(ocl.commandQueue); memcpy(output_order_batch, result, sizeof(CoeffData) * item_count); - clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, 
result, sizeof(CoeffData) * item_count, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL); clFinish(ocl.commandQueue); clReleaseMemObject(mem_orig_batch); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 38e3e966..6dca483f 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -62,12 +62,12 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, FLOAT_COMPARE(result_g2, r1_g, xsize * ysize); FLOAT_COMPARE(result_b2, r1_b, xsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, 0, NULL, NULL); err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); @@ -106,7 +106,7 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b, FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3); - clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, edgemap_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); @@ -150,8 +150,8 @@ void 
tclBlockDiffMap(const float* r, const float* g, const float* b, FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3); FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); - clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, reschannel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, reschannel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL); err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); @@ -197,7 +197,7 @@ void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); - clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, reschannel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL); err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb0); @@ -246,12 +246,12 @@ void tclMask(const float* r, const float* g, const float* b, FLOAT_COMPARE(maskdc_g, r1_g, xsize * ysize); FLOAT_COMPARE(maskdc_b, r1_b, xsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, 0, NULL, 
NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, 0, NULL, NULL); err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); @@ -299,7 +299,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, res_xsize * res_ysize * sizeof(float), NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL); ocl.releaseMemChannels(mask); ocl.releaseMemChannels(mask_dc); clReleaseMemObject(cl_block_diff_dc); @@ -324,7 +324,7 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize, //cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err); //err = clFinish(ocl.commandQueue); //FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize); - //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, length, NULL, NULL); + //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL); clReleaseMemObject(mem_diffmap); } @@ -346,7 +346,7 @@ void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double bo FLOAT_COMPARE(result, r_r, xsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); clReleaseMemObject(r); @@ -383,7 +383,7 @@ void tclConvolution(size_t xsize, size_t ysize, FLOAT_COMPARE(result, r_r, dxsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); clReleaseMemObject(i); @@ -415,7 +415,7 @@ void tclUpsample(float* image, size_t xsize, size_t ysize, FLOAT_COMPARE(result, r_r, xsize * 
ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); clReleaseMemObject(img); @@ -457,9 +457,9 @@ void tclDiffPrecompute( ocl.releaseMemChannels(cl_xyb0); ocl.releaseMemChannels(cl_xyb1); ocl.releaseMemChannels(cl_mask); - clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, 0, NULL, NULL); } // ian todo @@ -474,7 +474,7 @@ void tclAverage5x5(int xsize, int ysize, std::vector &diffs_org, std::vec err = clFinish(ocl.commandQueue); FLOAT_COMPARE(r, diffs_cmp.data(), xsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, xsize * ysize * sizeof(float), NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, 0, NULL, NULL); clReleaseMemObject(mem_diff); } @@ -498,7 +498,7 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset, FLOAT_COMPARE(values, r_r, xsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, img_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); clReleaseMemObject(r); @@ -517,7 +517,7 @@ void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t le FLOAT_COMPARE(r_r, result_cmp, length); - clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, length * sizeof(float), NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL); clReleaseMemObject(mem_result_org); } @@ -546,9 +546,9 @@ void tclOpsinDynamicsImage(float* r, 
float* g, float* b, size_t xsize, size_t ys FLOAT_COMPARE(result_g, r_g, xsize * ysize); FLOAT_COMPARE(result_b, r_b, xsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, channel_size, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, channel_size, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, 0, NULL, NULL); err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); From b2d8639423665e7e4081cc2fee9a253b1a614dae Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 20 May 2017 22:10:04 +0800 Subject: [PATCH 093/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl.cpp | 145 +++++++++++++++++++++++++++++++ clguetzli/clguetzli_comparator.h | 11 +-- 2 files changed, 146 insertions(+), 10 deletions(-) create mode 100644 clguetzli/clguetzli.cl.cpp diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp new file mode 100644 index 00000000..fb994d7c --- /dev/null +++ b/clguetzli/clguetzli.cl.cpp @@ -0,0 +1,145 @@ +#include +#include +#include "utils.h" +#include "clguetzli_comparator.h" + +extern bool g_useOpenCL; +extern bool g_checkOpenCL; + +using namespace std; + +int get_global_id(int dim) +{ + return 0; +} + +int get_global_size(int dim) +{ + return 0; +} + +#define abs(exper) fabs((exper)) + +#define __opencl +#include "clguetzli.cl" + + +namespace guetzli +{ + ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats) + : ButteraugliComparator(width, height, rgb, target_distance, stats) + { + + } + + void 
ButteraugliComparatorEx::StartBlockComparisons() + { + ButteraugliComparator::StartBlockComparisons(); + + const int width = width_; + const int height = height_; + const int factor_x = 1; + const int factor_y = 1; + + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; + + const double* lut = kSrgb8ToLinearTable; + + imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize); + imgMaskXyzScaleBlockList.resize(num_blocks * 3); + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) + { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) + { + float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + float* curG = curR + kDCTBlockSize; + float* curB = curG + kDCTBlockSize; + + for (int iy = 0, i = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix, ++i) { + int x = std::min(8 * block_x + ix, width - 1); + int y = std::min(8 * block_y + iy, height - 1); + int px = y * width + x; + + curR[i] = lut[rgb_orig_[3 * px]]; + curG[i] = lut[rgb_orig_[3 * px + 1]]; + curB[i] = lut[rgb_orig_[3 * px + 2]]; + } + } + + int xmin = block_x * 8; + int ymin = block_y * 8; + + imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin]; + imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin]; + imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin]; + } + } + } + + void ButteraugliComparatorEx::FinishBlockComparisons() { + ButteraugliComparator::FinishBlockComparisons(); + + imgOpsinDynamicsBlockList.clear(); + imgMaskXyzScaleBlockList.clear(); + } + + void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) + { + block_x_ = block_x; + block_y_ = block_y; + factor_x_ = factor_x; + factor_y_ = factor_y; + + ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); + } + 
+ double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const + { + double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); + return err; + if (g_checkOpenCL) + { + double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); + if (err1 != err) + { + LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__); + } + } + + return err; + } + + double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const + { + const int block_x = block_x_; + const int block_y = block_y_; + const int factor = factor_x_; + + const coeff_t *candidate_channel[3]; + channel_info mayout_channel[3]; + for (int c = 0; c < 3; c++) + { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + mayout_channel[c].block_height = img.component(c).height_in_blocks(); + mayout_channel[c].block_width = img.component(c).width_in_blocks(); + mayout_channel[c].factor = img.component(c).factor_x(); + mayout_channel[c].pixel = img.component(c).pixels(); + mayout_channel[c].coeff = img.component(c).coeffs(); + } + + return CompareBlockFactor(mayout_channel, + candidate_block, + block_x, + block_y, + imgOpsinDynamicsBlockList.data(), + imgMaskXyzScaleBlockList.data(), + width_, + height_, + factor_x_); + } +} diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h index 840254a7..721fcb32 100644 --- a/clguetzli/clguetzli_comparator.h +++ b/clguetzli/clguetzli_comparator.h @@ -16,16 +16,7 @@ namespace guetzli { void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override; double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; - double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const; - double 
CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const; - private: - int GetOrigBlock(std::vector< std::vector > &rgb0_c, int off_x, int off_y) const; - double ComputeImage8x8Block(std::vector > &rgb0_c, - std::vector > &rgb1_c, - int block_8x8idx) const; - - int getCurrentBlockIdx(void) const; - int getCurrentBlock8x8Idx(int off_x, int off_y) const; + double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const; public: std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount From 5a546242b50ab07ac8a90c80569e31ff6301b824 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 20 May 2017 23:42:52 +0800 Subject: [PATCH 094/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 171 +------- clguetzli/clguetzli.cl.cpp | 44 +- clguetzli/clguetzli.cl.h | 84 ++++ clguetzli/clguetzli.cpp | 1 - clguetzli/clguetzli.h | 27 +- clguetzli/clguetzli_comparator.cpp | 643 ----------------------------- clguetzli/clguetzli_comparator.h | 25 -- clguetzli/ocl.h | 1 - 8 files changed, 134 insertions(+), 862 deletions(-) create mode 100644 clguetzli/clguetzli.cl.h delete mode 100644 clguetzli/clguetzli_comparator.cpp delete mode 100644 clguetzli/clguetzli_comparator.h diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 7da649e6..793e0f0b 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -2,11 +2,11 @@ #include "clguetzli\clguetzli.cl.h" -#define kBlockEdge 8 -#define kBlockSize (kBlockEdge * kBlockEdge) -#define kDCTBlockSize (kBlockEdge * kBlockEdge) +#define kBlockEdge 8 +#define kBlockSize (kBlockEdge * kBlockEdge) +#define kDCTBlockSize (kBlockEdge * kBlockEdge) #define kBlockEdgeHalf (kBlockEdge / 2) -#define kBlockHalf 
(kBlockEdge * kBlockEdgeHalf) +#define kBlockHalf (kBlockEdge * kBlockEdgeHalf) void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); double InterpolateClampNegative(__global const double *array, int size, double sx); @@ -2941,17 +2941,7 @@ void coeffcopy(coeff_t *dst, const coeff_t *src, int size) } } -void CalcOpsinDynamicsImage(ocl_channels rgb) -{ - float rgb_blurred[3][kDCTBlockSize]; - for (int i = 0; i < 3; i++) - { - BlurEx(rgb.ch[i], 8, 8, 1.1, 0, rgb_blurred[i]); - } - OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); -} - -void CalcOpsinDynamicsImage2(__private float rgb[3][kDCTBlockSize]) +void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) { float rgb_blurred[3][kDCTBlockSize]; for (int i = 0; i < 3; i++) @@ -2963,8 +2953,8 @@ void CalcOpsinDynamicsImage2(__private float rgb[3][kDCTBlockSize]) double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) { - CalcOpsinDynamicsImage2(rgb0_c); - CalcOpsinDynamicsImage2(rgb1_c); + CalcOpsinDynamicsImage(rgb0_c); + CalcOpsinDynamicsImage(rgb1_c); float rgb0[3][kDCTBlockSize]; float rgb1[3][kDCTBlockSize]; @@ -3005,153 +2995,6 @@ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); } -// strong todo -// candidate_block [R....R][G....G][B....B] -// orig_image_block [RR..RRGG..GGBB..BB] -// mask_scale[RGB] -float CompareBlockEx(coeff_t *candidate_block, __global const float* orig_image_block, __global const float* mask_scale_block) -{ - float rgb0[3][kDCTBlockSize]; - float rgb1[3][kDCTBlockSize]; - { - float rgb0_data[3*kDCTBlockSize]; - ocl_channels rgb0_c; - rgb0_c.r = &rgb0_data[0]; - rgb0_c.g = &rgb0_data[kDCTBlockSize]; - rgb0_c.b = &rgb0_data[2 * kDCTBlockSize]; - for (int i = 0; i < 3*kDCTBlockSize; i++) - { - 
rgb0_data[i] = orig_image_block[i]; - } - - float image_block[3 * kDCTBlockSize]; - ocl_channels rgb1_c; - rgb1_c.r = &image_block[0]; - rgb1_c.g = &image_block[kDCTBlockSize]; - rgb1_c.b = &image_block[2 * kDCTBlockSize]; - BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b, 8, 8); - - CalcOpsinDynamicsImage(rgb0_c); - CalcOpsinDynamicsImage(rgb1_c); - - floatcopy(&rgb0[0][0], rgb0_data, 3 * kDCTBlockSize); - floatcopy(&rgb1[0][0], image_block, 3 * kDCTBlockSize); - - MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2], - rgb1[0], rgb1[1], rgb1[2], - rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2], - rgb1_c.ch[0], rgb1_c.ch[1], rgb1_c.ch[2], - 8, 8); - - } - - // ÕâÀïΪɶҪ°Ñfloatת³Édouble²ÅÄܼÌÐø×ö¼ÆË㣿 - double b0[3 * kDCTBlockSize]; // - double b1[3 * kDCTBlockSize]; - for (int c = 0; c < 3; ++c) { - for (int ix = 0; ix < kDCTBlockSize; ++ix) { - b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; - b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; - } - } - - double diff_xyz_dc[3] = { 0.0 }; - double diff_xyz_ac[3] = { 0.0 }; - double diff_xyz_edge_dc[3] = { 0.0 }; - ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); - - double diff = 0.0; - double diff_edge = 0.0; - - for (int c = 0; c < 3; ++c) { - diff += diff_xyz_dc[c] * mask_scale_block[c]; - diff += diff_xyz_ac[c] * mask_scale_block[c]; - diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c]; - } - const double kEdgeWeight = 0.05; - return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); -} - -// strong todo -// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é -__kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch, // ԭʼͼÏñϵÊý - __global const float *orig_image_batch, // ԭʼͼÏñpregammaºó - __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý - __global const coeff_t *mayout_batch, // Êä³ö±¸Ñ¡Í¼µÄϵÊý - float BlockErrorLimit, - __global CoeffData *output_order_list/*out*/) -{ - int block_idx = get_global_id(0); -#define kComputeBlockSize (kBlockSize * 3) - - __global const coeff_t 
*orig_block = orig_batch + block_idx * kComputeBlockSize; - __global const coeff_t *mayout_block = mayout_batch + block_idx * kComputeBlockSize; - __global const float *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize; - - DCTScoreData input_order_data[kComputeBlockSize]; - CoeffData output_order_data[kComputeBlockSize]; - - IntFloatPairList input_order = { 0, input_order_data }; - IntFloatPairList output_order = { 0, output_order_data }; - - int count = MakeInputOrder(mayout_block, orig_block, &input_order, kBlockSize); - - coeff_t processed_block[kComputeBlockSize]; - for (int i = 0; i < kComputeBlockSize; i++) { - processed_block[i] = mayout_block[i]; - } - - while (input_order.size > 0) - { - float best_err = 1e17f; - int best_i = 0; - for (int i = 0; i < min(3, input_order.size); i++) - { - coeff_t candidate_block[kComputeBlockSize]; - for (int i = 0; i < kComputeBlockSize; i++) { - candidate_block[i] = processed_block[i]; - } - - const int idx = input_order.pData[i].idx; - - candidate_block[idx] = 0; - - float max_err = CompareBlockEx(candidate_block, orig_image_block, mask_scale + block_idx * 3); - if (max_err < best_err) - { - best_err = max_err; - best_i = i; - } - } - - int idx = input_order.pData[best_i].idx; - processed_block[idx] = 0; - list_erase(&input_order, best_i); - - list_push_back(&output_order, idx, best_err); - } - // ×¢Òâoutput_orderÕâÀïµÄresize¾ÍÊǰÑβ²¿µÄÖÃλ0 - float min_err = 1e10; - for (int i = output_order.size - 1; i >= 0; --i) { - min_err = min(min_err, output_order.pData[i].err); - output_order.pData[i].err = min_err; - } - - __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; - - int out_count = 0; - for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) - { - // ¹ýÂ˽ϴóµÄerr£¬Õⲿ·Ö½øÈëºó¶Ë¼ÆËãûÓÐÒâÒå - if (output_order.pData[i].err <= BlockErrorLimit) - { - output_block[out_count].idx = output_order.pData[i].idx; - output_block[out_count].err = 
output_order.pData[i].err; - out_count++; - } - } -} - // return the count of Non-zero item int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order) { diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index fb994d7c..9e6f7b87 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -1,29 +1,32 @@ #include #include +#include #include "utils.h" -#include "clguetzli_comparator.h" - -extern bool g_useOpenCL; -extern bool g_checkOpenCL; using namespace std; -int get_global_id(int dim) -{ - return 0; -} +int g_idvec[10] = { 0 }; +int g_sizevec[10] = { 0 }; -int get_global_size(int dim) -{ - return 0; +int get_global_id(int dim) { + return g_idvec[dim]; +} +int get_global_size(int dim) { + return g_sizevec[dim]; } -#define abs(exper) fabs((exper)) +void set_global_id(int dim, int id){ + g_idvec[dim] = id; +} +void set_global_size(int dim, int size){ + g_sizevec[dim] = size; +} #define __opencl +#define abs(exper) fabs((exper)) +#include "clguetzli.h" #include "clguetzli.cl" - namespace guetzli { ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, @@ -90,11 +93,6 @@ namespace guetzli void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) { - block_x_ = block_x; - block_y_ = block_y; - factor_x_ = factor_x; - factor_y_ = factor_y; - ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); } @@ -116,15 +114,9 @@ namespace guetzli double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { - const int block_x = block_x_; - const int block_y = block_y_; - const int factor = factor_x_; - - const coeff_t *candidate_channel[3]; channel_info mayout_channel[3]; for (int c = 0; c < 3; c++) { - candidate_channel[c] = &candidate_block[c * 8 * 8]; mayout_channel[c].block_height = 
img.component(c).height_in_blocks(); mayout_channel[c].block_width = img.component(c).width_in_blocks(); mayout_channel[c].factor = img.component(c).factor_x(); @@ -134,8 +126,8 @@ namespace guetzli return CompareBlockFactor(mayout_channel, candidate_block, - block_x, - block_y, + block_x_, + block_y_, imgOpsinDynamicsBlockList.data(), imgMaskXyzScaleBlockList.data(), width_, diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h new file mode 100644 index 00000000..53a89eef --- /dev/null +++ b/clguetzli/clguetzli.cl.h @@ -0,0 +1,84 @@ +#ifndef __CLGUETZLI_CL_H__ +#define __CLGUETZLI_CL_H__ + +#ifdef __cplusplus + +#define __kernel +#define __private +#define __global +#define __constant +typedef unsigned char uchar; +typedef unsigned short ushort; + +int get_global_id(int dim); +int get_global_size(int dim); +void set_global_id(int dim, int id); +void set_global_size(int dim, int size); + +#ifdef __opencl +typedef union ocl_channels_t +{ + struct + { + float * r; + float * g; + float * b; + }; + union + { + float *ch[3]; + }; +}ocl_channels; +#else +typedef union ocl_channels_t +{ + struct + { + cl_mem r; + cl_mem g; + cl_mem b; + }; + struct + { + cl_mem x; + cl_mem y; + cl_mem b; + }; + union + { + cl_mem ch[3]; + }; +}ocl_channels; + +#endif + +#else +typedef union ocl_channels_t +{ + struct + { + float * r; + float * g; + float * b; + }; + + union + { + float *ch[3]; + }; +}ocl_channels; + +#endif + +typedef short coeff_t; + +typedef struct __channel_info_t +{ + int factor; + int block_width; + int block_height; + __global const coeff_t *coeff; + __global const ushort *pixel; +}channel_info; + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index b5d7c4a2..d04e8c1c 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -63,7 +63,6 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err); ocl.kernel[KERNEL_BLOCKDIFFMAP] = 
clCreateKernel(ocl.program, "clBlockDiffMap", &err); ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err); - ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err); ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderFactor", &err); return ocl; diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 9be1ac10..3a20eaa1 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,9 +1,9 @@ #pragma once +#include #include "CL\cl.h" -#include "guetzli\jpeg_data.h" #include "guetzli\processor.h" +#include "guetzli\butteraugli_comparator.h" #include "ocl.h" - #include "clguetzli.cl.h" extern bool g_useOpenCL; @@ -94,3 +94,26 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/); void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize); + +class guetzli::OutputImage; + +namespace guetzli { + + class ButteraugliComparatorEx : public ButteraugliComparator + { + public: + ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats); + + void StartBlockComparisons() override; + void FinishBlockComparisons() override; + void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override; + + double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; + double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const; + public: + std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount + std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount + }; +} \ No newline at end of file diff --git 
a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp deleted file mode 100644 index b003ecb8..00000000 --- a/clguetzli/clguetzli_comparator.cpp +++ /dev/null @@ -1,643 +0,0 @@ -#include -#include -#include "clguetzli_comparator.h" -#include "guetzli\idct.h" -#include "guetzli\color_transform.h" -#include "guetzli\gamma_correct.h" -#include "clguetzli\ocl.h" -#include "clguetzli\clguetzli.h" - -using namespace guetzli; - -void CoeffToIDCT(const coeff_t block[8*8], uint8_t idct[8*8]) -{ - guetzli::ComputeBlockIDCT(block, idct); -} - -void IDCTToPixel8x8(const uint8_t idct[8 * 8], uint16_t pixels_[8*8]) -{ - const int block_x = 0; - const int block_y = 0; - const int width_ = 8; - const int height_ = 8; - - for (int iy = 0; iy < 8; ++iy) { - for (int ix = 0; ix < 8; ++ix) { - int x = 8 * block_x + ix; - int y = 8 * block_y + iy; - if (x >= width_ || y >= height_) continue; - int p = y * width_ + x; - pixels_[p] = idct[8 * iy + ix] << 4; - } - } -} - -void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_out[16*16], const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_) -{ - // Fill in the 10x10 pixel area in the subsampled image that will be the - // basis of the upsampling. This area is enough to hold the 3x3 kernel of - // the fancy upsampler around each pixel. - static const int kSubsampledEdgeSize = 10; - uint16_t subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize]; - for (int j = 0; j < kSubsampledEdgeSize; ++j) { - // The order we fill in the rows is: - // 8 rows intersecting the block, row below, row above - const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2); - for (int i = 0; i < kSubsampledEdgeSize; ++i) { - // The order we fill in each row is: - // 8 pixels within the block, left edge, right edge - const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) + - (i < 9 ? i + 1 : 0)); - const int x0 = block_x * 16 + (i < 9 ? 
i * 2 : -2); - if (x0 < 0) { - subsampled[ix] = subsampled[ix + 1]; - } - else if (y0 < 0) { - subsampled[ix] = subsampled[ix + kSubsampledEdgeSize]; - } - else if (x0 >= width_) { - subsampled[ix] = subsampled[ix - 1]; - } - else if (y0 >= height_) { - subsampled[ix] = subsampled[ix - kSubsampledEdgeSize]; - } - else if (i < 8 && j < 8) { - subsampled[ix] = idct[j * 8 + i] << 4; - } - else { - // Reconstruct the subsampled pixels around the edge of the current - // block by computing the inverse of the fancy upsampler. - const int y1 = std::max(y0 - 1, 0); - const int x1 = std::max(x0 - 1, 0); - subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 + - pixel_orig[y1 * width_ + x1] + - pixel_orig[y0 * width_ + x1] * -3 + - pixel_orig[y1 * width_ + x0] * -3) >> 2; - } - } - } - // Determine area to update. - int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0); - int xmax = std::min(block_x * 16 + 15, width_ - 1); - int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0); - int ymax = std::min(block_y * 16 + 15, height_ - 1); - - // Apply the fancy upsampler on the subsampled block. 
- for (int y = ymin; y <= ymax; ++y) { - const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize; - const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize; - for (int x = xmin; x <= xmax; ++x) { - const int x0 = (x & ~1) / 2 - block_x * 8 + 1; - const int dx = (x & 1) * 2 - 1; - const int ix = x0 + y0; - - int out_x = x - xmin; - int out_y = y - ymin; - - pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 + - subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4; - } - } -} - -// out = [YUVYUV....YUVYUV] -void PixelToYUV(uint16_t pixels_[8*8], uint8_t out[8*8], int xsize = 8, int ysize = 8) -{ - const int stride = 3; - - for (int y = 0; y < xsize; ++y) { - for (int x = 0; x < ysize; ++x) { - int px = y * xsize + x; - *out = static_cast((pixels_[px] + 8 - (x & 1)) >> 4); - out += stride; - } - } -} - -// pixel = [YUVYUV...YUVYUV] to [RGBRGB...RGBRGB] -void YUVToRGB(uint8_t pixelBlock[3*8*8], int size = 8 * 8) -{ - for (int i = 0; i < size; i++) - { - uint8_t *pixel = &pixelBlock[i*3]; - - int y = pixel[0]; - int cb = pixel[1]; - int cr = pixel[2]; - pixel[0] = kRangeLimit[y + kCrToRedTable[cr]]; - pixel[1] = kRangeLimit[y + ((kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16)]; - pixel[2] = kRangeLimit[y + kCbToBlueTable[cb]]; - } -} - -void YUVToImage(uint8_t yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize = 8, int ysize = 8, int inside_x = 8, int inside_y = 8) -{ - YUVToRGB(yuv, xsize * ysize); - - const double* lut = Srgb8ToLinearTable(); - - for (int i = 0; i < xsize * ysize; i++) - { - r[i] = lut[yuv[3 * i]]; - g[i] = lut[yuv[3 * i + 1]]; - b[i] = lut[yuv[3 * i + 2]]; - } - for (int y = 0; y < inside_y; y++) - { - for (int x = inside_x; x < xsize; x++) - { - int idx = y * xsize + (inside_x - 1); - r[y * xsize + x] = r[idx]; - g[y * xsize + x] = g[idx]; - b[y * xsize + x] = b[idx]; - } - } - for (int y = inside_y; y < ysize; y++) - { - for (int x = 0; x < xsize; x++) - { - int idx = (inside_y - 1) * xsize + x; 
- r[y * xsize + x] = r[idx]; - g[y * xsize + x] = g[idx]; - b[y * xsize + x] = b[idx]; - } - } -} - -// block = [R....R][G....G][B.....] -void BlockToImage(const coeff_t block[8*8*3], float* r, float* g, float* b, int inside_x, int inside_y) -{ - uint8_t idct[3][8 * 8]; - CoeffToIDCT(&block[0], idct[0]); - CoeffToIDCT(&block[8 * 8], idct[1]); - CoeffToIDCT(&block[8 * 8 * 2], idct[2]); - - uint16_t pixels[3][8 * 8]; - IDCTToPixel8x8(idct[0], pixels[0]); - IDCTToPixel8x8(idct[1], pixels[1]); - IDCTToPixel8x8(idct[2], pixels[2]); - - uint8_t yuv[8 * 8 * 3]; - PixelToYUV(pixels[0], &yuv[0]); - PixelToYUV(pixels[1], &yuv[1]); - PixelToYUV(pixels[2], &yuv[2]); - - YUVToRGB(yuv); - - const double* lut = Srgb8ToLinearTable(); - - for (int i = 0; i < 8 * 8; i++) - { - r[i] = lut[yuv[3 * i]]; - g[i] = lut[yuv[3 * i + 1]]; - b[i] = lut[yuv[3 * i + 2]]; - } - for (int y = 0; y < inside_y; y++) - { - for (int x = inside_x; x < 8; x++) - { - int idx = y * 8 + (inside_x - 1); - r[y * 8 + x] = r[idx]; - g[y * 8 + x] = g[idx]; - b[y * 8 + x] = b[idx]; - } - } - for (int y = inside_y; y < 8; y++) - { - for (int x = 0; x < 8; x++) - { - int idx = (inside_y - 1) * 8 + x; - r[y * 8 + x] = r[idx]; - g[y * 8 + x] = g[idx]; - b[y * 8 + x] = b[idx]; - } - } -} - -void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv, const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_) -{ - uint8_t idct[8 * 8]; - CoeffToIDCT(&block[0], &idct[0]); - - uint16_t pixels[16 * 16]; - IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_); - - PixelToYUV(pixels, yuv, 16, 16); -} - -void CoeffToYUV8x8(const coeff_t block[8 * 8], uint8_t *yuv) -{ - uint8_t idct[8 * 8]; - CoeffToIDCT(&block[0], &idct[0]); - - uint16_t pixels[8 * 8]; - IDCTToPixel8x8(idct, pixels); - - PixelToYUV(pixels, yuv); -} - -void Copy8x8To16x16(const uint8_t yuv8x8[3 * 8 * 8], uint8_t yuv16x16[3 * 16 * 16], int off_x, int off_y) -{ - for (int y = 0; y < 8; y++) - { - for (int x = 0; x < 
8; x++) - { - int idx = y * 8 + x; - int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); - yuv16x16[idx16 * 3] = yuv8x8[idx * 3]; - } - } -} - -void Copy16x16To8x8(const uint8_t yuv16x16[3 * 16 * 16], uint8_t yuv8x8[3 * 8 * 8], int off_x, int off_y) -{ - for (int y = 0; y < 8; y++) - { - for (int x = 0; x < 8; x++) - { - int idx = y * 8 + x; - int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); - yuv8x8[idx * 3] = yuv16x16[idx16 * 3]; - } - } -} - -void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y) -{ - for (int y = 0; y < 8; y++) - { - for (int x = 0; x < 8; x++) - { - int idx = y * 8 + x; - int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); - r[idx] = rgb16x16[0][idx16]; - g[idx] = rgb16x16[1][idx16]; - b[idx] = rgb16x16[2][idx16]; - } - } -} - -namespace guetzli -{ - ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, - const std::vector* rgb, - const float target_distance, ProcessStats* stats) - : ButteraugliComparator(width, height, rgb, target_distance, stats) - { - - } - - void ButteraugliComparatorEx::StartBlockComparisons() - { - ButteraugliComparator::StartBlockComparisons(); - - const int width = width_; - const int height = height_; - const int factor_x = 1; - const int factor_y = 1; - - const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); - const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); - const int num_blocks = block_width * block_height; - - const double* lut = Srgb8ToLinearTable(); - - imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize); - imgMaskXyzScaleBlockList.resize(num_blocks * 3); - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) - { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) - { - float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; - float* curG = curR + kDCTBlockSize; - float* curB = curG + kDCTBlockSize; - - for 
(int iy = 0, i = 0; iy < 8; ++iy) { - for (int ix = 0; ix < 8; ++ix, ++i) { - int x = std::min(8 * block_x + ix, width - 1); - int y = std::min(8 * block_y + iy, height - 1); - int px = y * width + x; - - curR[i] = lut[rgb_orig_[3 * px]]; - curG[i] = lut[rgb_orig_[3 * px + 1]]; - curB[i] = lut[rgb_orig_[3 * px + 2]]; - } - } - - int xmin = block_x * 8; - int ymin = block_y * 8; - - imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin]; - imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin]; - imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin]; - } - } - } - - void ButteraugliComparatorEx::FinishBlockComparisons() { - ButteraugliComparator::FinishBlockComparisons(); - - imgOpsinDynamicsBlockList.clear(); - imgMaskXyzScaleBlockList.clear(); - } - - void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) - { - block_x_ = block_x; - block_y_ = block_y; - factor_x_ = factor_x; - factor_y_ = factor_y; - - ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); - } - - double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const - { - double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); - return err; - if (g_checkOpenCL) - { - double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); - if (err1 != err) - { - LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__); - } - } - - return err; - } - - double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const - { - int block_ix = getCurrentBlockIdx(); - - const float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; - - // Õâ¿éÊÇԭʼͼÏñ - std::vector< std::vector > rgb0_c; - rgb0_c.resize(3); - for (int i = 0; i < 3; i++) - { - 
rgb0_c[i].resize(kDCTBlockSize); - memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float)); - } - - // imgÊÇÈ«¾ÖÓÅ»¯ºóµÄͼÏñ£¬ÎÒÃÇͨ¹ýcoeff_tÊý¾Ý·´Ëã³öÀ´rgb - int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8; - int inside_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8; - std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); - BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), inside_x, inside_y); -/* - { - // ¿ÉÄÜ»¹ÓÐÎÊÌ⣬ÎÒÃÇ×öÒ»¸öУÑé - int block_x = block_x_ * factor_x_ + off_x; - int block_y = block_y_ * factor_y_ + off_y; - int xmin = 8 * block_x; - int ymin = 8 * block_y; - - std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); - img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); - - for (int i = 0; i < 3; i++) - { - for (int k = 0; k < 64; k++) - { - if (fabs(rgb1_c[i][k] - rgb1_c2[i][k]) > 0.001) - { - LogError("Error: CompareBlock misstake.\n"); - } - } - } - } -*/ - // ÏÂÃæÊǼÆË㹤×÷ - return ComputeImage8x8Block(rgb0_c, rgb1_c, getCurrentBlock8x8Idx(off_x, off_y)); - } - - int ButteraugliComparatorEx::GetOrigBlock(std::vector< std::vector > &rgb0_c, int off_x, int off_y) const - { - int block_xx = block_x_ * factor_x_ + off_x; - int block_yy = block_y_ * factor_y_ + off_y; - if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1; - - const int block8_width = (width_ + 8 - 1) / 8; - - int block_ix = block_yy * block8_width + block_xx; - - rgb0_c.resize(3); - const float* block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; - for (int i = 0; i < 3; i++) - { - rgb0_c[i].resize(kDCTBlockSize); - memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float)); - } - - return block_ix; - } - - double ButteraugliComparatorEx::CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const - { - const int block_x = block_x_; - const int block_y = block_y_; - const int 
factor = factor_x_; - - const coeff_t *candidate_channel[3]; - channel_info mayout_channel[3]; - for (int c = 0; c < 3; c++) - { - candidate_channel[c] = &candidate_block[c * 8 * 8]; - mayout_channel[c].block_height = img.component(c).height_in_blocks(); - mayout_channel[c].block_width = img.component(c).width_in_blocks(); - mayout_channel[c].factor = img.component(c).factor_x(); - mayout_channel[c].pixel = img.component(c).pixels(); - mayout_channel[c].coeff = img.component(c).coeffs(); - } - - uint8_t yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image - uint8_t yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image - - // ²»¹Ücomp_maskÈçºÎ£¬×ª»»ÎªRGB×ÜÊÇÐèÒªµÄ - for (int c = 0; c < 3; c++) - { - if (mayout_channel[c].factor == 1) { - if (factor == 1) { // channel_factor == factor ˵Ã÷Òª½éÈëÔËË㣬²ÉÓÃcandidateÖеÄϵÊý - const coeff_t * coeff_block = candidate_channel[c]; - CoeffToYUV8x8(coeff_block, &yuv8x8[c]); - } - else { - for (int iy = 0; iy < factor; ++iy) { - for (int ix = 0; ix < factor; ++ix) { - int block_xx = block_x * factor + ix; - int block_yy = block_y * factor + iy; - - if (ix != off_x || iy != off_y) continue; - if (block_xx >= mayout_channel[c].block_width || - block_yy >= mayout_channel[c].block_height) - { - continue; - } - int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; - const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; - CoeffToYUV8x8(coeff_block, &yuv8x8[c]); - - // copy YUV8x8 to YUV1616 corner - Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); - } - } - } - } - else { - if (factor == 1) { - int block_xx = block_x / mayout_channel[c].factor; - int block_yy = block_y / mayout_channel[c].factor; - int ix = block_x % mayout_channel[c].factor;; - int iy = block_y % mayout_channel[c].factor; - - int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; - const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; -/* - uint8_t ch[16 * 16] = { 0 }; - 
img.component(c).ToPixels(block_xx * 8, block_yy * 8, 16, 16, ch, 1); -*/ - CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_xx, block_yy, img.width(), img.height()); - - // copy YUV16x16 corner to YUV8x8 - Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); - } - else { - const coeff_t * coeff_block = candidate_channel[c]; - CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_x, block_y, img.width(), img.height()); - } - } - } - - if (factor == 1) - { - std::vector< std::vector > rgb0_c; - int block_8x8idx = GetOrigBlock(rgb0_c, 0, 0); -/* - uint8_t yuv[3 * 8 * 8]; - - std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); - { - int block_x = block_x_ * factor_x_ + off_x; - int block_y = block_y_ * factor_y_ + off_y; - int xmin = 8 * block_x; - int ymin = 8 * block_y; - - img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); - - img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3); - img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3); - img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3); - } -*/ - int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8; - int inside_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8; - std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); - YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), 8, 8, inside_x, inside_y); -/* - int count = 0; - for (int i = 0; i < 64; i++) - { - if (rgb1_c[0][i] != rgb1_c2[0][i] || - rgb1_c[1][i] != rgb1_c2[1][i] || - rgb1_c[2][i] != rgb1_c2[2][i]) - { - count++; - } - } - if (count > 0) - { - LogError("fdjskafjdlasfj"); - } -*/ - return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); - } - else - { - int inside_x = block_x_ * 16 + 16 > width_ ? width_ - block_x_ * 16 : 16; - int inside_y = block_y_ * 16 + 16 > height_ ? 
height_ - block_y_ * 16 : 16; -/* - uint8_t yuv[3 * 8 * 8]; - std::vector > rgb1_c2(3, std::vector(kDCTBlockSize)); - { - int block_x = block_x_ * factor_x_ + off_x; - int block_y = block_y_ * factor_y_ + off_y; - int xmin = 8 * block_x; - int ymin = 8 * block_y; - - img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2); - - img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3); - img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3); - img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3); - } - -*/ - float rgb16x16[3][16 * 16]; - YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y); - - std::vector< std::vector > rgb0_c; - int block_8x8idx = GetOrigBlock(rgb0_c, off_x, off_y); - - std::vector > rgb1_c(3, std::vector(kDCTBlockSize)); - Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), off_x, off_y); - - return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); - } - } - - double ButteraugliComparatorEx::ComputeImage8x8Block(std::vector > &rgb0_c, - std::vector > &rgb1_c, - int block_8x8idx) const - { - ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c); - ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c); - - std::vector > rgb0 = rgb0_c; - std::vector > rgb1 = rgb1_c; - - ::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1); - - double b0[3 * kDCTBlockSize]; - double b1[3 * kDCTBlockSize]; - for (int c = 0; c < 3; ++c) { - for (int ix = 0; ix < kDCTBlockSize; ++ix) { - b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; - b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; - } - } - double diff_xyz_dc[3] = { 0.0 }; - double diff_xyz_ac[3] = { 0.0 }; - double diff_xyz_edge_dc[3] = { 0.0 }; - ::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); - - double diff = 0.0; - double diff_edge = 0.0; - for (int c = 0; c < 3; ++c) { - diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c]; - diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c]; 
- diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c]; - } - const double kEdgeWeight = 0.05; - return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); - } - - int ButteraugliComparatorEx::getCurrentBlockIdx(void) const - { - const int block_width = (width_ + 8 * factor_x_ - 1) / (8 * factor_x_); - const int block_height = (height_ + 8 * factor_y_ - 1) / (8 * factor_y_); - - return block_y_ * block_width + block_x_; - } - - int ButteraugliComparatorEx::getCurrentBlock8x8Idx(int off_x, int off_y) const - { - int block_xx = block_x_ * factor_x_ + off_x; - int block_yy = block_y_ * factor_y_ + off_y; - - const int block8_width = (width_ + 8 - 1) / 8; - return block_yy * block8_width + block_xx; - } -} diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h deleted file mode 100644 index 721fcb32..00000000 --- a/clguetzli/clguetzli_comparator.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once -#include -#include "guetzli\butteraugli_comparator.h" - -namespace guetzli { - - class ButteraugliComparatorEx : public ButteraugliComparator - { - public: - ButteraugliComparatorEx(const int width, const int height, - const std::vector* rgb, - const float target_distance, ProcessStats* stats); - - void StartBlockComparisons() override; - void FinishBlockComparisons() override; - void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override; - - double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; - double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const; - public: - std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount - std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount - }; - -} \ No newline at end of file diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 0b8df8b4..c188acc9 100644 --- a/clguetzli/ocl.h +++ 
b/clguetzli/ocl.h @@ -64,7 +64,6 @@ enum KernelName { KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, KERNEL_EDGEDETECTORLOWFREQ, - KERNEL_COMPUTEBLOCKZEROINGORDER, KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR, KERNEL_COUNT, }; From d0949f18db593ac5a734317173f5e7972860c1aa Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 21 May 2017 00:22:44 +0800 Subject: [PATCH 095/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 2 +- clguetzli/clguetzli.cl.cpp | 2 + clguetzli/clguetzli.cpp | 2 +- guetzli.vcxproj | 1 - guetzli.vcxproj.filters | 3 -- guetzli/processor.cc | 81 -------------------------------------- 6 files changed, 4 insertions(+), 87 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 793e0f0b..3d6b34fb 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -2953,7 +2953,7 @@ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) { - CalcOpsinDynamicsImage(rgb0_c); +// CalcOpsinDynamicsImage(rgb0_c); CalcOpsinDynamicsImage(rgb1_c); float rgb0[3][kDCTBlockSize]; diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index 9e6f7b87..4075ef94 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -74,6 +74,8 @@ namespace guetzli } } + CalcOpsinDynamicsImage((float(*)[64])curR); + int xmin = block_x * 8; int ymin = block_y * 8; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index d04e8c1c..ea85dabd 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1217,7 +1217,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count); cl_float clBlockErrorLimit = BlockErrorLimit; - cl_kernel kernel 
= ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; + cl_kernel kernel = 0;// ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index c914d909..11a2f227 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -195,7 +195,6 @@ - diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 98009b47..4dd1b5ee 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -309,9 +309,6 @@ clguetzli - - clguetzli - diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 88079303..7a8612c7 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -752,87 +752,6 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); } -/* -void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], - const int block_x, const int block_y, std::vector* output_order) -{ - static const uint8_t oldCsf[kDCTBlockSize] = { - 10, 10, 20, 40, 60, 70, 80, 90, - 10, 20, 30, 60, 70, 80, 90, 90, - 20, 30, 60, 70, 80, 90, 90, 90, - 40, 60, 70, 80, 90, 90, 90, 90, - 60, 70, 80, 90, 90, 90, 90, 90, - 70, 80, 90, 90, 90, 90, 90, 90, - 80, 90, 90, 90, 90, 90, 90, 90, - 90, 90, 90, 90, 90, 90, 90, 90, - }; - static const double kWeight[3] = { 1.0, 0.22, 0.20 }; -#include "guetzli/order.inc" - std::vector > input_order; - for (int c = 0; c < 3; ++c) { - for (int k = 1; k < kDCTBlockSize; ++k) { - int idx = c * kDCTBlockSize + k; - if (block[idx] != 0) { - float score; - if (params_.new_zeroing_model) { - score = std::abs(orig_block[idx]) * csf[idx] + bias[idx]; - } - else { - score = static_cast((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]); - } - 
input_order.push_back(std::make_pair(idx, score)); - } - } - } - std::sort(input_order.begin(), input_order.end(), [](const std::pair& a, const std::pair& b) { return a.second < b.second; }); - - coeff_t processed_block[kBlockSize]; - memcpy(processed_block, block, sizeof(processed_block)); - - comparator_->SwitchBlock(block_x, block_y, 1, 1); - - while (!input_order.empty()) { - float best_err = 1e17f; - int best_i = 0; - for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, input_order.size()); ++i) - { - coeff_t candidate_block[kBlockSize]; - memcpy(candidate_block, processed_block, sizeof(candidate_block)); - - const int idx = input_order[i].first; - - candidate_block[idx] = 0; // TOBEREMOVE:¶Ô±ÈblockµÄÅÅÐòµÃ·ÖǰiµÍµÄÖÃ0(i¸ù¾Ýinput_orderÊý¾Ý±ä»¯¶ø±ä»¯)£¬²¢ÏÈÉèÖûضԱÈͼÏñµÄÈý¸ö·ÖÁ¿¶ÔÓ¦blockÖÐÈ¥£¬ºóÐøÔÙ×ö¶Ô±È²ÉÓᣠ- - float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block); - if (max_err < best_err) { // TOBEREMOVE:ÕÒ³ö×îС´íÎóÖµµÄi - best_err = max_err; - best_i = i; - } - } - - int idx = input_order[best_i].first; - processed_block[idx] = 0; - input_order.erase(input_order.begin() + best_i); - - output_order->push_back({ idx, best_err }); // TOBEREMOVE:½«ÉÏÃæ¼ÆËã³öÀ´µÄ×îС´íÎóµÄidx£¬¶ÔÓ¦µ½¶Ô±ÈblockÖеĶÔӦλÖÃÕæÕýµÄÖÃΪ0,ÒÆ³ýinput_orderÏ¼´Ñ¡È¡µ±Ç°Öµ£¬·ÅÈëoutput_order,²¢ÕýʽµÄÉèÖõ½¶Ô±ÈͼÏñÖÐÈ¥¡£ - } - - // TOBEREMOVE:×îÖÕÒÆ³ýerrÊý´óÓÚerrorÏÞÖÆµÄÏî·µ»Ø£¬²¢»¹Ô­¶Ô±ÈͼÏñµ½Ô­Ê¼Öµ¡£ - // Make the block error values monotonic. - float min_err = 1e10; - for (int i = output_order->size() - 1; i >= 0; --i) { - min_err = std::min(min_err, (*output_order)[i].block_err); - (*output_order)[i].block_err = min_err; - } - // Cut off at the block error limit. 
- size_t num = 0; - while (num < output_order->size() && - (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) { - ++num; - } - output_order->resize(num); -} -*/ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, From cc746ff03b0f7388a53d56cfdb2b2aaada7c6a0b Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 21 May 2017 01:01:21 +0800 Subject: [PATCH 096/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 5 +++-- clguetzli/clguetzli.cpp | 13 ++++++------- clguetzli/clguetzli.h | 2 +- clguetzli/ocl.h | 2 +- guetzli/processor.cc | 6 +----- 5 files changed, 12 insertions(+), 16 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 3d6b34fb..814c4157 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -3161,7 +3161,7 @@ double CompareBlockFactor(const channel_info mayout_channel[3], } // batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é -__kernel void clComputeBlockZeroingOrderFactor( +__kernel void clComputeBlockZeroingOrder( __global const coeff_t *orig_batch_0, // ԭʼͼÏñϵÊý __global const coeff_t *orig_batch_1, // ԭʼͼÏñϵÊý __global const coeff_t *orig_batch_2, // ԭʼͼÏñϵÊý @@ -3169,12 +3169,14 @@ __kernel void clComputeBlockZeroingOrderFactor( __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý int image_width, int image_height, + __global const coeff_t *mayout_batch_0, // Êä³ö±¸Ñ¡Í¼µÄϵÊý __global const coeff_t *mayout_batch_1, // Êä³ö±¸Ñ¡Í¼µÄϵÊý __global const coeff_t *mayout_batch_2, // Êä³ö±¸Ñ¡Í¼µÄϵÊý __global const ushort *mayout_pixel_0, __global const ushort *mayout_pixel_1, __global const ushort *mayout_pixel_2, + channel_info mayout_channel_0, channel_info mayout_channel_1, channel_info mayout_channel_2, @@ -3217,7 +3219,6 @@ __kernel void clComputeBlockZeroingOrderFactor( } } - DCTScoreData 
input_order_data[kComputeBlockSize]; CoeffData output_order_data[kComputeBlockSize]; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index ea85dabd..ba271160 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -63,7 +63,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err); ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err); ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err); - ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderFactor", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err); return ocl; } @@ -1251,7 +1251,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clReleaseMemObject(mem_output_order_batch); } -void clComputeBlockZeroingOrderFactor( +void clComputeBlockZeroingOrder( const channel_info orig_channel[3], const float *orig_image_batch, const float *mask_scale, @@ -1285,7 +1285,6 @@ void clComputeBlockZeroingOrderFactor( mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); - } cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); @@ -1293,12 +1292,12 @@ void clComputeBlockZeroingOrderFactor( int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size); cl_float clBlockErrorLimit = BlockErrorLimit; - cl_int clWidth = image_width; - cl_int clHeight = image_height; - cl_int clFactor 
= factor; + cl_int clWidth = image_width; + cl_int clHeight = image_height; + cl_int clFactor = factor; cl_int clMask = comp_mask; - cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR]; + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 3a20eaa1..cd5a1524 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -25,7 +25,7 @@ void clComputeBlockZeroingOrder(const coeff_t *orig_batch, float BlockErrorLimit, guetzli::CoeffData *output_order_batch); -void clComputeBlockZeroingOrderFactor( +void clComputeBlockZeroingOrder( const channel_info orig_channel[3], const float *orig_image_batch, const float *mask_scale, diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index c188acc9..15e115af 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -64,7 +64,7 @@ enum KernelName { KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, KERNEL_EDGEDETECTORLOWFREQ, - KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR, + KERNEL_COMPUTEBLOCKZEROINGORDER, KERNEL_COUNT, }; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 7a8612c7..7540e470 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -74,10 +74,6 @@ class Processor { const int block_x, const int block_y, const int factor_x, const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2, std::vector* output_order); - /* - void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], - const int block_x, const int block_y, std::vector* output_order); - */ bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, int best_q[3][kDCTBlockSize], @@ -659,7 +655,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im 
output_order_gpu.resize(num_blocks * kBlockSize); output_order = output_order_gpu.data(); - clComputeBlockZeroingOrderFactor(orig_channel, + clComputeBlockZeroingOrder(orig_channel, comp->imgOpsinDynamicsBlockList.data(), comp->imgMaskXyzScaleBlockList.data(), width, From 1f87bb2244092f215012e6ff8c6ec30bb409c7ab Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 21 May 2017 01:04:23 +0800 Subject: [PATCH 097/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli/processor.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 7540e470..32cb13bb 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -31,7 +31,6 @@ #include "guetzli/jpeg_data_writer.h" #include "guetzli/output_image.h" #include "guetzli/quantize.h" -#include "clguetzli\clguetzli_comparator.h" #include "clguetzli\clguetzli.h" namespace guetzli { From add84365e698c488ccc5cec9d66d05b8ea3ef872 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 22 May 2017 09:36:32 +0800 Subject: [PATCH 098/189] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E7=BC=96=E8=AF=91?= =?UTF-8?q?=E4=BA=8B=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 11a2f227..3b6abf4e 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -129,7 +129,8 @@ - copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl + + @@ -171,7 +172,8 @@ $(INTELOCLSDKROOT)lib\x64 - copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl + + @@ -384,7 +386,6 @@ Document - "C:\Users\strongtu\Documents\Project\git_strong\guetzli\clguetzli";%(Include) From 8f80356104d359a7f855e76749689b7caa610c7d Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 22 May 2017 11:06:18 +0800 Subject: [PATCH 099/189] 
=?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl.cpp | 54 +- clguetzli/clguetzli.cpp | 57 -- clguetzli/clguetzli.h | 10 - guetzli.vcxproj | 1 + guetzli.vcxproj.filters | 3 + guetzli/processor.cc | 494 +++++++----------- .../butteraugli/butteraugli/butteraugli.cc | 35 +- 7 files changed, 213 insertions(+), 441 deletions(-) diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index 4075ef94..0a05b038 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -93,47 +93,37 @@ namespace guetzli imgMaskXyzScaleBlockList.clear(); } - void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) - { - ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y); - } - double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); - return err; if (g_checkOpenCL) { - double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); - if (err1 != err) + channel_info mayout_channel[3]; + for (int c = 0; c < 3; c++) { - LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__); + mayout_channel[c].block_height = img.component(c).height_in_blocks(); + mayout_channel[c].block_width = img.component(c).width_in_blocks(); + mayout_channel[c].factor = img.component(c).factor_x(); + mayout_channel[c].pixel = img.component(c).pixels(); + mayout_channel[c].coeff = img.component(c).coeffs(); } - } - return err; - } - - double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const - { - channel_info mayout_channel[3]; - for (int c = 0; c < 3; c++) - { - mayout_channel[c].block_height = 
img.component(c).height_in_blocks(); - mayout_channel[c].block_width = img.component(c).width_in_blocks(); - mayout_channel[c].factor = img.component(c).factor_x(); - mayout_channel[c].pixel = img.component(c).pixels(); - mayout_channel[c].coeff = img.component(c).coeffs(); + double err2 = CompareBlockFactor(mayout_channel, + candidate_block, + block_x_, + block_y_, + imgOpsinDynamicsBlockList.data(), + imgMaskXyzScaleBlockList.data(), + width_, + height_, + factor_x_); + + if (err != err2) + { + LogError("CompareBlock miss %s(%d) \r\n", __FUNCTION__, __LINE__); + } } - return CompareBlockFactor(mayout_channel, - candidate_block, - block_x_, - block_y_, - imgOpsinDynamicsBlockList.data(), - imgMaskXyzScaleBlockList.data(), - width_, - height_, - factor_x_); + return err; } } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index ba271160..67eb4918 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1194,63 +1194,6 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, clReleaseMemObject(mem_result); } -// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é -void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch, // ԭʼͼÏñϵÊý - const float *orig_image_batch, // ԭʼͼÏñpregammaºó - const float* orig_mask_scale_batch, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý - const guetzli::coeff_t *mayout_batch, // Êä³ö±¸Ñ¡Í¼µÄϵÊý - int size, // - float BlockErrorLimit, - guetzli::CoeffData *output_order_batch) // -{ - using namespace guetzli; - - int item_count = 3 * kDCTBlockSize * size; - - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - - cl_mem mem_orig_batch = ocl.allocMem(sizeof(::coeff_t) * item_count, orig_batch); - cl_mem mem_orig_image_batch = ocl.allocMem(sizeof(float) * item_count, orig_image_batch); - cl_mem mem_mask_scale_batch = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch); - cl_mem mem_mayout_batch = ocl.allocMem(sizeof(::coeff_t) * item_count, mayout_batch); - cl_mem mem_output_order_batch = 
ocl.allocMem(sizeof(CoeffData) * item_count); - cl_float clBlockErrorLimit = BlockErrorLimit; - - cl_kernel kernel = 0;// ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mayout_batch); - clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_batch); - - size_t globalWorkSize[1] = { size }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); - } - - CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - memcpy(output_order_batch, result, sizeof(CoeffData) * item_count); - - clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL); - clFinish(ocl.commandQueue); - - clReleaseMemObject(mem_orig_batch); - clReleaseMemObject(mem_orig_image_batch); - clReleaseMemObject(mem_mask_scale_batch); - clReleaseMemObject(mem_mayout_batch); - clReleaseMemObject(mem_output_order_batch); -} - void clComputeBlockZeroingOrder( const channel_info orig_channel[3], const float *orig_image_batch, diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index cd5a1524..4e6f3209 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -17,14 +17,6 @@ void clDiffmapOpsinDynamicsImage(const float* r, 
const float* g, const float* b, size_t step, float* result); -void clComputeBlockZeroingOrder(const coeff_t *orig_batch, - const float *orig_image_batch, - const float* orig_mask_scale_batch, - const coeff_t *mayout_batch, - int size, - float BlockErrorLimit, - guetzli::CoeffData *output_order_batch); - void clComputeBlockZeroingOrder( const channel_info orig_channel[3], const float *orig_image_batch, @@ -108,10 +100,8 @@ namespace guetzli { void StartBlockComparisons() override; void FinishBlockComparisons() override; - void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override; double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; - double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const; public: std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 3b6abf4e..42a13971 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -196,6 +196,7 @@ + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 4dd1b5ee..fc895c38 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -309,6 +309,9 @@ clguetzli + + clguetzli + diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 32cb13bb..43b513dc 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -51,14 +51,9 @@ class Processor { ProcessStats* stats); private: - - void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, - const uint8_t comp_mask, const double target_mul, - bool stop_early, const OutputImage &img2); - void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, - bool stop_early, const OutputImage &img2); + bool stop_early); void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* 
img, const uint8_t comp_mask, @@ -71,7 +66,7 @@ class Processor { void ComputeBlockZeroingOrder( const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, const int factor_x, - const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2, + const int factor_y, const uint8_t comp_mask, OutputImage* img, std::vector* output_order); bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, @@ -350,7 +345,6 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, const float target_mul_low = 0.95f; QuantData best = TryQuantMatrix(jpg_in, target_mul_high, best_q, img); - for (;;) { int q_next[3][kDCTBlockSize]; if (!qgen.GetNext(q_next)) { @@ -379,7 +373,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, void Processor::ComputeBlockZeroingOrder( const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, const int factor_x, - const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage &img2, + const int factor_y, const uint8_t comp_mask, OutputImage* img, std::vector* output_order) { static const uint8_t oldCsf[kDCTBlockSize] = { 10, 10, 20, 40, 60, 70, 80, 90, @@ -416,19 +410,6 @@ void Processor::ComputeBlockZeroingOrder( coeff_t processed_block[kBlockSize]; memcpy(processed_block, block, sizeof(processed_block)); comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); - - bool bCheck = false; - uint8_t orig_rgb[3][16 * 16] = { 0 }; - if (bCheck) - { - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c) && factor_x == 2) { - if ((block_x + 1) * factor_x * 8 > img->width()) continue; - img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, orig_rgb[c], 1); - } - } - } - while (!input_order.empty()) { float best_err = 1e17f; int best_i = 0; @@ -460,36 +441,6 @@ void Processor::ComputeBlockZeroingOrder( 
best_err = max_err; best_i = i; } - - if (bCheck) - { - // ÿ´Î¶¼Òª»Ö¸´Ò»Ï¿´¿´ - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); - } - } - // ¿´¿´ÏàÁÙ¿éÊDz»Êǻָ´ÁË - uint8_t last_rgb[3][16 * 16] = { 0 }; - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c) && factor_x == 2) { - if ((block_x + 1) * factor_x * 8 > img->width()) continue; - img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, last_rgb[c], 1); - } - } - int count = 0; - for (int c = 0; c < 3; c++) { - for (int k = 0; factor_x == 2 && k < 16 * 16; k++) { - if (last_rgb[c][k] != orig_rgb[c][k]) { - count++; - } - } - } - if (count > 0) - { - LogError("misstake in processing %d:%d block=%d:%d\r\n", count, 16 * 16, block_x, block_y); - } - } } int idx = input_order[best_i].first; processed_block[idx] = 0; @@ -522,23 +473,6 @@ void Processor::ComputeBlockZeroingOrder( block_x, block_y, &block[c * kDCTBlockSize]); } } - - if (bCheck) - { - // ȫͼ¼ì²éһϠ- for (int c = 0; c < 3; c++) - { - int size = img->component(c).pixels_size(); - if (!(comp_mask & (1 << c))) continue; - for (int k = 0; k < size && factor_x == 2; k++) - { - if (img2.component(c).pixels()[k] != img->component(c).pixels()[k]) - { - LogError("misstake in restore\r\n"); - } - } - } - } } namespace { @@ -611,8 +545,8 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace -void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, - const double target_mul, bool stop_early, const OutputImage &img2) +void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, + const double target_mul, bool stop_early) { const int width = img->width(); const int height = img->height(); @@ -689,7 +623,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im } std::vector block_order; - 
ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, img2, &block_order); + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order); CoeffData * p = &output_order_cpu[block_ix * kBlockSize]; for (int i = 0; i < block_order.size(); i++) @@ -747,64 +681,6 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); } -void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, - const uint8_t comp_mask, - const double target_mul, - bool stop_early, - const OutputImage& img2) { - const int width = img->width(); - const int height = img->height(); - const int ncomp = jpg.components.size(); - const int last_c = Log2FloorNonZero(comp_mask); - if (static_cast(last_c) >= jpg.components.size()) return; - const int factor_x = img->component(last_c).factor_x(); - const int factor_y = img->component(last_c).factor_y(); - const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); - const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); - const int num_blocks = block_width * block_height; - - std::vector candidate_coeff_offsets(num_blocks + 1); - std::vector candidate_coeffs; - std::vector candidate_coeff_errors; - candidate_coeffs.reserve(60 * num_blocks); - candidate_coeff_errors.reserve(60 * num_blocks); - std::vector block_order; - block_order.reserve(3 * kDCTBlockSize); - comparator_->StartBlockComparisons(); - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t block[kBlockSize] = { 0 }; - coeff_t orig_block[kBlockSize] = { 0 }; - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - assert(img->component(c).factor_x() == factor_x); - assert(img->component(c).factor_y() == factor_y); - img->component(c).GetCoeffBlock(block_x, 
block_y, - &block[c * kDCTBlockSize]); - const JPEGComponent& comp = jpg.components[c]; - int jpg_block_ix = block_y * comp.width_in_blocks + block_x; - memcpy(&orig_block[c * kDCTBlockSize], - &comp.coeffs[jpg_block_ix * kDCTBlockSize], - kDCTBlockSize * sizeof(orig_block[0])); - } - } - block_order.clear(); - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, - factor_y, comp_mask, img, img2, &block_order); - - candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); - for (size_t i = 0; i < block_order.size(); ++i) { - candidate_coeffs.push_back(block_order[i].idx); - candidate_coeff_errors.push_back(block_order[i].block_err); - } - } - } - comparator_->FinishBlockComparisons(); - candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); - - SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early, - candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); -} void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, @@ -825,183 +701,183 @@ void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); const int num_blocks = block_width * block_height; - std::vector ac_histograms(ncomp); - int jpg_header_size, dc_size; - { - JPEGData jpg_out = jpg; - img->SaveToJpegData(&jpg_out); - jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata); - dc_size = EstimateDCSize(jpg_out); - BuildACHistograms(jpg_out, &ac_histograms[0]); - } - std::vector ac_depths; - int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); - int base_size = jpg_header_size + dc_size + ac_histogram_size + - EntropyCodedDataSize(ac_histograms, ac_depths); - int prev_size = base_size; - - std::vector max_block_error(num_blocks); - std::vector last_indexes(num_blocks); - - bool first_up_iter = true; - for (int direction : {1, -1}) { - for (;;) { - if (stop_early && direction == -1) { - if 
(prev_size > 1.01 * final_output_->jpeg_data.size()) { - // If we are down-adjusting the error, the output size will only keep - // increasing. - // TODO(user): Do this check always by comparing only the size - // of the currently processed components. - break; - } + std::vector ac_histograms(ncomp); + int jpg_header_size, dc_size; + { + JPEGData jpg_out = jpg; + img->SaveToJpegData(&jpg_out); + jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata); + dc_size = EstimateDCSize(jpg_out); + BuildACHistograms(jpg_out, &ac_histograms[0]); + } + std::vector ac_depths; + int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); + int base_size = jpg_header_size + dc_size + ac_histogram_size + + EntropyCodedDataSize(ac_histograms, ac_depths); + int prev_size = base_size; + + std::vector max_block_error(num_blocks); + std::vector last_indexes(num_blocks); + + bool first_up_iter = true; + for (int direction : {1, -1}) { + for (;;) { + if (stop_early && direction == -1) { + if (prev_size > 1.01 * final_output_->jpeg_data.size()) { + // If we are down-adjusting the error, the output size will only keep + // increasing. + // TODO(user): Do this check always by comparing only the size + // of the currently processed components. 
+ break; + } + } + std::vector > global_order; + int blocks_to_change; + std::vector block_weight; + for (int rblock = 1; rblock <= 4; ++rblock) { + block_weight = std::vector(num_blocks); + std::vector distmap(width * height); + if (!first_up_iter) { + distmap = comparator_->distmap(); + } + comparator_->ComputeBlockErrorAdjustmentWeights( + direction, rblock, target_mul, factor_x, factor_y, distmap, + &block_weight); + global_order.clear(); + blocks_to_change = 0; + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + const int last_index = last_indexes[block_ix]; + const int offset = candidate_coeff_offsets[block_ix]; + const int num_candidates = + candidate_coeff_offsets[block_ix + 1] - offset; + const float* candidate_errors = &candidate_coeff_errors[offset]; + const float max_err = max_block_error[block_ix]; + if (block_weight[block_ix] == 0) { + continue; } - std::vector > global_order; - int blocks_to_change; - std::vector block_weight; - for (int rblock = 1; rblock <= 4; ++rblock) { - block_weight = std::vector(num_blocks); - std::vector distmap(width * height); - if (!first_up_iter) { - distmap = comparator_->distmap(); - } - comparator_->ComputeBlockErrorAdjustmentWeights( - direction, rblock, target_mul, factor_x, factor_y, distmap, - &block_weight); - global_order.clear(); - blocks_to_change = 0; - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - const int last_index = last_indexes[block_ix]; - const int offset = candidate_coeff_offsets[block_ix]; - const int num_candidates = - candidate_coeff_offsets[block_ix + 1] - offset; - const float* candidate_errors = &candidate_coeff_errors[offset]; - const float max_err = max_block_error[block_ix]; - if (block_weight[block_ix] == 0) { - continue; - } - if (direction > 0) { - for (size_t i = last_index; i < num_candidates; 
++i) { - float val = ((candidate_errors[i] - max_err) / - block_weight[block_ix]); - global_order.push_back(std::make_pair(block_ix, val)); - } - blocks_to_change += (last_index < num_candidates ? 1 : 0); - } else { - for (int i = last_index - 1; i >= 0; --i) { - float val = ((max_err - candidate_errors[i]) / - block_weight[block_ix]); - global_order.push_back(std::make_pair(block_ix, val)); - } - blocks_to_change += (last_index > 0 ? 1 : 0); - } - } - } - if (!global_order.empty()) { - // If we found something to adjust with the current block adjustment - // radius, we can stop and adjust the blocks we have. - break; - } + if (direction > 0) { + for (size_t i = last_index; i < num_candidates; ++i) { + float val = ((candidate_errors[i] - max_err) / + block_weight[block_ix]); + global_order.push_back(std::make_pair(block_ix, val)); + } + blocks_to_change += (last_index < num_candidates ? 1 : 0); + } else { + for (int i = last_index - 1; i >= 0; --i) { + float val = ((max_err - candidate_errors[i]) / + block_weight[block_ix]); + global_order.push_back(std::make_pair(block_ix, val)); + } + blocks_to_change += (last_index > 0 ? 1 : 0); } + } + } + if (!global_order.empty()) { + // If we found something to adjust with the current block adjustment + // radius, we can stop and adjust the blocks we have. + break; + } + } - if (global_order.empty()) { - break; - } + if (global_order.empty()) { + break; + } - std::sort(global_order.begin(), global_order.end(), + std::sort(global_order.begin(), global_order.end(), [](const std::pair& a, - const std::pair& b) { - return a.second < b.second; }); + const std::pair& b) { + return a.second < b.second; }); - double rel_size_delta = direction > 0 ? 0.01 : 0.0005; - if (direction > 0 && comparator_->DistanceOK(1.0)) { - rel_size_delta = 0.05; - } - double min_size_delta = base_size * rel_size_delta; - - float coeffs_to_change_per_block = - direction > 0 ? 
2.0f : factor_x * factor_y * 0.2f; - int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change; - - if (first_up_iter) { - const float limit = 0.75f * comparator_->BlockErrorLimit(); - auto it = std::partition_point(global_order.begin(), global_order.end(), - [=](const std::pair& a) { - return a.second < limit; }); - min_coeffs_to_change = std::max(min_coeffs_to_change, - it - global_order.begin()); - first_up_iter = false; - } + double rel_size_delta = direction > 0 ? 0.01 : 0.0005; + if (direction > 0 && comparator_->DistanceOK(1.0)) { + rel_size_delta = 0.05; + } + double min_size_delta = base_size * rel_size_delta; + + float coeffs_to_change_per_block = + direction > 0 ? 2.0f : factor_x * factor_y * 0.2f; + int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change; + + if (first_up_iter) { + const float limit = 0.75f * comparator_->BlockErrorLimit(); + auto it = std::partition_point(global_order.begin(), global_order.end(), + [=](const std::pair& a) { + return a.second < limit; }); + min_coeffs_to_change = std::max(min_coeffs_to_change, + it - global_order.begin()); + first_up_iter = false; + } - std::set changed_blocks; - float val_threshold = 0.0; - int changed_coeffs = 0; - int est_jpg_size = prev_size; - for (size_t i = 0; i < global_order.size(); ++i) { - const int block_ix = global_order[i].first; - const int block_x = block_ix % block_width; - const int block_y = block_ix / block_width; - const int last_idx = last_indexes[block_ix]; - const int offset = candidate_coeff_offsets[block_ix]; - const uint8_t* candidates = &candidate_coeffs[offset]; - const int idx = candidates[last_idx + std::min(direction, 0)]; - const int c = idx / kDCTBlockSize; - const int k = idx % kDCTBlockSize; - const int* quant = img->component(c).quant(); - const JPEGComponent& comp = jpg.components[c]; - const int jpg_block_ix = block_y * comp.width_in_blocks + block_x; - const int newval = direction > 0 ? 
0 : Quantize( - comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]); - coeff_t block[kDCTBlockSize] = { 0 }; - img->component(c).GetCoeffBlock(block_x, block_y, block); - UpdateACHistogram(-1, block, quant, &ac_histograms[c]); - block[k] = newval; - UpdateACHistogram(1, block, quant, &ac_histograms[c]); - img->component(c).SetCoeffBlock(block_x, block_y, block); - last_indexes[block_ix] += direction; - changed_blocks.insert(block_ix); - val_threshold = global_order[i].second; - ++changed_coeffs; - static const int kEntropyCodeUpdateFreq = 10; - if (i % kEntropyCodeUpdateFreq == 0) { - ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); - } - est_jpg_size = jpg_header_size + dc_size + ac_histogram_size + - EntropyCodedDataSize(ac_histograms, ac_depths); - if (changed_coeffs > min_coeffs_to_change && - std::abs(est_jpg_size - prev_size) > min_size_delta) { - break; - } - } - size_t global_order_size = global_order.size(); - std::vector>().swap(global_order); + std::set changed_blocks; + float val_threshold = 0.0; + int changed_coeffs = 0; + int est_jpg_size = prev_size; + for (size_t i = 0; i < global_order.size(); ++i) { + const int block_ix = global_order[i].first; + const int block_x = block_ix % block_width; + const int block_y = block_ix / block_width; + const int last_idx = last_indexes[block_ix]; + const int offset = candidate_coeff_offsets[block_ix]; + const uint8_t* candidates = &candidate_coeffs[offset]; + const int idx = candidates[last_idx + std::min(direction, 0)]; + const int c = idx / kDCTBlockSize; + const int k = idx % kDCTBlockSize; + const int* quant = img->component(c).quant(); + const JPEGComponent& comp = jpg.components[c]; + const int jpg_block_ix = block_y * comp.width_in_blocks + block_x; + const int newval = direction > 0 ? 
0 : Quantize( + comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]); + coeff_t block[kDCTBlockSize] = { 0 }; + img->component(c).GetCoeffBlock(block_x, block_y, block); + UpdateACHistogram(-1, block, quant, &ac_histograms[c]); + block[k] = newval; + UpdateACHistogram(1, block, quant, &ac_histograms[c]); + img->component(c).SetCoeffBlock(block_x, block_y, block); + last_indexes[block_ix] += direction; + changed_blocks.insert(block_ix); + val_threshold = global_order[i].second; + ++changed_coeffs; + static const int kEntropyCodeUpdateFreq = 10; + if (i % kEntropyCodeUpdateFreq == 0) { + ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths); + } + est_jpg_size = jpg_header_size + dc_size + ac_histogram_size + + EntropyCodedDataSize(ac_histograms, ac_depths); + if (changed_coeffs > min_coeffs_to_change && + std::abs(est_jpg_size - prev_size) > min_size_delta) { + break; + } + } + size_t global_order_size = global_order.size(); + std::vector>().swap(global_order); - for (int i = 0; i < num_blocks; ++i) { - max_block_error[i] += block_weight[i] * val_threshold * direction; - } + for (int i = 0; i < num_blocks; ++i) { + max_block_error[i] += block_weight[i] * val_threshold * direction; + } - ++stats_->counters[kNumItersCnt]; - ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt]; - std::string encoded_jpg; - { - JPEGData jpg_out = jpg; - img->SaveToJpegData(&jpg_out); - OutputJpeg(jpg_out, &encoded_jpg); - } - GUETZLI_LOG(stats_, - "Iter %2d: %s(%d) %s Coeffs[%d/%zd] " - "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]", - stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(), - comp_mask, direction > 0 ? 
"up" : "down", changed_coeffs, - global_order_size, changed_blocks.size(), - blocks_to_change, num_blocks, val_threshold, - encoded_jpg.size(), - 100.0 - (100.0 * est_jpg_size) / encoded_jpg.size()); - comparator_->Compare(*img); - MaybeOutput(encoded_jpg); - prev_size = est_jpg_size; - } + ++stats_->counters[kNumItersCnt]; + ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt]; + std::string encoded_jpg; + { + JPEGData jpg_out = jpg; + img->SaveToJpegData(&jpg_out); + OutputJpeg(jpg_out, &encoded_jpg); + } + GUETZLI_LOG(stats_, + "Iter %2d: %s(%d) %s Coeffs[%d/%zd] " + "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]", + stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(), + comp_mask, direction > 0 ? "up" : "down", changed_coeffs, + global_order_size, changed_blocks.size(), + blocks_to_change, num_blocks, val_threshold, + encoded_jpg.size(), + 100.0 - (100.0 * est_jpg_size) / encoded_jpg.size()); + comparator_->Compare(*img); + MaybeOutput(encoded_jpg); + prev_size = est_jpg_size; } + } } bool IsGrayscale(const JPEGData& jpg) { @@ -1096,28 +972,12 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in, img.CopyFromJpegData(jpg); img.ApplyGlobalQuantization(best_q); - OutputImage img2(jpg.width, jpg.height); - img2.CopyFromJpegData(jpg); - img2.ApplyGlobalQuantization(best_q); - - for (int c = 0; c < 3; c++) - { - int size = img.component(c).pixels_size(); - for (int k = 0; k < size; k++) - { - if (img2.component(c).pixels()[k] != img.component(c).pixels()[k]) - { - LogError("fdjsalfjlkadsfdsafjdsfjdlsajdklsjf\r\n"); - } - } - } - if (!downsample) { - SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, false, img2); + SelectFrequencyMasking(jpg, &img, 7, 1.0, false); } else { const float ymul = jpg.components.size() == 1 ? 
1.0f : 0.97f; - SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false, img2); - SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true, img2); + SelectFrequencyMasking(jpg, &img, 1, ymul, false); + SelectFrequencyMasking(jpg, &img, 6, 1.0, true); } } @@ -1156,7 +1016,7 @@ bool Process(const Params& params, ProcessStats* stats, if (stats == nullptr) { stats = &dummy_stats; } - std::unique_ptr comparator; + std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { comparator.reset( new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 288bee78..69511051 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1315,41 +1315,26 @@ void _MinSquareVal(size_t square_size, size_t offset, // offset is not negative and smaller than square_size. assert(offset < square_size); std::vector tmp(xsize * ysize); - for (size_t y = 0; y < ysize; ++y) { const size_t minh = offset > y ? 0 : y - offset; const size_t maxh = std::min(ysize, y + square_size - offset); - - float *pTmpPoint = &tmp[y * xsize]; - float *pValuePoint = &values[minh * xsize]; - for (size_t x = 0; x < xsize; ++x) { - float *pValues = pValuePoint++; - float min = *pValues; - - for (size_t j = minh + 1; j < maxh; ++j) { - pValues += xsize; - if (*pValues < min) min = *pValues; - } - *pTmpPoint++ = min; + double min = values[x + minh * xsize]; + for (size_t j = minh + 1; j < maxh; ++j) { + min = fmin(min, values[x + j * xsize]); + } + tmp[x + y * xsize] = static_cast(min); } } for (size_t x = 0; x < xsize; ++x) { const size_t minw = offset > x ? 
0 : x - offset; const size_t maxw = std::min(xsize, x + square_size - offset); - - float *pValuePoint = &values[x]; - float *pTmpPoint = &tmp[minw]; - for (size_t y = 0; y < ysize; ++y) { - float * pTmp = pTmpPoint; pTmpPoint += xsize; - float min = *pTmp; - - for (size_t j = minw + 1; j < maxw; ++j) { - pTmp++; - if (*pTmp < min) min = *pTmp; - } - *pValuePoint = min; pValuePoint += xsize; + double min = tmp[minw + y * xsize]; + for (size_t j = minw + 1; j < maxw; ++j) { + min = fmin(min, tmp[j + y * xsize]); + } + values[x + y * xsize] = static_cast(min); } } } From f766120bc1fa6dadd6eee4e6a3c44724b918cf90 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 22 May 2017 11:22:36 +0800 Subject: [PATCH 100/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 16 ++--- clguetzli/clguetzli.cl.h | 141 +++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 80 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 814c4157..0f97ad8e 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -3167,8 +3167,8 @@ __kernel void clComputeBlockZeroingOrder( __global const coeff_t *orig_batch_2, // ԭʼͼÏñϵÊý __global const float *orig_image_batch, // ԭʼͼÏñpregamma __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý - int image_width, - int image_height, + const int image_width, + const int image_height, __global const coeff_t *mayout_batch_0, // Êä³ö±¸Ñ¡Í¼µÄϵÊý __global const coeff_t *mayout_batch_1, // Êä³ö±¸Ñ¡Í¼µÄϵÊý @@ -3177,12 +3177,12 @@ __kernel void clComputeBlockZeroingOrder( __global const ushort *mayout_pixel_1, __global const ushort *mayout_pixel_2, - channel_info mayout_channel_0, - channel_info mayout_channel_1, - channel_info mayout_channel_2, - int factor, // µ±Ç°²ÎÓëÔËËãµÄfactor - int comp_mask, // µ±Ç°²ÎÓëÔËËãµÄchannel - float BlockErrorLimit, + const channel_info mayout_channel_0, + const 
channel_info mayout_channel_1, + const channel_info mayout_channel_2, + const int factor, // µ±Ç°²ÎÓëÔËËãµÄfactor + const int comp_mask, // µ±Ç°²ÎÓëÔËËãµÄchannel + const float BlockErrorLimit, __global CoeffData *output_order_list/*out*/) { const int block_x = get_global_id(0); diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 53a89eef..529ca141 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -2,83 +2,80 @@ #define __CLGUETZLI_CL_H__ #ifdef __cplusplus + #define __kernel + #define __private + #define __global + #define __constant + typedef unsigned char uchar; + typedef unsigned short ushort; -#define __kernel -#define __private -#define __global -#define __constant -typedef unsigned char uchar; -typedef unsigned short ushort; + int get_global_id(int dim); + int get_global_size(int dim); + void set_global_id(int dim, int id); + void set_global_size(int dim, int size); -int get_global_id(int dim); -int get_global_size(int dim); -void set_global_id(int dim, int id); -void set_global_size(int dim, int size); - -#ifdef __opencl -typedef union ocl_channels_t -{ - struct - { - float * r; - float * g; - float * b; - }; - union - { - float *ch[3]; - }; -}ocl_channels; -#else -typedef union ocl_channels_t -{ - struct - { - cl_mem r; - cl_mem g; - cl_mem b; - }; - struct - { - cl_mem x; - cl_mem y; - cl_mem b; - }; - union + #ifdef __opencl + typedef union ocl_channels_t + { + struct + { + float * r; + float * g; + float * b; + }; + union + { + float *ch[3]; + }; + }ocl_channels; + #else + typedef union ocl_channels_t + { + struct + { + cl_mem r; + cl_mem g; + cl_mem b; + }; + struct + { + cl_mem x; + cl_mem y; + cl_mem b; + }; + union + { + cl_mem ch[3]; + }; + }ocl_channels; + #endif +#else /*__cplusplus*/ + typedef union ocl_channels_t { - cl_mem ch[3]; - }; -}ocl_channels; + struct + { + float * r; + float * g; + float * b; + }; -#endif + union + { + float *ch[3]; + }; + }ocl_channels; -#else -typedef union ocl_channels_t -{ - 
struct - { - float * r; - float * g; - float * b; - }; - - union - { - float *ch[3]; - }; -}ocl_channels; +#endif /*__cplusplus*/ -#endif + typedef short coeff_t; -typedef short coeff_t; - -typedef struct __channel_info_t -{ - int factor; - int block_width; - int block_height; - __global const coeff_t *coeff; - __global const ushort *pixel; -}channel_info; + typedef struct __channel_info_t + { + int factor; + int block_width; + int block_height; + __global const coeff_t *coeff; + __global const ushort *pixel; + }channel_info; -#endif \ No newline at end of file +#endif /*__CLGUETZLI_CL_H__*/ \ No newline at end of file From 264209c16c356be19ad805858bc80215d8ef627e Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 22 May 2017 11:51:11 +0800 Subject: [PATCH 101/189] =?UTF-8?q?const=20=E6=8E=A7=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 127 ++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 83 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 0f97ad8e..b9ee9d1e 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -13,7 +13,7 @@ double InterpolateClampNegative(__global const double *array, int size, double s void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, double r1, double g1, double b1, double factor, double res[3]); -double DotProduct(__global float u[3], double v[3]); +double DotProduct(__global const float u[3], const double v[3]); void OpsinAbsorbance(const double in[3], double out[3]); void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz); double Gamma(double v); @@ -27,13 +27,13 @@ void Butteraugli8x8CornerEdgeDetectorDiff( int pos_y, int xsize, int ysize, - __global float *r, __global float *g, __global float* b, - __global float *r2, __global float* g2, __global float *b2, + __global const float *r, __global const float *g, __global const float* b, + 
__global const float *r2, __global const float* g2, __global const float *b2, double* diff_xyb); __kernel void clOpsinDynamicsImage( __global float *r, __global float *g, __global float *b, - __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, + __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred, int size) { const int i = get_global_id(0); @@ -60,7 +60,7 @@ __kernel void clOpsinDynamicsImage( b[i] = z; } -__kernel void clMinSquareVal(__global float* pA, __global float* pC, int square_size, int offset) +__kernel void clMinSquareVal(__global const float* pA, __global float* pC, int square_size, int offset) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -87,7 +87,7 @@ __kernel void clMinSquareVal(__global float* pA, __global float* pC, int square_ pC[y * width + x] = minValue; } -__kernel void clConvolutionX(__global float* multipliers, __global float* inp, __global float* result, +__kernel void clConvolutionX(__global const float* multipliers, __global const float* inp, __global float* result, int step, int len, int offset, float border_ratio) { const int x = get_global_id(0); @@ -125,7 +125,7 @@ __kernel void clConvolutionX(__global float* multipliers, __global float* inp, _ result[y * xsize + x] = sum * scale; } -__kernel void clConvolutionY(__global float* multipliers, __global float* inp, __global float* result, +__kernel void clConvolutionY(__global const float* multipliers, __global const float* inp, __global float* result, int step, int len, int offset, float border_ratio) { const int x = get_global_id(0); @@ -164,7 +164,7 @@ __kernel void clConvolutionY(__global float* multipliers, __global float* inp, _ result[y * xsize + x] = sum * scale; } -__kernel void clConvolution(__global float* multipliers, __global float* inp, __global float* result, +__kernel void clConvolution(__global const float* multipliers, __global const float* inp, __global float* result, int 
xsize, int xstep, int len, int offset, float border_ratio) { const int ox = get_global_id(0); @@ -202,7 +202,7 @@ __kernel void clConvolution(__global float* multipliers, __global float* inp, __ result[ox * ysize + y] = sum * scale; } -__kernel void clSquareSample(__global float* pA, __global float* pC, int xstep, int ystep) +__kernel void clSquareSample(__global const float* pA, __global float* pC, int xstep, int ystep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -218,7 +218,7 @@ __kernel void clSquareSample(__global float* pA, __global float* pC, int xstep, pC[y * xsize + x] = pA[y_sample * xsize + x_sample]; } -__kernel void clDownSample(__global float* pA, __global float* pC, int xstep, int ystep) +__kernel void clDownSample(__global const float* pA, __global float* pC, int xstep, int ystep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -240,7 +240,7 @@ __kernel void clScaleImage(double scale, __global float *result) result[i] *= scale; } -__kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out) +__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -251,7 +251,7 @@ __kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __ out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; } -__kernel void clAddBorder(__global float *out, int s, int s2, __global float *in) +__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -264,10 +264,10 @@ __kernel void clAddBorder(__global float *out, int s, int s2, __global float *in } __kernel void clCombineChannels( - __global float *mask_x, __global float *mask_y, __global float *mask_b, - __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, - __global float 
*block_diff_dc, - __global float *block_diff_ac, + __global const float *mask_x, __global const float *mask_y, __global const float *mask_b, + __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b, + __global const float *block_diff_dc, + __global const float *block_diff_ac, __global float *edge_detector_map, int xsize, int ysize, int res_xsize, @@ -296,8 +296,8 @@ __kernel void clCombineChannels( } __kernel void clDiffPrecompute( - __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, - __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b, + __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b, __global float *mask_x, __global float *mask_y, __global float *mask_b) { const int x = get_global_id(0); @@ -362,8 +362,8 @@ __kernel void clDiffPrecompute( } __kernel void clEdgeDetectorMap(__global float *result, - __global float *r, __global float *g, __global float* b, - __global float *r2, __global float* g2, __global float *b2, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, int xsize, int ysize, int step) { const int res_x = get_global_id(0); @@ -394,8 +394,8 @@ __kernel void clEdgeDetectorMap(__global float *result, } __kernel void clEdgeDetectorLowFreq(__global float *result, - __global float *r, __global float *g, __global float* b, - __global float *r2, __global float* g2, __global float *b2, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, int xsize, int ysize, int step) { const int res_x = get_global_id(0); @@ -415,8 +415,8 @@ __kernel void clEdgeDetectorLowFreq(__global float *result, int ix = pos_y * xsize + pos_x; double diff[4][3]; - 
__global float* blurred0[3] = { r, g, b }; - __global float* blurred1[3] = { r2, g2, b2 }; + __global const float* blurred0[3] = { r, g, b }; + __global const float* blurred1[3] = { r2, g2, b2 }; for (int i = 0; i < 3; ++i) { int ix2 = ix + 8; @@ -459,8 +459,8 @@ __kernel void clEdgeDetectorLowFreq(__global float *result, __kernel void clDoMask( __global float *mask_x, __global float *mask_y, __global float *mask_b, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, - __global double *lut_x, __global double *lut_y, __global double *lut_b, - __global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b) + __global const double *lut_x, __global const double *lut_y, __global const double *lut_b, + __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b) { const double w00 = 232.206464018; const double w11 = 22.9455222245; @@ -489,8 +489,8 @@ __kernel void clDoMask( } -__kernel void clBlockDiffMap(__global float* r, __global float* g, __global float* b, - __global float* r2, __global float* g2, __global float* b2, +__kernel void clBlockDiffMap(__global const float* r, __global const float* g, __global const float* b, + __global const float* r2, __global const float* g2, __global const float* b2, __global float* block_diff_dc, __global float* block_diff_ac, int xsize, int ysize, int step) { @@ -549,8 +549,8 @@ __kernel void clBlockDiffMap(__global float* r, __global float* g, __global floa __kernel void clMaskHighIntensityChange( __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, - __global float *c0_x, __global float *c0_y, __global float *c0_b, - __global float *c1_x, __global float *c1_y, __global float *c1_b + __global const float *c0_x, __global const float *c0_y, __global const float *c0_b, + __global const float *c1_x, __global const float *c1_y, __global const float *c1_b ) { const 
int x = get_global_id(0); @@ -603,7 +603,7 @@ __kernel void clMaskHighIntensityChange( xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); } -__kernel void clUpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out) +__kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out) { const int res_x = get_global_id(0); const int res_y = get_global_id(1); @@ -636,7 +636,7 @@ __kernel void clUpsampleSquareRoot(__global float *diffmap, int xsize, int ysize } } -__kernel void clAverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1) +__kernel void clAverageAddImage(__global float *img, __global const float *tmp0, __global const float *tmp1) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -694,18 +694,13 @@ __kernel void clAverageAddImage(__global float *img, __global float *tmp0, __glo } } - - - - - void Butteraugli8x8CornerEdgeDetectorDiff( int pos_x, int pos_y, int xsize, int ysize, - __global float *r, __global float *g, __global float* b, - __global float *r2, __global float* g2, __global float *b2, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, double* diff_xyb) { int local_count = 0; @@ -755,13 +750,11 @@ void Butteraugli8x8CornerEdgeDetectorDiff( } } - - -double DotProduct(__global float u[3], double v[3]) { +double DotProduct(__global const float u[3], const double v[3]) { return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; } -double Interpolate(__constant double *array, int size, double sx) { +double Interpolate(__constant const double *array, const int size, const double sx) { double ix = fabs(sx); int baseix = (int)(ix); @@ -843,7 +836,6 @@ void XybToVals( *valz = zmul * z; } - #define XybLowFreqToVals_inc 5.2511644570349185 __constant double XybLowFreqToVals_lut[21] = { 0, @@ -882,7 +874,6 
@@ void XybLowFreqToVals(double x, double y, double z, *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul); } - double InterpolateClampNegative(__global const double *array, int size, double sx) { if (sx < 0) { @@ -932,7 +923,6 @@ typedef struct __Complex double imag; }Complex; - __constant double kSqrtHalf = 0.70710678118654752440084436210484903; void RealFFT8(const double* in, Complex* out) { double t1, t2, t3, t5, t6, t7, t8; @@ -1412,10 +1402,6 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double * *valz = b; } - -///================================================== -// ¸÷룬ÒÔÏÂÕâЩº¯Êý¾ÍÊÇΪÁËʵÏÖButteraugliComparatorEx::CompareBlockEx - // IntFloatPairÊÇΪÁËÄ£Äâoutput_order input_orderµÄvector£¬µ«ÊÇ´óС¹Ì¶¨Îª8x8 typedef struct __IntFloatPair { @@ -2015,7 +2001,6 @@ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) } } - void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) { const int block_x = 0; @@ -2033,28 +2018,6 @@ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) } } } -/* -void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8]) -{ - const int block_x = 0; - const int block_y = 0; - const int width_ = 8; - const int height_ = 8; - - for (int iy = 0; iy < 8; ++iy) - { - for (int ix = 0; ix < 8; ++ix) - { - int x = 8 * block_x + ix; - int y = 8 * block_y + iy; - if (x >= width_ || y >= height_) continue; - int p = y * width_ + x; - pixels_[p] = idct[8 * iy + ix] << 4; - } - } -} -*/ - void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) { @@ -2770,8 +2733,8 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float void Convolution(size_t xsize, size_t ysize, int xstep, int len, int offset, - float* multipliers, - float* inp, + const float* multipliers, + const float* inp, float border_ratio, float* result) { @@ -2802,7 +2765,7 @@ void 
Convolution(size_t xsize, size_t ysize, // ian todo // ¼ÆËã½á¹ûÊä³öµ½output -void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) +void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) { // ²Î¿¼clBlurEx2µÄʵÏÖ£¬sigma = 1.1£¬Õâʱstep¡¢diff¶¼½«ÌØ»¯Îª¹Ì¶¨Öµ const double sigma = 1.1; @@ -2826,10 +2789,9 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, border_ratio, output); } - // ian todo void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b, - __private float *r_blurred, __private float *g_blurred, __private float *b_blurred, + __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred, int size) { for (size_t i = 0; i < size; ++i) { @@ -2860,8 +2822,8 @@ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private f // chrisk todo void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, float *xyb1_x, float *xyb1_y, float *xyb1_b, - float *c0_x, float *c0_y, float *c0_b, - float *c1_x, float *c1_y, float *c1_b, + const float *c0_x, const float *c0_y, const float *c0_b, + const float *c1_x, const float *c1_y, const float *c1_b, int xsize, int ysize) { for (int x = 0; x < xsize; ++x) @@ -2925,7 +2887,7 @@ void floatcopy(float *dst, const float *src, int size) } } -void coeffcopy_g(coeff_t *dst, const __global coeff_t *src, int size) +void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size) { for (int i = 0; i < size; i++) { @@ -3015,8 +2977,7 @@ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8] int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], const __global float *orig_image_batch, - int width_, - int height_, + int width_, int height_, int block_x, int block_y, int factor, int off_x, int off_y) From ea15082c300d7293de92113cfb7ec3a8be10404e Mon Sep 17 00:00:00 2001 From: ianhuang-777 
<306168910@qq.com> Date: Mon, 22 May 2017 13:10:39 +0800 Subject: [PATCH 102/189] Fix Average5x5 --- clguetzli/clguetzli.cl | 72 +++++++++++++----------------------- clguetzli/clguetzli_test.cpp | 6 +-- 2 files changed, 29 insertions(+), 49 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index b9ee9d1e..ec745d36 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -644,54 +644,34 @@ __kernel void clAverageAddImage(__global float *img, __global const float *tmp0, const int ysize = get_global_size(1); const int row0 = y * xsize; - if (x == 0) // excute once per y - { - img[row0 + 1] += tmp0[row0]; - img[row0 + 0] += tmp0[row0 + 1]; - img[row0 + 2] += tmp0[row0 + 1]; - - img[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; - img[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; - img[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; - - if (y > 0) { - const int rowd1 = row0 - xsize; - img[rowd1 + 1] += tmp1[row0]; - img[rowd1 + 0] += tmp0[row0]; - - img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; - img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; - } - if (y + 1 < ysize) { - const int rowu1 = row0 + xsize; - img[rowu1 + 1] += tmp1[row0]; - img[rowu1 + 0] += tmp0[row0]; - - img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; - img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; - } - } + if (x - 1 >= 0) { + img[row0 + x] += tmp0[row0 + x - 1]; + } + if (x + 1 < xsize) { + img[row0 + x] += tmp0[row0 + x + 1]; + } - if (x >= 2 && x < xsize - 2) - { - img[row0 + x - 1] += tmp0[row0 + x]; - img[row0 + x + 1] += tmp0[row0 + x]; - } + if (y > 0) { + const int rowd1 = row0 - xsize; + if (x - 1 >= 0) { + img[row0 + x] += tmp1[rowd1 + x - 1]; + } + img[row0 + x] += tmp0[rowd1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += tmp1[rowd1 + x + 1]; + } + } - if (x >= 1 && x < xsize - 1) { - if (y > 0) { - const int rowd1 = row0 - xsize; - img[rowd1 + x + 1] += tmp1[row0 + x]; - img[rowd1 + x + 0] += tmp0[row0 + x]; - img[rowd1 + x - 1] += tmp1[row0 + x]; - 
} - if (y + 1 < ysize) { - const int rowu1 = row0 + xsize; - img[rowu1 + x + 1] += tmp1[row0 + x]; - img[rowu1 + x + 0] += tmp0[row0 + x]; - img[rowu1 + x - 1] += tmp1[row0 + x]; - } - } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + if (x - 1 >= 0) { + img[row0 + x] += tmp1[rowu1 + x - 1]; + } + img[row0 + x] += tmp0[rowu1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += tmp1[rowu1 + x + 1]; + } + } } void Butteraugli8x8CornerEdgeDetectorDiff( diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 6dca483f..9d3d05a7 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -454,12 +454,12 @@ void tclDiffPrecompute( FLOAT_COMPARE(r_y, (*mask_cmp)[1].data(), xsize * ysize); FLOAT_COMPARE(r_b, (*mask_cmp)[2].data(), xsize * ysize); - ocl.releaseMemChannels(cl_xyb0); - ocl.releaseMemChannels(cl_xyb1); - ocl.releaseMemChannels(cl_mask); clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, 0, NULL, NULL); clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, 0, NULL, NULL); clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, 0, NULL, NULL); + ocl.releaseMemChannels(cl_xyb0); + ocl.releaseMemChannels(cl_xyb1); + ocl.releaseMemChannels(cl_mask); } // ian todo From 64968862268f04d82ff46c68bc8be7f5df892346 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Mon, 22 May 2017 14:53:42 +0800 Subject: [PATCH 103/189] Inline ScaleIamge in kernel Average5x5 --- clguetzli/clguetzli.cl | 22 +++++++------ clguetzli/clguetzli.cpp | 68 ++++++++++++----------------------------- clguetzli/ocl.h | 2 +- 3 files changed, 34 insertions(+), 58 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index ec745d36..152961dc 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -636,42 +636,46 @@ __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int } } -__kernel void clAverageAddImage(__global float *img, __global const float *tmp0, __global const 
float *tmp1) +__kernel void clAverage5x5(__global float *img, __global const float *img_org) { const int x = get_global_id(0); const int y = get_global_id(1); const int xsize = get_global_size(0); const int ysize = get_global_size(1); + const float w = 0.679144890667f; + const float scale = 1.0f / (5.0f + 4 * w); const int row0 = y * xsize; if (x - 1 >= 0) { - img[row0 + x] += tmp0[row0 + x - 1]; + img[row0 + x] += img_org[row0 + x - 1]; } if (x + 1 < xsize) { - img[row0 + x] += tmp0[row0 + x + 1]; + img[row0 + x] += img_org[row0 + x + 1]; } if (y > 0) { const int rowd1 = row0 - xsize; if (x - 1 >= 0) { - img[row0 + x] += tmp1[rowd1 + x - 1]; + img[row0 + x] += img_org[rowd1 + x - 1] * w; } - img[row0 + x] += tmp0[rowd1 + x]; + img[row0 + x] += img_org[rowd1 + x]; if (x + 1 < xsize) { - img[row0 + x] += tmp1[rowd1 + x + 1]; + img[row0 + x] += img_org[rowd1 + x + 1] * w; } } if (y + 1 < ysize) { const int rowu1 = row0 + xsize; if (x - 1 >= 0) { - img[row0 + x] += tmp1[rowu1 + x - 1]; + img[row0 + x] += img_org[rowu1 + x - 1] * w; } - img[row0 + x] += tmp0[rowu1 + x]; + img[row0 + x] += img_org[rowu1 + x]; if (x + 1 < xsize) { - img[row0 + x] += tmp1[rowu1 + x + 1]; + img[row0 + x] += img_org[rowu1 + x + 1] * w; } } + + img[row0 + x] *= scale; } void Butteraugli8x8CornerEdgeDetectorDiff( diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 67eb4918..6be1e014 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -59,7 +59,7 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRoot", &err); ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorder", &err); ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorder", &err); - ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "clAverageAddImage", &err); + ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5", &err); ocl.kernel[KERNEL_EDGEDETECTOR] = 
clCreateKernel(ocl.program, "clEdgeDetectorMap", &err); ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err); ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err); @@ -642,29 +642,6 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) } } -void clAverageAddImage(cl_mem img, cl_mem tmp0, cl_mem tmp1, size_t xsize, size_t ysize) -{ - cl_int err = CL_SUCCESS; - ocl_args_d_t &ocl = getOcl(); - - cl_kernel kernel = ocl.kernel[KERNEL_AVERAGEADDIMAGE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp0); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&tmp1); - - size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clAverageAddImage() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clAverageAddImage() clFinish returned %s.\n", TranslateOpenCLError(err)); - } -} - void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) { if (xsize < 4 || ysize < 4) { @@ -677,30 +654,25 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) size_t len = xsize * ysize * sizeof(float); ocl.allocA(len); - ocl.allocB(len); - ocl.allocC(len); - cl_mem result = ocl.srcA; - cl_mem tmp0 = ocl.srcB; - cl_mem tmp1 = ocl.dstMem; - - err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, len, 0, NULL, NULL); - err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp0, 0, 0, len, 0, NULL, NULL); - err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp1, 0, 0, len, 0, NULL, NULL); - - static const float w = 0.679144890667f; - static const float scale = 1.0f / (5.0f + 4 * w); - - clScaleImageEx(tmp1, xsize * ysize, w); - clAverageAddImage(result, tmp0, tmp1, xsize, 
ysize); - - err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, len, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clAverage5x5Ex() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - - clScaleImageEx(img, xsize * ysize, scale); + cl_mem tmp = ocl.srcA; + + err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp, 0, 0, len, 0, NULL, NULL); + + cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp); + + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clAverage5x5Ex() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clAverage5x5Ex() clFinish returned %s.\n", TranslateOpenCLError(err)); + } } void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset) diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 15e115af..b74a8a58 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -60,7 +60,7 @@ enum KernelName { KERNEL_UPSAMPLESQUAREROOT, KERNEL_ADDBORDER, KERNEL_REMOVEBORDER, - KERNEL_AVERAGEADDIMAGE, + KERNEL_AVERAGE5X5, KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, KERNEL_EDGEDETECTORLOWFREQ, From 89cda39c53d9281f5f1fe6d86e3118d408671f92 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Mon, 22 May 2017 15:38:01 +0800 Subject: [PATCH 104/189] Avoid const value computing in work item --- clguetzli/clguetzli.cl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 152961dc..5548e7a1 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -462,16 +462,16 @@ __kernel void 
clDoMask( __global const double *lut_x, __global const double *lut_y, __global const double *lut_b, __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b) { - const double w00 = 232.206464018; - const double w11 = 22.9455222245; - const double w22 = 503.962310606; - const int x = get_global_id(0); const int y = get_global_id(1); const int xsize = get_global_size(0); const int ysize = get_global_size(1); + const double w00 = 232.206464018; + const double w11 = 22.9455222245; + const double w22 = 503.962310606; + const size_t idx = y * xsize + x; const double s0 = mask_x[idx]; const double s1 = mask_y[idx]; @@ -636,6 +636,8 @@ __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int } } +#define Average5x5_w 0.679144890667f +__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w); __kernel void clAverage5x5(__global float *img, __global const float *img_org) { const int x = get_global_id(0); @@ -643,8 +645,6 @@ __kernel void clAverage5x5(__global float *img, __global const float *img_org) const int xsize = get_global_size(0); const int ysize = get_global_size(1); - const float w = 0.679144890667f; - const float scale = 1.0f / (5.0f + 4 * w); const int row0 = y * xsize; if (x - 1 >= 0) { img[row0 + x] += img_org[row0 + x - 1]; @@ -656,26 +656,26 @@ __kernel void clAverage5x5(__global float *img, __global const float *img_org) if (y > 0) { const int rowd1 = row0 - xsize; if (x - 1 >= 0) { - img[row0 + x] += img_org[rowd1 + x - 1] * w; + img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w; } img[row0 + x] += img_org[rowd1 + x]; if (x + 1 < xsize) { - img[row0 + x] += img_org[rowd1 + x + 1] * w; + img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w; } } if (y + 1 < ysize) { const int rowu1 = row0 + xsize; if (x - 1 >= 0) { - img[row0 + x] += img_org[rowu1 + x - 1] * w; + img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w; } img[row0 + x] += img_org[rowu1 + x]; if (x + 1 < xsize) { 
- img[row0 + x] += img_org[rowu1 + x + 1] * w; + img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w; } } - img[row0 + x] *= scale; + img[row0 + x] *= Average5x5_scale; } void Butteraugli8x8CornerEdgeDetectorDiff( From f54bc0ea0177358e818389f3631ee6042abe3376 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Mon, 22 May 2017 15:42:00 +0800 Subject: [PATCH 105/189] Fix tclCalculateDiffmap --- clguetzli/clguetzli_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 9d3d05a7..1797fe66 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -321,10 +321,10 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize, cl_mem mem_diffmap = ocl.allocMem(length); clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL); clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step); - //cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err); - //err = clFinish(ocl.commandQueue); - //FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize); - //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL); + cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL); clReleaseMemObject(mem_diffmap); } From 36f2e52e8c7d00494934c8258a9f1042f8abf1c2 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 24 May 2017 11:49:57 +0800 Subject: [PATCH 106/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- clguetzli/clbutter_comparator.h | 2 +- clguetzli/clguetzli.cl | 8 +- clguetzli/clguetzli.cl.h | 2 +- 
clguetzli/clguetzli.cpp | 4 +- clguetzli/clguetzli.h | 6 +- clguetzli/ocl.cpp | 6 + clguetzli/ocl.h | 2 +- clguetzli/utils.cpp | 11 +- guetzli.make | 190 ++++++++++++++++-- guetzli/butteraugli_comparator.h | 2 +- guetzli/guetzli.cc | 2 +- guetzli/processor.cc | 2 +- guetzli_static.make | 181 +++++++++++++++-- premake5.lua | 12 +- .../butteraugli/butteraugli/butteraugli.cc | 6 +- 15 files changed, 374 insertions(+), 62 deletions(-) diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h index 19ca163f..41e332ed 100644 --- a/clguetzli/clbutter_comparator.h +++ b/clguetzli/clbutter_comparator.h @@ -1,6 +1,6 @@ #pragma once #include -#include "butteraugli\butteraugli.h" +#include "butteraugli/butteraugli.h" #define __restrict__ diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 5548e7a1..9722b08d 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1,6 +1,6 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable -#include "clguetzli\clguetzli.cl.h" +#include "clguetzli/clguetzli.cl.h" #define kBlockEdge 8 #define kBlockSize (kBlockEdge * kBlockEdge) @@ -258,6 +258,12 @@ __kernel void clAddBorder(__global float *out, int s, int s2, __global const flo const int xsize = get_global_size(0); const int ysize = get_global_size(1); + if (x >= xsize - s || + y >= ysize - s) + { + return; + } + const double mul1 = 24.8235314874; out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 529ca141..35b4ed3c 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -41,7 +41,7 @@ { cl_mem x; cl_mem y; - cl_mem b; + cl_mem b_; }; union { diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 6be1e014..b12bdfc4 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void) char* source = nullptr; size_t src_size = 0; - ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, 
&src_size); + ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); @@ -1060,7 +1060,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&cls2); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in); - size_t globalWorkSize[2] = { xsize - cls, ysize - cls }; + size_t globalWorkSize[2] = { xsize, ysize}; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 4e6f3209..3235103e 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,8 +1,8 @@ #pragma once #include -#include "CL\cl.h" -#include "guetzli\processor.h" -#include "guetzli\butteraugli_comparator.h" +#include "CL/cl.h" +#include "guetzli/processor.h" +#include "guetzli/butteraugli_comparator.h" #include "ocl.h" #include "clguetzli.cl.h" diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 7be57d49..517b42c3 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -1,4 +1,10 @@ #include "ocl.h" +#include +#ifdef __linux__ +#include +#define _aligned_malloc memalign +#define _aligned_free free +#endif #include diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index b74a8a58..d72000b3 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -1,6 +1,6 @@ #pragma once -#include "CL\cl.h" +#include "CL/cl.h" #include "utils.h" #include "clguetzli.cl.h" diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp index 24520cd8..4fc8dbc2 100644 --- a/clguetzli/utils.cpp +++ b/clguetzli/utils.cpp @@ -22,11 +22,10 @@ #include #include -#include #include -#include -#include "CL\cl.h" -#include "CL\cl_ext.h" +#include +#include "CL/cl.h" +#include "CL/cl_ext.h" #include "utils.h" #include @@ -70,7 +69,11 @@ int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize) int 
errorCode = CL_SUCCESS; FILE* fp = NULL; +#ifdef __linux__ + fp = fopen(fileName, "rb"); +#else fopen_s(&fp, fileName, "rb"); +#endif if (fp == NULL) { LogError("Error: Couldn't find program source file '%s'.\n", fileName); diff --git a/guetzli.make b/guetzli.make index 7edeea3f..442d678b 100644 --- a/guetzli.make +++ b/guetzli.make @@ -16,15 +16,15 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += + LIBS += -lOpenCL LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags` + ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) define PREBUILDCMDS endef @@ -32,7 +32,7 @@ ifeq ($(config),release) endef define POSTBUILDCMDS endef -all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) +all: prebuild prelink $(TARGET) @: endif @@ -43,15 +43,15 @@ ifeq ($(config),debug) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Debug/guetzli DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += + LIBS += -lOpenCL LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags` + ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) define PREBUILDCMDS endef @@ -59,12 +59,18 @@ ifeq ($(config),debug) endef define POSTBUILDCMDS endef -all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) +all: prebuild prelink $(TARGET) @: endif OBJECTS := \ + $(OBJDIR)/clbutter_comparator.o \ + $(OBJDIR)/clguetzli.cl.o \ + $(OBJDIR)/clguetzli.o \ + $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/ocl.o \ + $(OBJDIR)/utils.o \ $(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ $(OBJDIR)/debug_print.o \ @@ -101,24 +107,13 @@ endif $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES) @echo Linking guetzli - $(SILENT) $(LINKCMD) - $(POSTBUILDCMDS) - -$(TARGETDIR): - @echo Creating $(TARGETDIR) ifeq (posix,$(SHELLTYPE)) $(SILENT) mkdir -p $(TARGETDIR) else $(SILENT) mkdir $(subst /,\\,$(TARGETDIR)) endif - -$(OBJDIR): - @echo Creating $(OBJDIR) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif + $(SILENT) $(LINKCMD) + $(POSTBUILDCMDS) clean: @echo Cleaning guetzli @@ -143,68 +138,221 @@ $(GCH): $(PCH) $(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<" endif +$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir 
$(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocl.o: clguetzli/ocl.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/utils.o: clguetzli/utils.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/dct_double.o: guetzli/dct_double.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF 
"$(@:%.o=%.d)" -c "$<" $(OBJDIR)/debug_print.o: guetzli/debug_print.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/fdct.o: guetzli/fdct.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/guetzli.o: guetzli/guetzli.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/idct.o: guetzli/idct.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + 
$(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/output_image.o: guetzli/output_image.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/processor.o: guetzli/processor.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) 
$(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quality.o: guetzli/quality.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quantize.o: guetzli/quantize.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/score.o: guetzli/score.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" -include $(OBJECTS:%.o=%.d) diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index 5418c0d2..5fd140ba 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -20,7 +20,7 @@ #include #include "butteraugli/butteraugli.h" -#include "clguetzli\clbutter_comparator.h" +#include "clguetzli/clbutter_comparator.h" #include "guetzli/comparator.h" #include "guetzli/jpeg_data.h" #include "guetzli/output_image.h" diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 5982bc1c..40544d90 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -28,7 +28,7 @@ #include "guetzli/processor.h" #include "guetzli/quality.h" #include "guetzli/stats.h" -#include "clguetzli\clguetzli.h" +#include "clguetzli/clguetzli.h" namespace { diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 43b513dc..e5439460 
100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -31,7 +31,7 @@ #include "guetzli/jpeg_data_writer.h" #include "guetzli/output_image.h" #include "guetzli/quantize.h" -#include "clguetzli\clguetzli.h" +#include "clguetzli/clguetzli.h" namespace guetzli { diff --git a/guetzli_static.make b/guetzli_static.make index d20fb77d..f271c46f 100644 --- a/guetzli_static.make +++ b/guetzli_static.make @@ -16,7 +16,7 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Release/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -24,7 +24,7 @@ ifeq ($(config),release) ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) LIBS += LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags` + ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags` LINKCMD = $(AR) -rcs "$@" $(OBJECTS) define PREBUILDCMDS endef @@ -32,7 +32,7 @@ ifeq ($(config),release) endef define POSTBUILDCMDS endef -all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) +all: prebuild prelink $(TARGET) @: endif @@ -43,7 +43,7 @@ ifeq ($(config),debug) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Debug/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -51,7 +51,7 @@ ifeq ($(config),debug) ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) LIBS += LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags` + ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags` LINKCMD = $(AR) -rcs "$@" $(OBJECTS) define PREBUILDCMDS endef @@ -59,12 +59,18 @@ ifeq ($(config),debug) endef define POSTBUILDCMDS endef -all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) +all: prebuild prelink $(TARGET) @: endif OBJECTS := \ + $(OBJDIR)/clbutter_comparator.o \ + $(OBJDIR)/clguetzli.cl.o \ + $(OBJDIR)/clguetzli.o \ + $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/ocl.o \ + $(OBJDIR)/utils.o \ $(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ $(OBJDIR)/debug_print.o \ @@ -100,24 +106,13 @@ endif $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES) @echo Linking guetzli_static - $(SILENT) $(LINKCMD) - $(POSTBUILDCMDS) - -$(TARGETDIR): - @echo Creating $(TARGETDIR) ifeq (posix,$(SHELLTYPE)) $(SILENT) mkdir -p $(TARGETDIR) else $(SILENT) mkdir $(subst /,\\,$(TARGETDIR)) endif - -$(OBJDIR): - @echo Creating $(OBJDIR) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif + $(SILENT) $(LINKCMD) + $(POSTBUILDCMDS) clean: @echo Cleaning guetzli_static @@ -142,65 +137,213 @@ $(GCH): $(PCH) $(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<" endif +$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) 
$(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocl.o: clguetzli/ocl.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/utils.o: clguetzli/utils.cpp + @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/dct_double.o: guetzli/dct_double.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/debug_print.o: 
guetzli/debug_print.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/fdct.o: guetzli/fdct.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/idct.o: guetzli/idct.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst 
/,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/output_image.o: guetzli/output_image.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/processor.o: guetzli/processor.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quality.o: guetzli/quality.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF 
"$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quantize.o: guetzli/quantize.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/score.o: guetzli/score.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc @echo $(notdir $<) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" -include $(OBJECTS:%.o=%.d) diff --git a/premake5.lua b/premake5.lua index 1a109d7a..18f5ecee 100644 --- a/premake5.lua +++ b/premake5.lua @@ -2,7 +2,8 @@ workspace "guetzli" configurations { "Release", "Debug" } language "C++" flags { "C++11" } - includedirs { ".", "third_party/butteraugli" } + includedirs { ".", "third_party/butteraugli", "clguetzli", "$(OPENCL_INC)" } + libdirs { "$(OPENCL_LIB)" } filter "action:vs*" platforms { "x86_64", "x86" } @@ -29,7 +30,9 @@ workspace "guetzli" "guetzli/*.cc", "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", - "third_party/butteraugli/butteraugli/butteraugli.h" + "third_party/butteraugli/butteraugli/butteraugli.h", + "clguetzli/*.cpp", + "clguetzli/*.h" } removefiles "guetzli/guetzli.cc" filter "action:gmake" @@ -41,6 +44,7 @@ workspace "guetzli" filter "action:gmake" linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } + links { "OpenCL" } filter "action:vs*" links { "shlwapi" } filter {} @@ -49,5 +53,7 @@ workspace "guetzli" "guetzli/*.cc", "guetzli/*.h", 
"third_party/butteraugli/butteraugli/butteraugli.cc", - "third_party/butteraugli/butteraugli/butteraugli.h" + "third_party/butteraugli/butteraugli/butteraugli.h", + "clguetzli/*.cpp", + "clguetzli/*.h" } diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 69511051..1b2c16f7 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -40,9 +40,9 @@ #include #include -#include "clguetzli\clbutter_comparator.h" -#include "clguetzli\clguetzli.h" -#include "clguetzli\clguetzli_test.h" +#include "clguetzli/clbutter_comparator.h" +#include "clguetzli/clguetzli.h" +#include "clguetzli/clguetzli_test.h" // Restricted pointers speed up Convolution(); MSVC uses a different keyword. #ifdef _MSC_VER From ec42b7b6f02baecc185a161070dbc33fd72750b7 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 24 May 2017 14:39:50 +0800 Subject: [PATCH 107/189] const control --- clguetzli/clguetzli.cpp | 66 +++++++++++++----------------------- clguetzli/clguetzli.h | 39 ++++++++++----------- clguetzli/clguetzli_test.cpp | 32 ++++++++--------- clguetzli/clguetzli_test.h | 33 ++++++++---------- clguetzli/ocl.cpp | 12 ++----- clguetzli/ocl.h | 4 +-- 6 files changed, 80 insertions(+), 106 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index b12bdfc4..f6618d3c 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -251,10 +251,7 @@ void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size); - - clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); if (xstep > 1) { @@ -272,8 +269,8 @@ void clBlurEx2(cl_mem image/*out, opt*/, 
size_t xsize, size_t ysize, clReleaseMemObject(mem_expn); } -void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, - double sigma, double border_ratio, +void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, cl_mem result/*out, opt*/) { clBlurEx2(image, xsize, ysize, sigma, border_ratio, result); @@ -296,10 +293,7 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size); - - clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); if (xstep > 1) { @@ -320,7 +314,7 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, clReleaseMemObject(mem_expn); } -void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize) +void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize) { static const double kSigma = 1.1; @@ -359,18 +353,13 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysi ocl.releaseMemChannels(rgb_blurred); } -void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b) +void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b) { cl_int channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels rgb = ocl.allocMemChannels(channel_size); - - clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + ocl_channels rgb 
= ocl.allocMemChannels(channel_size, r, g, b); clOpsinDynamicsImageEx(rgb, xsize, ysize); @@ -392,9 +381,9 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* ocl.releaseMemChannels(rgb); } -void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, - ocl_channels xyb1/*in,out*/, - size_t xsize, size_t ysize) +void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/, + ocl_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -442,7 +431,8 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, ocl.releaseMemChannels(c1); } -void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/) +void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -495,8 +485,8 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size ocl.releaseMemChannels(rgb2_blured); } -void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, size_t step, +void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step, cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/) { cl_int err = CL_SUCCESS; @@ -535,8 +525,8 @@ void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, } } -void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, size_t step, +void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step, cl_mem block_diff_ac/*out*/) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -851,8 +841,8 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz } -void 
clMaskEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, +void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/) { clDiffPrecomputeEx(rgb, rgb2, xsize, ysize, mask); @@ -927,8 +917,8 @@ void clMask(const float* r, const float* g, const float* b, } void clCombineChannelsEx( - ocl_channels mask, - ocl_channels mask_dc, + const ocl_channels &mask, + const ocl_channels &mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, @@ -1098,7 +1088,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, } void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, - float* r2, float* g2, float* b2, + const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, float* result) @@ -1112,18 +1102,10 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb0 = ocl.allocMemChannels(channel_size); - ocl_channels xyb1 = ocl.allocMemChannels(channel_size); - - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - cl_mem mem_result = ocl.allocMem(channel_size); + cl_mem mem_result = 
ocl.allocMem(channel_size); const float pattern = 0; clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize, 0, NULL, NULL); clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, channel_step_size, result, 0, NULL, NULL); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 3235103e..4d4a2fcf 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -9,12 +9,12 @@ extern bool g_useOpenCL; extern bool g_checkOpenCL; -void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b); +void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b); void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, - float* r2, float* g2, float* b2, - size_t xsize, size_t ysize, - size_t step, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step, float* result); void clComputeBlockZeroingOrder( @@ -31,35 +31,36 @@ void clComputeBlockZeroingOrder( void clMask(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, - size_t xsize, size_t ysize, + const size_t xsize, const size_t ysize, float* mask_r, float* mask_g, float* mask_b, float* maskdc_r, float* maskdc_g, float* maskdc_b); -void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/, - ocl_channels xyb1/*in,out*/, - size_t xsize, size_t ysize); +void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/, + ocl_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize); -void clMaskEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, +void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/); -void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/); +void 
clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/); -void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, size_t step, +void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step, cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/); -void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2, - size_t xsize, size_t ysize, size_t step, +void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step, cl_mem block_diff_ac/*in,out*/); -void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr); +void clBlurEx(cl_mem image, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, cl_mem result = nullptr); -void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize); +void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize); void clCombineChannelsEx( - ocl_channels mask, - ocl_channels mask_dc, + const ocl_channels &mask, + const ocl_channels &mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 1797fe66..89d86bb4 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -267,8 +267,8 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const size_t xsize, size_t ysize, size_t res_xsize, size_t res_ysize, size_t step, - float *init_result, - float *result) + const float *init_result, + const float *result) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -311,8 +311,8 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const // ian todo void 
tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, - float *diffmap, size_t org_len, - float *diffmap_cmp) + const float *diffmap, size_t org_len, + const float *diffmap_cmp) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -329,7 +329,7 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize, } // chrisk todo -void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result) +void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result) { size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; @@ -423,10 +423,10 @@ void tclUpsample(float* image, size_t xsize, size_t ysize, // ian todo void tclDiffPrecompute( - const std::vector > &xyb0, - const std::vector > &xyb1, - size_t xsize, size_t ysize, - std::vector > *mask_cmp) + const const std::vector > &xyb0, + const const std::vector > &xyb1, + size_t xsize, size_t ysize, + const std::vector > *mask_cmp) { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -463,7 +463,7 @@ void tclDiffPrecompute( } // ian todo -void tclAverage5x5(int xsize, int ysize, std::vector &diffs_org, std::vector &diffs_cmp) +void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, const std::vector &diffs_cmp) { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -479,9 +479,9 @@ void tclAverage5x5(int xsize, int ysize, std::vector &diffs_org, std::vec } // chrisk todo -void tclMinSquareVal(float *img, size_t square_size, size_t offset, +void tclMinSquareVal(const float *img, size_t square_size, size_t offset, size_t xsize, size_t ysize, - float *values) + const float *result) { size_t img_size = xsize * ysize * sizeof(float); cl_int err = 0; @@ -496,7 +496,7 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset, cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, img_size, 0, NULL, NULL, &err); err = 
clFinish(ocl.commandQueue); - FLOAT_COMPARE(values, r_r, xsize * ysize); + FLOAT_COMPARE(result, r_r, xsize * ysize); clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); @@ -504,7 +504,7 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset, clReleaseMemObject(r); } -void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length) +void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length) { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -522,8 +522,8 @@ void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t le } // strong todo -void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize, - float* result_r, float* result_g, float* result_b) +void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b) { size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 226d3d0a..a84b94ac 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -9,7 +9,7 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* result_r, const float* result_g, const float* result_b, const float* result_r2, const float* result_g2, const float* result_b2); -void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result); +void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result); void tclEdgeDetectorMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, @@ -40,13 +40,13 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const size_t xsize, size_t ysize, size_t res_xsize, size_t res_ysize, size_t 
step, - float *init_result, - float *result); + const float *init_result, + const float *result); void tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, - float *diffmap, size_t org_len, - float *diffmap_cmp); + const float *diffmap, size_t org_len, + const float *diffmap_cmp); void tclConvolution(size_t xsize, size_t ysize, size_t xstep, @@ -56,26 +56,23 @@ void tclConvolution(size_t xsize, size_t ysize, float border_ratio, float* result); -void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma, - double border_ratio); - void tclDiffPrecompute( const std::vector > &xyb0, const std::vector > &xyb1, - size_t xsize, size_t ysize, - std::vector > *mask_cmp); + size_t xsize, size_t ysize, + const std::vector > *mask_cmp); -void tclAverage5x5(int xsize, int ysize, std::vector &diffs_org, std::vector &diffs_cmp); +void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, const std::vector &diffs_cmp); -void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length); +void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length); -void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize, - float* result_r, float* result_g, float* result_b); +void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b); -void tclMinSquareVal(float *img, size_t square_size, size_t offset, +void tclMinSquareVal(const float *img, size_t square_size, size_t offset, size_t xsize, size_t ysize, - float *values); + const float *result); -void tclUpsample(float* image, size_t xsize, size_t ysize, +void tclUpsample(const float* image, size_t xsize, size_t ysize, size_t xstep, size_t ystep, - float* result); + const float* result); diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 517b42c3..594adeec 100644 --- a/clguetzli/ocl.cpp +++ 
b/clguetzli/ocl.cpp @@ -197,7 +197,6 @@ void* ocl_args_d_t::allocC(size_t s) cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) { - cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; cl_int err = 0; cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); if (CL_SUCCESS != err) @@ -221,19 +220,14 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) return mem; } -ocl_channels ocl_args_d_t::allocMemChannels(size_t s) +ocl_channels ocl_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2) { - cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; - cl_int err = 0; + const void *c[3] = { c0, c1, c2 }; ocl_channels img; for (int i = 0; i < 3; i++) { - img.ch[i] = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); - if (CL_SUCCESS != err) - { - LogError("Error: allocMemChannel(%d) for buffer returned %s.\n", i, TranslateOpenCLError(err)); - } + img.ch[i] = allocMem(s, c[i]); } return img; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index d72000b3..ed2f1ee2 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -60,7 +60,7 @@ enum KernelName { KERNEL_UPSAMPLESQUAREROOT, KERNEL_ADDBORDER, KERNEL_REMOVEBORDER, - KERNEL_AVERAGE5X5, + KERNEL_AVERAGE5X5, KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, KERNEL_EDGEDETECTORLOWFREQ, @@ -78,7 +78,7 @@ struct ocl_args_d_t void* allocC(size_t s); cl_mem allocMem(size_t s, const void *init = NULL); - ocl_channels allocMemChannels(size_t s); + ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); void releaseMemChannels(ocl_channels rgb); // Regular OpenCL objects: From a469c023ab864003a52e821a1f91c74422a78589 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 24 May 2017 15:17:14 +0800 Subject: [PATCH 108/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 27 ++----- 
clguetzli/clguetzli_test.cpp | 145 ++++++++--------------------------- 2 files changed, 38 insertions(+), 134 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index f6618d3c..ee85cba6 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -801,14 +801,8 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz } size_t channel_size = 512 * 3 * sizeof(double); - ocl_channels xyb = ocl.allocMemChannels(channel_size); - ocl_channels xyb_dc = ocl.allocMemChannels(channel_size); - clEnqueueWriteBuffer(ocl.commandQueue, xyb.x, CL_FALSE, 0, channel_size, lut_x, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb.y, CL_FALSE, 0, channel_size, lut_y, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, lut_b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.x, CL_FALSE, 0, channel_size, lut_dcx, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.y, CL_FALSE, 0, channel_size, lut_dcy, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.b, CL_FALSE, 0, channel_size, lut_dcb, 0, NULL, NULL); + ocl_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); cl_kernel kernel = ocl.kernel[KERNEL_DOMASK]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r); @@ -880,19 +874,11 @@ void clMask(const float* r, const float* g, const float* b, cl_int channel_size = xsize * ysize * sizeof(float); - ocl_channels rgb = ocl.allocMemChannels(channel_size); - ocl_channels rgb2 = ocl.allocMemChannels(channel_size); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 
0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc); cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); @@ -1105,10 +1091,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - cl_mem mem_result = ocl.allocMem(channel_size); - const float pattern = 0; - clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, channel_step_size, result, 0, NULL, NULL); + cl_mem mem_result = ocl.allocMem(channel_size, result); cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 89d86bb4..58a23d35 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -34,16 +34,8 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb0 = ocl.allocMemChannels(channel_size); - ocl_channels xyb1 = ocl.allocMemChannels(channel_size); - - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, 
channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); @@ -87,18 +79,10 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb0 = ocl.allocMemChannels(channel_size); - ocl_channels xyb1 = ocl.allocMemChannels(channel_size); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); cl_mem edge = ocl.allocMem(edgemap_size); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge); cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err); @@ -127,20 +111,12 @@ void 
tclBlockDiffMap(const float* r, const float* g, const float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb0 = ocl.allocMemChannels(channel_size); - ocl_channels xyb1 = ocl.allocMemChannels(channel_size); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); cl_mem block_diff_dc = ocl.allocMem(reschannel_size); cl_mem block_diff_ac = ocl.allocMem(reschannel_size); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); @@ -175,20 +151,10 @@ void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb0 = ocl.allocMemChannels(channel_size); - ocl_channels xyb1 = ocl.allocMemChannels(channel_size); - - cl_mem block_diff_ac = ocl.allocMem(reschannel_size); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, 
CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, block_diff_ac, CL_FALSE, 0, reschannel_size, orign_ac, 0, NULL, NULL); - - err = clFinish(ocl.commandQueue); + cl_mem block_diff_ac = ocl.allocMem(reschannel_size, orign_ac); clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); @@ -215,20 +181,12 @@ void tclMask(const float* r, const float* g, const float* b, size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels rgb = ocl.allocMemChannels(channel_size); - ocl_channels rgb2 = ocl.allocMemChannels(channel_size); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/); cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); @@ -274,24 +232,13 @@ 
void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const ocl_args_d_t &ocl = getOcl(); size_t channel_size = xsize * ysize * sizeof(float); - ocl_channels mask = ocl.allocMemChannels(channel_size); - ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); - cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); - cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float)); - cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float)); - - clEnqueueWriteBuffer(ocl.commandQueue, mask.x, CL_FALSE, 0, channel_size, mask_xyb_x, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mask.y, CL_FALSE, 0, channel_size, mask_xyb_y, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mask.b, CL_FALSE, 0, channel_size, mask_xyb_b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.x, CL_FALSE, 0, channel_size, mask_xyb_dc_x, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.y, CL_FALSE, 0, channel_size, mask_xyb_dc_y, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.b, CL_FALSE, 0, channel_size, mask_xyb_dc_b, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_dc, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_dc, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_ac, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_ac, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_edge_detector_map, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), edge_detector_map, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_result, CL_FALSE, 0, res_xsize * res_ysize * sizeof(float), init_result, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + size_t res_channel_size = res_xsize * res_ysize * sizeof(float); + ocl_channels mask = ocl.allocMemChannels(channel_size, 
mask_xyb_x, mask_xyb_y, mask_xyb_b); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size, mask_xyb_dc_x, mask_xyb_dc_y, mask_xyb_dc_b); + cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_channel_size, block_diff_dc); + cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_channel_size, block_diff_ac); + cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float), edge_detector_map); + cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float), init_result); clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, res_xsize, step, cl_result); @@ -299,7 +246,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL); ocl.releaseMemChannels(mask); ocl.releaseMemChannels(mask_dc); clReleaseMemObject(cl_block_diff_dc); @@ -322,9 +269,9 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize, clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL); clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step); cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); + err = clFinish(ocl.commandQueue); FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize); - clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL); clReleaseMemObject(mem_diffmap); } @@ -334,10 +281,7 @@ void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, dou size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem r = 
ocl.allocMem(channel_size); - - clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, channel_size, channel, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_mem r = ocl.allocMem(channel_size, channel); clBlurEx(r, xsize, ysize, sigma, border_ratio, r); @@ -369,12 +313,8 @@ void tclConvolution(size_t xsize, size_t ysize, ocl_args_d_t &ocl = getOcl(); ocl.allocA(result_size); cl_mem r = ocl.srcA; - cl_mem i = ocl.allocMem(inp_size); - cl_mem m = ocl.allocMem(multipliers_size); - - clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, m, CL_FALSE, 0, multipliers_size, multipliers, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_mem i = ocl.allocMem(inp_size, inp); + cl_mem m = ocl.allocMem(multipliers_size, multipliers); clConvolutionEx(i, xsize, ysize, m, len, xstep, offset, border_ratio, r); @@ -401,13 +341,10 @@ void tclUpsample(float* image, size_t xsize, size_t ysize, size_t result_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem img = ocl.allocMem(img_size); + cl_mem img = ocl.allocMem(img_size, image); ocl.allocA(result_size); cl_mem r = ocl.srcA; - clEnqueueWriteBuffer(ocl.commandQueue, img, CL_FALSE, 0, img_size, image, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - clUpsampleEx(img, xsize, ysize, xstep, ystep, r); cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); @@ -431,18 +368,10 @@ void tclDiffPrecompute( cl_int err = 0; ocl_args_d_t &ocl = getOcl(); size_t channel_size = xsize * ysize * sizeof(float); - ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size); - ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size); + ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size, xyb0[0].data(), xyb0[1].data(), xyb0[2].data()); + ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size, xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); 
ocl_channels cl_mask = ocl.allocMemChannels(channel_size); - clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.x, CL_FALSE, 0, channel_size, xyb0[0].data(), 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.y, CL_FALSE, 0, channel_size, xyb0[1].data(), 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.b, CL_FALSE, 0, channel_size, xyb0[2].data(), 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.x, CL_FALSE, 0, channel_size, xyb1[0].data(), 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.y, CL_FALSE, 0, channel_size, xyb1[1].data(), 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.b, CL_FALSE, 0, channel_size, xyb1[2].data(), 0, NULL, NULL); - - clDiffPrecomputeEx(cl_xyb0, cl_xyb1, xsize, ysize, cl_mask); cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); @@ -467,8 +396,8 @@ void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, co { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float)); - clEnqueueWriteBuffer(ocl.commandQueue, mem_diff, CL_FALSE, 0, xsize * ysize * sizeof(float), diffs_org.data(), 0, NULL, NULL); + cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float), diffs_org.data()); + clAverage5x5Ex(mem_diff, xsize, ysize); cl_float *r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diff, true, CL_MAP_READ, 0, xsize * ysize * sizeof(float), 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); @@ -486,10 +415,7 @@ void tclMinSquareVal(const float *img, size_t square_size, size_t offset, size_t img_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem r = ocl.allocMem(img_size); - - clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, img_size, img, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_mem r = ocl.allocMem(img_size, img); clMinSquareValEx(r, xsize, ysize, 
square_size, offset); @@ -508,8 +434,8 @@ void tclScaleImage(double scale, const float *result_org, const float *result_cm { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - cl_mem mem_result_org = ocl.allocMem(length * sizeof(float)); - clEnqueueWriteBuffer(ocl.commandQueue, mem_result_org, CL_FALSE, 0, length * sizeof(float), result_org, 0, NULL, NULL); + cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org); + clScaleImageEx(mem_result_org, length, scale); cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err); @@ -528,12 +454,7 @@ void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_ size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); - ocl_channels rgb = ocl.allocMemChannels(channel_size); - - clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL); - clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); clOpsinDynamicsImageEx(rgb, xsize, ysize); From e68cea4493b60ce77d7933d3275cc6a595a3559f Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 24 May 2017 15:40:13 +0800 Subject: [PATCH 109/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?= =?UTF-8?q?=E9=A1=BA=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 15 +- clguetzli/clguetzli.cpp | 450 +++++++++++++++--------------- clguetzli/clguetzli.h | 25 +- guetzli/processor.cc | 20 +- 4 files changed, 260 insertions(+), 250 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index c61c8578..c6b4ca0b 100644 --- 
a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -17,8 +17,8 @@ namespace butteraugli if (g_useOpenCL && xsize_ > 100 && ysize_ > 100) { result.resize(xsize_ * ysize_); - clDiffmapOpsinDynamicsImage(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data()); + clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); } else { @@ -163,11 +163,12 @@ namespace butteraugli (*mask)[i].resize(xsize * ysize); (*mask_dc)[i].resize(xsize * ysize); } - clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + clMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(), xsize, ysize, - (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), - (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data() + ); return; } @@ -279,7 +280,7 @@ namespace butteraugli float * g = rgb[1].data(); float * b = rgb[2].data(); - clOpsinDynamicsImage(xsize, ysize, r, g, b); + clOpsinDynamicsImage(r, g, b, xsize, ysize); } else { diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index ee85cba6..fa3507a4 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -68,6 +68,233 @@ ocl_args_d_t& getOcl(void) return ocl; } +void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) +{ + cl_int channel_size = xsize * ysize * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + + clOpsinDynamicsImageEx(rgb, xsize, ysize); + + cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, 
channel_size, 0, NULL, NULL, &err); + cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + + err = clFinish(ocl.commandQueue); + + memcpy(r, result_r, channel_size); + memcpy(g, result_g, channel_size); + memcpy(b, result_b, channel_size); + + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL); + clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); +} + +void clDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + size_t step) +{ + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_int channel_size = xsize * ysize * sizeof(float); + cl_int channel_step_size = res_xsize * res_ysize * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + cl_mem mem_result = ocl.allocMem(channel_size, result); + + cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); + + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map); + clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); + clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); + { + ocl_channels mask = ocl.allocMemChannels(channel_size); + 
ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc); + clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } + + clCalculateDiffmapEx(mem_result, xsize, ysize, step); + + cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + memcpy(result, result_r, channel_size); + + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL); + clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb1); + ocl.releaseMemChannels(xyb0); + + clReleaseMemObject(edge_detector_map); + clReleaseMemObject(block_diff_dc); + clReleaseMemObject(block_diff_ac); + + clReleaseMemObject(mem_result); +} + +void clComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) +{ + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + + cl_mem mem_orig_coeff[3]; + cl_mem mem_mayout_coeff[3]; + cl_mem mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + + block_count = mayout_channel[c].block_width * 
mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + + mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + } + cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size); + cl_float clBlockErrorLimit = BlockErrorLimit; + cl_int clWidth = image_width; + cl_int clHeight = image_height; + cl_int clFactor = factor; + cl_int clMask = comp_mask; + + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_orig_image); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_mask_scale); + clSetKernelArg(kernel, 5, sizeof(cl_int), &clWidth); + clSetKernelArg(kernel, 6, sizeof(cl_int), &clHeight); + clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mem_mayout_coeff[0]); + clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mem_mayout_coeff[1]); + clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&mem_mayout_coeff[2]); + clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&mem_mayout_pixel[0]); + clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&mem_mayout_pixel[1]); + clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&mem_mayout_pixel[2]); + clSetKernelArg(kernel, 13, sizeof(channel_info), &mayout_channel[0]); + clSetKernelArg(kernel, 14, sizeof(channel_info), &mayout_channel[1]); + clSetKernelArg(kernel, 15, 
sizeof(channel_info), &mayout_channel[2]); + clSetKernelArg(kernel, 16, sizeof(cl_int), &clFactor); + clSetKernelArg(kernel, 17, sizeof(cl_int), &clMask); + clSetKernelArg(kernel, 18, sizeof(cl_float), &clBlockErrorLimit); + clSetKernelArg(kernel, 19, sizeof(cl_mem), &mem_output_order_batch); + + size_t globalWorkSize[2] = { blockf_width, blockf_height }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); + } + + CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, output_order_batch_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + memcpy(output_order_batch, result, output_order_batch_size); + + clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL); + clFinish(ocl.commandQueue); + + for (int c = 0; c < 3; c++) + { + clReleaseMemObject(mem_orig_coeff[c]); + clReleaseMemObject(mem_mayout_coeff[c]); + clReleaseMemObject(mem_mayout_pixel[c]); + + } + + clReleaseMemObject(mem_orig_image); + clReleaseMemObject(mem_mask_scale); + clReleaseMemObject(mem_output_order_batch); +} + +void clMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + size_t xsize, size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + cl_int channel_size = xsize * ysize * sizeof(float); + + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + ocl_channels mask = 
ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + + clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + memcpy(mask_r, r0_r, channel_size); + memcpy(mask_g, r0_g, channel_size); + memcpy(mask_b, r0_b, channel_size); + memcpy(maskdc_r, r1_r, channel_size); + memcpy(maskdc_g, r1_g, channel_size); + memcpy(maskdc_b, r1_b, channel_size); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio, @@ -353,33 +580,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, con ocl.releaseMemChannels(rgb_blurred); } -void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b) -{ - cl_int channel_size = xsize * ysize * sizeof(float); - - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); - - clOpsinDynamicsImageEx(rgb, xsize, ysize); - - cl_float *result_r = 
(cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - - err = clFinish(ocl.commandQueue); - - memcpy(r, result_r, channel_size); - memcpy(g, result_g, channel_size); - memcpy(b, result_b, channel_size); - - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL); - clFinish(ocl.commandQueue); - - ocl.releaseMemChannels(rgb); -} void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/, ocl_channels &xyb1/*in,out*/, @@ -863,45 +1063,6 @@ void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2, } } -void clMask(const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2, - size_t xsize, size_t ysize, - float* mask_r, float* mask_g, float* mask_b, - float* maskdc_r, float* maskdc_g, float* maskdc_b) -{ - cl_int err = CL_SUCCESS; - ocl_args_d_t &ocl = getOcl(); - - cl_int channel_size = xsize * ysize * sizeof(float); - - ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); - ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); - ocl_channels mask = ocl.allocMemChannels(channel_size); - ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - - clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc); - - cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r0_b = (cl_float 
*)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - - memcpy(mask_r, r0_r, channel_size); - memcpy(mask_g, r0_g, channel_size); - memcpy(mask_b, r0_b, channel_size); - memcpy(maskdc_r, r1_r, channel_size); - memcpy(maskdc_g, r1_g, channel_size); - memcpy(maskdc_b, r1_b, channel_size); - - ocl.releaseMemChannels(rgb); - ocl.releaseMemChannels(rgb2); - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); -} - void clCombineChannelsEx( const ocl_channels &mask, const ocl_channels &mask_dc, @@ -1073,160 +1234,3 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, clReleaseMemObject(blurred); } -void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2, - size_t xsize, size_t ysize, - size_t step, - float* result) -{ - - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - - cl_int channel_size = xsize * ysize * sizeof(float); - cl_int channel_step_size = res_xsize * res_ysize * sizeof(float); - - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); - ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - - cl_mem mem_result = ocl.allocMem(channel_size, result); - - cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); - cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); - cl_mem block_diff_ac = ocl.allocMem(3 * 
channel_step_size); - - clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - - clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map); - clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); - clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); - { - ocl_channels mask = ocl.allocMemChannels(channel_size); - ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc); - clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result); - - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); - } - - clCalculateDiffmapEx(mem_result, xsize, ysize, step); - - cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - memcpy(result, result_r, channel_size); - - clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL); - clFinish(ocl.commandQueue); - - ocl.releaseMemChannels(xyb1); - ocl.releaseMemChannels(xyb0); - - clReleaseMemObject(edge_detector_map); - clReleaseMemObject(block_diff_dc); - clReleaseMemObject(block_diff_ac); - - clReleaseMemObject(mem_result); -} - -void clComputeBlockZeroingOrder( - const channel_info orig_channel[3], - const float *orig_image_batch, - const float *mask_scale, - const int image_width, - const int image_height, - const channel_info mayout_channel[3], - const int factor, - const int comp_mask, - const float BlockErrorLimit, - guetzli::CoeffData *output_order_batch) -{ - const int block8_width = (image_width + 8 - 1) / 8; - const int block8_height = (image_height + 8 - 1) / 8; - const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); - const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); - - using namespace guetzli; - - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - - 
cl_mem mem_orig_coeff[3]; - cl_mem mem_mayout_coeff[3]; - cl_mem mem_mayout_pixel[3]; - for (int c = 0; c < 3; c++) - { - int block_count = orig_channel[c].block_width * orig_channel[c].block_height; - mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); - - block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; - mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); - - mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); - } - cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); - cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); - - int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; - cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size); - cl_float clBlockErrorLimit = BlockErrorLimit; - cl_int clWidth = image_width; - cl_int clHeight = image_height; - cl_int clFactor = factor; - cl_int clMask = comp_mask; - - cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_orig_image); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_mask_scale); - clSetKernelArg(kernel, 5, sizeof(cl_int), &clWidth); - clSetKernelArg(kernel, 6, sizeof(cl_int), &clHeight); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mem_mayout_coeff[0]); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mem_mayout_coeff[1]); - clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&mem_mayout_coeff[2]); - clSetKernelArg(kernel, 10, sizeof(cl_mem), 
(void*)&mem_mayout_pixel[0]); - clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&mem_mayout_pixel[1]); - clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&mem_mayout_pixel[2]); - clSetKernelArg(kernel, 13, sizeof(channel_info), &mayout_channel[0]); - clSetKernelArg(kernel, 14, sizeof(channel_info), &mayout_channel[1]); - clSetKernelArg(kernel, 15, sizeof(channel_info), &mayout_channel[2]); - clSetKernelArg(kernel, 16, sizeof(cl_int), &clFactor); - clSetKernelArg(kernel, 17, sizeof(cl_int), &clMask); - clSetKernelArg(kernel, 18, sizeof(cl_float), &clBlockErrorLimit); - clSetKernelArg(kernel, 19, sizeof(cl_mem), &mem_output_order_batch); - - size_t globalWorkSize[2] = { blockf_width, blockf_height}; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); - } - - CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, output_order_batch_size, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - memcpy(output_order_batch, result, output_order_batch_size); - - clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL); - clFinish(ocl.commandQueue); - - for (int c = 0; c < 3; c++) - { - clReleaseMemObject(mem_orig_coeff[c]); - clReleaseMemObject(mem_mayout_coeff[c]); - clReleaseMemObject(mem_mayout_pixel[c]); - - } - - clReleaseMemObject(mem_orig_image); - clReleaseMemObject(mem_mask_scale); - clReleaseMemObject(mem_output_order_batch); -} \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 4d4a2fcf..a8be9a42 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ 
-9,15 +9,19 @@ extern bool g_useOpenCL; extern bool g_checkOpenCL; -void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b); +void clOpsinDynamicsImage( + float *r, float *g, float *b, + const size_t xsize, const size_t ysize); -void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b, +void clDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, const size_t xsize, const size_t ysize, - const size_t step, - float* result); + const size_t step); void clComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, const channel_info orig_channel[3], const float *orig_image_batch, const float *mask_scale, @@ -26,14 +30,15 @@ void clComputeBlockZeroingOrder( const channel_info mayout_channel[3], const int factor, const int comp_mask, - const float BlockErrorLimit, - guetzli::CoeffData *output_order_batch); + const float BlockErrorLimit + ); -void clMask(const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2, +void clMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, const size_t xsize, const size_t ysize, - float* mask_r, float* mask_g, float* mask_b, - float* maskdc_r, float* maskdc_g, float* maskdc_b); + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2); void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/, ocl_channels &xyb1/*in,out*/, diff --git a/guetzli/processor.cc b/guetzli/processor.cc index e5439460..2c9811a9 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -588,16 +588,16 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co output_order_gpu.resize(num_blocks * kBlockSize); output_order = output_order_gpu.data(); - clComputeBlockZeroingOrder(orig_channel, - comp->imgOpsinDynamicsBlockList.data(), - 
comp->imgMaskXyzScaleBlockList.data(), - width, - height, - mayout_channel, - factor_x, - comp_mask, - comp->BlockErrorLimit(), - output_order); + clComputeBlockZeroingOrder(output_order, + orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit()); } if (!g_useOpenCL || g_checkOpenCL) From 7c9c34ad258b119e2b92e0f9aa79b4aae0ae5ab4 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 24 May 2017 19:18:15 +0800 Subject: [PATCH 110/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?= =?UTF-8?q?=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 225 +++++++++++++++++++---------------- clguetzli/clguetzli.h | 117 ++++++++++++------ clguetzli/clguetzli_test.cpp | 20 ++-- 3 files changed, 214 insertions(+), 148 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index fa3507a4..67a7f2a0 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -123,14 +123,14 @@ void clDiffmapOpsinDynamicsImage( clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map); - clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); - clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); + clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); { ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc); - clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result); + clMaskEx(mask, mask_dc, xyb0, 
xyb1, xsize, ysize); + clCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); ocl.releaseMemChannels(mask); ocl.releaseMemChannels(mask_dc); @@ -272,7 +272,7 @@ void clMask( ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc); + clMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); @@ -295,10 +295,11 @@ void clMask( ocl.releaseMemChannels(mask_dc); } -void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, - cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio, - cl_mem result/*out*/) +void clConvolutionEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -334,10 +335,11 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, } } -void clConvolutionX(cl_mem inp, size_t xsize, size_t ysize, - cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio, - cl_mem result/*out*/) +void clConvolutionX( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -369,10 +371,12 @@ void clConvolutionX(cl_mem inp, size_t xsize, size_t ysize, } } -void clConvolutionY(cl_mem inp, size_t xsize, size_t ysize, - cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio, - cl_mem result/*out*/) +void clConvolutionY( + cl_mem result/*out*/, + const 
cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio + ) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -404,9 +408,10 @@ void clConvolutionY(cl_mem inp, size_t xsize, size_t ysize, } } -void clUpsampleEx2(cl_mem image, size_t xsize, size_t ysize, - size_t xstep, size_t ystep, - cl_mem result/*out*/) +void clUpsampleEx2( + cl_mem result/*out*/, + const cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -432,9 +437,11 @@ void clUpsampleEx2(cl_mem image, size_t xsize, size_t ysize, } } -void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, - size_t xstep, size_t ystep, - cl_mem result/*out*/) +void clUpsampleEx( + cl_mem result/*out*/, + const cl_mem image, + const size_t xsize, const size_t ysize, + const size_t xstep, const size_t ystep) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -460,49 +467,55 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, } } -void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, - double sigma, double border_ratio, - cl_mem result/*out, opt*/) +void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cl_mem result/*out, opt*/) { - double m = 2.25; // Accuracy increases when m is increased. - const double scaler = -1.0 / (2 * sigma * sigma); - // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} - const int diff = std::max(1, m * fabs(sigma)); - const int expn_size = 2 * diff + 1; - std::vector expn(expn_size); - for (int i = -diff; i <= diff; ++i) { - expn[i + diff] = static_cast(exp(scaler * i * i)); - } + clBlurEx2(image, xsize, ysize, sigma, border_ratio, result); + + return; + double m = 2.25; // Accuracy increases when m is increased. 
+ const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } - const int xstep = std::max(1, int(sigma / 3)); + const int xstep = std::max(1, int(sigma / 3)); + const int ystep = xstep; + int dxsize = (xsize + xstep - 1) / xstep; + int dysize = (ysize + ystep - 1) / ystep; - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); - if (xstep > 1) - { - ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); - clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image); - clUpsampleEx2(result ? result : image, xsize, ysize, xstep, xstep, result ? result : image); - } - else - { - ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); - clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image); - } + if (xstep > 1) + { + ocl.allocA(sizeof(cl_float) * dxsize * ysize); + ocl.allocB(sizeof(cl_float) * dxsize * dysize); - clReleaseMemObject(mem_expn); + clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionEx(ocl.srcB, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio); + clUpsampleEx(result ? 
result : image, ocl.srcB, xsize, ysize, xstep, ystep); + } + else + { + ocl.allocA(sizeof(cl_float) * xsize * ysize); + clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionEx(result ? result : image, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio); + } + + clReleaseMemObject(mem_expn); } -void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, - const double sigma, const double border_ratio, - cl_mem result/*out, opt*/) -{ - clBlurEx2(image, xsize, ysize, sigma, border_ratio, result); - return; +void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, + double sigma, double border_ratio, + cl_mem result/*out, opt*/) +{ double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} @@ -514,9 +527,6 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, } const int xstep = std::max(1, int(sigma / 3)); - const int ystep = xstep; - int dxsize = (xsize + xstep - 1) / xstep; - int dysize = (ysize + ystep - 1) / ystep; cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -524,24 +534,22 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, if (xstep > 1) { - ocl.allocA(sizeof(cl_float) * dxsize * ysize); - ocl.allocB(sizeof(cl_float) * dxsize * dysize); - - clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); - clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB); - clUpsampleEx(ocl.srcB, xsize, ysize, xstep, ystep, result ? result : image); + ocl.allocA(sizeof(cl_float) * xsize * ysize); + clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionY(result ? 
result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clUpsampleEx2(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); } else { ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA); - clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, result ? result : image); + clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); } clReleaseMemObject(mem_expn); } -void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize) +void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize) { static const double kSigma = 1.1; @@ -581,9 +589,10 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, con } -void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/, - ocl_channels &xyb1/*in,out*/, - const size_t xsize, const size_t ysize) +void clMaskHighIntensityChangeEx( + ocl_channels &xyb0/*in,out*/, + ocl_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -631,8 +640,10 @@ void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/, ocl.releaseMemChannels(c1); } -void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/) +void clEdgeDetectorMapEx( + cl_mem result/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -685,9 +696,11 @@ void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, ocl.releaseMemChannels(rgb2_blured); } -void 
clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step, - cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/) +void clBlockDiffMapEx( + cl_mem block_diff_dc/*out*/, + cl_mem block_diff_ac/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -725,9 +738,10 @@ void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, } } -void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step, - cl_mem block_diff_ac/*out*/) +void clEdgeDetectorLowFreqEx( + cl_mem block_diff_ac/*in,out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -779,7 +793,10 @@ void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2, ocl.releaseMemChannels(rgb2_blured); } -void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/) +void clDiffPrecomputeEx( + ocl_channels &mask/*out*/, + const ocl_channels &xyb0, const ocl_channels &xyb1, + const size_t xsize, const size_t ysize) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -832,7 +849,7 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) } } -void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) +void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize) { if (xsize < 4 || ysize < 4) { // TODO: Make this work for small dimensions as well. 
@@ -865,7 +882,10 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize) } } -void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset) +void clMinSquareValEx( + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -899,7 +919,6 @@ void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t s } } - static void MakeMask(double extmul, double extoff, double mul, double offset, double scaler, double *result) @@ -1034,12 +1053,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz ocl.releaseMemChannels(xyb_dc); } - -void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, - ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/) +void clMaskEx( + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize) { - clDiffPrecomputeEx(rgb, rgb2, xsize, ysize, mask); + clDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); for (int i = 0; i < 3; i++) { clAverage5x5Ex(mask.ch[i], xsize, ysize); @@ -1064,15 +1083,15 @@ void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2, } void clCombineChannelsEx( + cl_mem result/*out*/, const ocl_channels &mask, const ocl_channels &mask_dc, - cl_mem block_diff_dc, - cl_mem block_diff_ac, - cl_mem edge_detector_map, - size_t xsize, size_t ysize, - size_t res_xsize, - size_t step, - cl_mem result/*out*/) + const size_t xsize, const size_t ysize, + const cl_mem block_diff_dc, + const cl_mem block_diff_ac, + const cl_mem edge_detector_map, + const size_t res_xsize, + const size_t step) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -1114,7 +1133,7 @@ void clCombineChannelsEx( } } -void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int 
step) +void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -1156,7 +1175,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step clReleaseMemObject(mem_diffmap); } -void clRemoveBorderEx(cl_mem in, size_t xsize, size_t ysize, int step, cl_mem out) +void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step) { cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -1210,7 +1229,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) } } -void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step) +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) { clUpsampleSquareRootEx(diffmap, xsize, ysize, step); @@ -1223,7 +1242,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, ocl_args_d_t &ocl = getOcl(); cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); - clRemoveBorderEx(diffmap, xsize, ysize, step, blurred); + clRemoveBorderEx(blurred, diffmap, xsize, ysize, step); static const double border_ratio = 0.03027655136; clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index a8be9a42..f1aa6c22 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -40,58 +40,105 @@ void clMask( const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2); -void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/, +void clConvolutionEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio); + +void clConvolutionX( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t 
len, + int xstep, int offset, double border_ratio); + +void clConvolutionY( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, double border_ratio); + +void clUpsampleEx2( + cl_mem result/*out*/, + const cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep); + +void clUpsampleEx( + cl_mem result/*out*/, + const cl_mem image, + const size_t xsize, const size_t ysize, + const size_t xstep, const size_t ystep); + +void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cl_mem result = nullptr/*out, opt*/); + +void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, + double sigma, double border_ratio, + cl_mem result = NULL/*out, opt*/); + +void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize); + +void clMaskHighIntensityChangeEx( + ocl_channels &xyb0/*in,out*/, ocl_channels &xyb1/*in,out*/, const size_t xsize, const size_t ysize); -void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, - ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/); - -void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/); +void clEdgeDetectorMapEx( + cl_mem result/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clBlockDiffMapEx( + cl_mem block_diff_dc/*out*/, + cl_mem block_diff_ac/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clEdgeDetectorLowFreqEx( + cl_mem block_diff_ac/*in,out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clDiffPrecomputeEx( + ocl_channels 
&mask/*out*/, + const ocl_channels &xyb0, const ocl_channels &xyb1, + const size_t xsize, const size_t ysize); -void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step, - cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/); +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); -void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step, - cl_mem block_diff_ac/*in,out*/); +void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize); -void clBlurEx(cl_mem image, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, cl_mem result = nullptr); +void clMinSquareValEx( + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset); -void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize); +void clMaskEx( + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize); void clCombineChannelsEx( + cl_mem result/*out*/, const ocl_channels &mask, const ocl_channels &mask_dc, - cl_mem block_diff_dc, - cl_mem block_diff_ac, - cl_mem edge_detector_map, - size_t xsize, size_t ysize, - size_t res_xsize, - size_t step, - cl_mem result/*out*/); - -void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize, - cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio, - cl_mem result/*out*/); - -void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset); + const size_t xsize, const size_t ysize, + const cl_mem block_diff_dc, + const cl_mem block_diff_ac, + const cl_mem edge_detector_map, + const size_t res_xsize, + const size_t step); -void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, - size_t xstep, 
size_t ystep, - cl_mem result/*out*/); +void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step); void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step); -void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); +void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step); -void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/); +void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int step, const cl_mem in); -void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize); +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); class guetzli::OutputImage; diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 58a23d35..28ae9d1b 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -83,7 +83,7 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b, ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); cl_mem edge = ocl.allocMem(edgemap_size); - clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge); + clEdgeDetectorMapEx(edge, xyb0, xyb1, xsize, ysize, step); cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); @@ -117,7 +117,7 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b, cl_mem block_diff_dc = ocl.allocMem(reschannel_size); cl_mem block_diff_ac = ocl.allocMem(reschannel_size); - clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac); + clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, 
&err); cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); @@ -156,7 +156,7 @@ void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, cl_mem block_diff_ac = ocl.allocMem(reschannel_size, orign_ac); - clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac); + clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); @@ -187,7 +187,7 @@ void tclMask(const float* r, const float* g, const float* b, ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/); + clMaskEx(mask/*out*/, mask_dc/*out*/, rgb, rgb2, xsize, ysize); cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); @@ -237,10 +237,10 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const ocl_channels mask_dc = ocl.allocMemChannels(channel_size, mask_xyb_dc_x, mask_xyb_dc_y, mask_xyb_dc_b); cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_channel_size, block_diff_dc); cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_channel_size, block_diff_ac); - cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float), edge_detector_map); - cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float), init_result); + cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_channel_size, edge_detector_map); + cl_mem cl_result = ocl.allocMem(res_channel_size, init_result); - clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, 
cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, res_xsize, step, cl_result); + clCombineChannelsEx(cl_result, mask, mask_dc, xsize, ysize, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, res_xsize, step); cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err); @@ -316,7 +316,7 @@ void tclConvolution(size_t xsize, size_t ysize, cl_mem i = ocl.allocMem(inp_size, inp); cl_mem m = ocl.allocMem(multipliers_size, multipliers); - clConvolutionEx(i, xsize, ysize, m, len, xstep, offset, border_ratio, r); + clConvolutionEx(r, i, xsize, ysize, m, len, xstep, offset, border_ratio); cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); @@ -345,7 +345,7 @@ void tclUpsample(float* image, size_t xsize, size_t ysize, ocl.allocA(result_size); cl_mem r = ocl.srcA; - clUpsampleEx(img, xsize, ysize, xstep, ystep, r); + clUpsampleEx(r, img, xsize, ysize, xstep, ystep); cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); err = clFinish(ocl.commandQueue); @@ -372,7 +372,7 @@ void tclDiffPrecompute( ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size, xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); ocl_channels cl_mask = ocl.allocMemChannels(channel_size); - clDiffPrecomputeEx(cl_xyb0, cl_xyb1, xsize, ysize, cl_mask); + clDiffPrecomputeEx(cl_mask, cl_xyb0, cl_xyb1, xsize, ysize); cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); cl_float *r_y = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.y, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); From b0d7b80346790b41204a419a6f598f518407c7fc Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 24 May 2017 20:21:16 +0800 Subject: [PATCH 
111/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?= =?UTF-8?q?=E8=A7=84=E8=8C=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 707 ++++++++++++++++++----------------- clguetzli/clguetzli.cpp | 187 ++++----- clguetzli/clguetzli.h | 2 +- clguetzli/clguetzli_test.cpp | 28 -- clguetzli/clguetzli_test.h | 4 - clguetzli/ocl.h | 1 - 6 files changed, 432 insertions(+), 497 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 9722b08d..9639c018 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -7,6 +7,7 @@ #define kDCTBlockSize (kBlockEdge * kBlockEdge) #define kBlockEdgeHalf (kBlockEdge / 2) #define kBlockHalf (kBlockEdge * kBlockEdgeHalf) +#define kComputeBlockSize (kBlockSize * 3) void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); double InterpolateClampNegative(__global const double *array, int size, double sx); @@ -31,64 +32,52 @@ void Butteraugli8x8CornerEdgeDetectorDiff( __global const float *r2, __global const float* g2, __global const float *b2, double* diff_xyb); -__kernel void clOpsinDynamicsImage( - __global float *r, __global float *g, __global float *b, - __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred, - int size) +__kernel void clConvolution( + __global float* result, + __global const float* inp, const int xsize, + __global const float* multipliers, const int len, + const int xstep, const int offset, const float border_ratio) { - const int i = get_global_id(0); - double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; - double pre_mixed[3]; - OpsinAbsorbance(pre, pre_mixed); - - double sensitivity[3]; - sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; - sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; - sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + const int ox = get_global_id(0); + const int y = get_global_id(1); - double 
cur_rgb[3] = { r[i], g[i], b[i] }; - double cur_mixed[3]; - OpsinAbsorbance(cur_rgb, cur_mixed); - cur_mixed[0] *= sensitivity[0]; - cur_mixed[1] *= sensitivity[1]; - cur_mixed[2] *= sensitivity[2]; + const int oxsize = get_global_size(0); + const int ysize = get_global_size(1); - double x, y, z; - RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); - r[i] = x; - g[i] = y; - b[i] = z; -} + const int x = ox * xstep; -__kernel void clMinSquareVal(__global const float* pA, __global float* pC, int square_size, int offset) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - const int width = get_global_size(0); - const int height = get_global_size(1); + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } - int minH = offset > y ? 0 : y - offset; - int maxH = min(y + square_size - offset, height); + int minx = x < offset ? 0 : x - offset; + int maxx = min(xsize, x + len - offset); - int minW = offset > x ? 
0 : x - offset; - int maxW = min(x + square_size - offset, width); + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } - float minValue = pA[minH * width + minW]; + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; - for (int j = minH; j < maxH; j++) + float sum = 0.0; + for (int j = minx; j < maxx; j++) { - for (int i = minW; i < maxW; i++) - { - float tmp = pA[j * width + i]; - if (tmp < minValue) minValue = tmp; - } + sum += inp[y * xsize + j] * multipliers[j - x + offset]; } - pC[y * width + x] = minValue; + result[ox * ysize + y] = sum * scale; } -__kernel void clConvolutionX(__global const float* multipliers, __global const float* inp, __global float* result, - int step, int len, int offset, float border_ratio) +__kernel void clConvolutionX( + __global float* result, + __global const float* inp, + __global const float* multipliers, const int len, + const int step, const int offset, const float border_ratio) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -125,8 +114,11 @@ __kernel void clConvolutionX(__global const float* multipliers, __global const f result[y * xsize + x] = sum * scale; } -__kernel void clConvolutionY(__global const float* multipliers, __global const float* inp, __global float* result, - int step, int len, int offset, float border_ratio) +__kernel void clConvolutionY( + __global float* result, + __global const float* inp, + __global const float* multipliers, const int len, + const int step, const int offset, const float border_ratio) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -164,147 +156,270 @@ __kernel void clConvolutionY(__global const float* multipliers, __global const f result[y * xsize + x] = sum * scale; } -__kernel void clConvolution(__global const float* multipliers, __global const float* inp, __global float* result, - int xsize, int xstep, int len, int offset, float 
border_ratio) +__kernel void clSquareSample( + __global float* result, + __global const float* image, + const int xstep, const int ystep) { - const int ox = get_global_id(0); + const int x = get_global_id(0); const int y = get_global_id(1); - const int oxsize = get_global_size(0); - const int ysize = get_global_size(1); + int x_sample = x - x % xstep; + int y_sample = y - y % ystep; - const int x = ox * xstep; + if (x_sample == x && y_sample == y) return; - float weight_no_border = 0; - for (int j = 0; j <= 2 * offset; j++) - { - weight_no_border += multipliers[j]; - } + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - int minx = x < offset ? 0 : x - offset; - int maxx = min(xsize, x + len - offset); + result[y * xsize + x] = image[y_sample * xsize + x_sample]; +} - float weight = 0.0; - for (int j = minx; j < maxx; j++) - { - weight += multipliers[j - x + offset]; - } +__kernel void clOpsinDynamicsImage( + __global float *r, __global float *g, __global float *b, + __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred) +{ + const int i = get_global_id(0); + double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre, pre_mixed); - weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; - float scale = 1.0 / weight; + double sensitivity[3]; + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; - float sum = 0.0; - for (int j = minx; j < maxx; j++) - { - sum += inp[y * xsize + j] * multipliers[j - x + offset]; - } + double cur_rgb[3] = { r[i], g[i], b[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; - result[ox * ysize + y] = sum * scale; + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], 
cur_mixed[2], &x, &y, &z); + r[i] = x; + g[i] = y; + b[i] = z; } -__kernel void clSquareSample(__global const float* pA, __global float* pC, int xstep, int ystep) +__kernel void clMaskHighIntensityChange( + __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global const float *c0_x, __global const float *c0_y, __global const float *c0_b, + __global const float *c1_x, __global const float *c1_y, __global const float *c1_b +) { const int x = get_global_id(0); const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - int x_sample = x - x % xstep; - int y_sample = y - y % ystep; + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5, + (c0_y[ix] + c1_y[ix]) * 0.5, + (c0_b[ix] + c1_b[ix]) * 0.5, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); - if (x_sample == x && y_sample == y) return; + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. 
+ xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); - pC[y * xsize + x] = pA[y_sample * xsize + x_sample]; + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); } -__kernel void clDownSample(__global const float* pA, __global float* pC, int xstep, int ystep) +__kernel void clEdgeDetectorMap( + __global float *result, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + int xsize, int ysize, int step) { - const int x = get_global_id(0); - const int y = get_global_id(1); + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); - const int oxsize = (xsize + xstep - 1) / xstep; + int pos_x = res_x * step; + int pos_y = res_y * step; - const int sample_x = x / xstep; - const int sample_y = y / ystep; + if (pos_x >= xsize - (8 - step)) return; + if (pos_y >= ysize - (8 - step)) return; - pC[y * xsize + x] = pA[sample_y * oxsize + sample_x]; -} + pos_x = min(pos_x, xsize - 8); + pos_y = min(pos_y, ysize - 8); -__kernel void clScaleImage(double scale, __global float *result) -{ - const int i = get_global_id(0); - result[i] *= scale; + double diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize, + r, g, b, + r2, g2, b2, + &diff_xyb[0]); + + int idx = (res_y * res_xsize + res_x) * 3; + result[idx] = diff_xyb[0]; + result[idx + 1] = diff_xyb[1]; + result[idx + 2] = diff_xyb[2]; } 
-__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out) + +__kernel void clBlockDiffMap( + __global float* block_diff_dc, __global float* block_diff_ac, + __global const float* r, __global const float* g, __global const float* b, + __global const float* r2, __global const float* g2, __global const float* b2, + int xsize, int ysize, int step) { - const int x = get_global_id(0); - const int y = get_global_id(1); + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); - out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; -} + int pos_x = res_x * step; + int pos_y = res_y * step; -__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + if ((pos_x + kBlockEdge - step - 1) >= xsize) return; + if ((pos_y + kBlockEdge - step - 1) >= ysize) return; - if (x >= xsize - s || - y >= ysize - s) - { - return; - } + size_t res_ix = res_y * res_xsize + res_x; + size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8); - const double mul1 = 24.8235314874; - out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; + double block0[3 * kBlockEdge * kBlockEdge]; + double block1[3 * kBlockEdge * kBlockEdge]; + double *block0_r = &block0[0]; + double *block0_g = &block0[kBlockEdge * kBlockEdge]; + double *block0_b = &block0[2 * kBlockEdge * kBlockEdge]; + + double *block1_r = &block1[0]; + double *block1_g = &block1[kBlockEdge * kBlockEdge]; + double *block1_b = &block1[2 * kBlockEdge * kBlockEdge]; + + for (int y = 0; y < kBlockEdge; y++) + { + for (int x = 0; x < kBlockEdge; x++) + { + block0_r[kBlockEdge * y + x] = r[offset + y * 
xsize + x]; + block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x]; + block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x]; + block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x]; + block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x]; + block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x]; + } + } + + double diff_xyb_dc[3] = { 0.0 }; + double diff_xyb_ac[3] = { 0.0 }; + double diff_xyb_edge_dc[3] = { 0.0 }; + + ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + + for (int i = 0; i < 3; i++) + { + block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i]; + block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i]; + } } -__kernel void clCombineChannels( - __global const float *mask_x, __global const float *mask_y, __global const float *mask_b, - __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b, - __global const float *block_diff_dc, - __global const float *block_diff_ac, - __global float *edge_detector_map, - int xsize, int ysize, - int res_xsize, - int step, - __global float *result) +__kernel void clEdgeDetectorLowFreq( + __global float *block_diff_ac, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + int xsize, int ysize, int step) { - const int res_x = get_global_id(0) * step; - const int res_y = get_global_id(1) * step; + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); - double mask[3]; - double dc_mask[3]; - mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; - dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)]; + if (res_x < 8 / step) return; - mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; - dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); - mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; - dc_mask[2] = mask_dc_b[(res_y + 3) * 
xsize + (res_x + 3)]; + int pos_x = (res_x - (8 / step)) * step; + int pos_y = res_y * step; - size_t res_ix = (res_y * res_xsize + res_x) / step; - result[res_ix] = (float)( - DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + - DotProduct(&block_diff_ac[3 * res_ix], mask) + - DotProduct(&edge_detector_map[3 * res_ix], mask)); + if (pos_x + 8 >= xsize) return; + if (pos_y + 8 >= ysize) return; + + int ix = pos_y * xsize + pos_x; + + double diff[4][3]; + __global const float* blurred0[3] = { r, g, b }; + __global const float* blurred1[3] = { r2, g2, b2 }; + + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize + 6; + diff[2][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize - 6; + diff[3][i] = pos_x < 8 ? 
0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + double max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + double diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]); + } + } + + int res_ix = res_y * res_xsize + res_x; + + const double kMul = 10; + + block_diff_ac[res_ix * 3] += max_diff_xyb[0] * kMul; + block_diff_ac[res_ix * 3 + 1] += max_diff_xyb[1] * kMul; + block_diff_ac[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; } __kernel void clDiffPrecompute( + __global float *mask_x, __global float *mask_y, __global float *mask_b, __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b, - __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b, - __global float *mask_x, __global float *mask_y, __global float *mask_b) + __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -367,99 +482,79 @@ __kernel void clDiffPrecompute( mask_b[ix] = (float)(m); } -__kernel void clEdgeDetectorMap(__global float *result, - __global const float *r, __global const float *g, __global const float* b, - __global const float *r2, __global const float* g2, __global const float *b2, - int xsize, int ysize, int step) +__kernel void clScaleImage(__global float *img, double scale) { - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); - - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); + const int i = get_global_id(0); + img[i] *= scale; +} - int pos_x = res_x * step; - int pos_y = res_y * step; +#define Average5x5_w 0.679144890667f +__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w); +__kernel void clAverage5x5(__global float *img, __global const 
float *img_org) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int xsize = get_global_size(0); + const int ysize = get_global_size(1); - if (pos_x >= xsize - (8 - step)) return; - if (pos_y >= ysize - (8 - step)) return; + const int row0 = y * xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[row0 + x - 1]; + } + if (x + 1 < xsize) { + img[row0 + x] += img_org[row0 + x + 1]; + } - pos_x = min(pos_x, xsize - 8); - pos_y = min(pos_y, ysize - 8); + if (y > 0) { + const int rowd1 = row0 - xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w; + } + img[row0 + x] += img_org[rowd1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w; + } + } - double diff_xyb[3] = { 0.0 }; - Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize, - r, g, b, - r2, g2, b2, - &diff_xyb[0]); + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w; + } + img[row0 + x] += img_org[rowu1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w; + } + } - int idx = (res_y * res_xsize + res_x) * 3; - result[idx] = diff_xyb[0]; - result[idx + 1] = diff_xyb[1]; - result[idx + 2] = diff_xyb[2]; + img[row0 + x] *= Average5x5_scale; } -__kernel void clEdgeDetectorLowFreq(__global float *result, - __global const float *r, __global const float *g, __global const float* b, - __global const float *r2, __global const float* g2, __global const float *b2, - int xsize, int ysize, int step) +__kernel void clMinSquareVal(__global float* result, __global const float* img, int square_size, int offset) { - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); - - if (res_x < 8 / step) return; - - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); - - int pos_x = (res_x - (8 / step)) * step; - int pos_y = res_y * step; + const int x = 
get_global_id(0); + const int y = get_global_id(1); + const int width = get_global_size(0); + const int height = get_global_size(1); - if (pos_x + 8 >= xsize) return; - if (pos_y + 8 >= ysize) return; + int minH = offset > y ? 0 : y - offset; + int maxH = min(y + square_size - offset, height); - int ix = pos_y * xsize + pos_x; + int minW = offset > x ? 0 : x - offset; + int maxW = min(x + square_size - offset, width); - double diff[4][3]; - __global const float* blurred0[3] = { r, g, b }; - __global const float* blurred1[3] = { r2, g2, b2 }; + float minValue = img[minH * width + minW]; - for (int i = 0; i < 3; ++i) { - int ix2 = ix + 8; - diff[0][i] = - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - ix2 = ix + 8 * xsize; - diff[1][i] = - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - ix2 = ix + 6 * xsize + 6; - diff[2][i] = - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - ix2 = ix + 6 * xsize - 6; - diff[3][i] = pos_x < 8 ? 
0 : - ((blurred1[i][ix] - blurred0[i][ix]) + - (blurred0[i][ix2] - blurred1[i][ix2])); - } - double max_diff_xyb[3] = { 0 }; - for (int k = 0; k < 4; ++k) { - double diff_xyb[3] = { 0 }; - XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2], - 0, 0, 0, 1.0, - diff_xyb); - for (int i = 0; i < 3; ++i) { - max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]); + for (int j = minH; j < maxH; j++) + { + for (int i = minW; i < maxW; i++) + { + float tmp = img[j * width + i]; + if (tmp < minValue) minValue = tmp; } } - int res_ix = res_y * res_xsize + res_x; - - const double kMul = 10; - - result[res_ix * 3] += max_diff_xyb[0] * kMul; - result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul; - result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; + result[y * width + x] = minValue; } __kernel void clDoMask( @@ -495,118 +590,65 @@ __kernel void clDoMask( } -__kernel void clBlockDiffMap(__global const float* r, __global const float* g, __global const float* b, - __global const float* r2, __global const float* g2, __global const float* b2, - __global float* block_diff_dc, __global float* block_diff_ac, - int xsize, int ysize, int step) +__kernel void clCombineChannels( + __global float *result, + __global const float *mask_x, __global const float *mask_y, __global const float *mask_b, + __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b, + const int xsize, const int ysize, + __global const float *block_diff_dc, + __global const float *block_diff_ac, + __global float *edge_detector_map, + const int res_xsize, + const int step) { - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); - - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); - - int pos_x = res_x * step; - int pos_y = res_y * step; - - if ((pos_x + kBlockEdge - step - 1) >= xsize) return; - if ((pos_y + kBlockEdge - step - 1) >= ysize) return; - - size_t res_ix = res_y * res_xsize + res_x; - size_t offset = 
min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8); + const int res_x = get_global_id(0) * step; + const int res_y = get_global_id(1) * step; - double block0[3 * kBlockEdge * kBlockEdge]; - double block1[3 * kBlockEdge * kBlockEdge]; + double mask[3]; + double dc_mask[3]; + mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)]; - double *block0_r = &block0[0]; - double *block0_g = &block0[kBlockEdge * kBlockEdge]; - double *block0_b = &block0[2 * kBlockEdge * kBlockEdge]; + mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; - double *block1_r = &block1[0]; - double *block1_g = &block1[kBlockEdge * kBlockEdge]; - double *block1_b = &block1[2 * kBlockEdge * kBlockEdge]; + mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; - for (int y = 0; y < kBlockEdge; y++) - { - for (int x = 0; x < kBlockEdge; x++) - { - block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x]; - block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x]; - block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x]; - block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x]; - block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x]; - block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x]; - } - } + size_t res_ix = (res_y * res_xsize + res_x) / step; + result[res_ix] = (float)( + DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + + DotProduct(&block_diff_ac[3 * res_ix], mask) + + DotProduct(&edge_detector_map[3 * res_ix], mask)); +} - double diff_xyb_dc[3] = { 0.0 }; - double diff_xyb_ac[3] = { 0.0 }; - double diff_xyb_edge_dc[3] = { 0.0 }; +__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); - ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + const int 
xsize = get_global_size(0); + const int ysize = get_global_size(1); - for (int i = 0; i < 3; i++) - { - block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i]; - block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i]; - } + out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; } -__kernel void clMaskHighIntensityChange( - __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, - __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, - __global const float *c0_x, __global const float *c0_y, __global const float *c0_b, - __global const float *c1_x, __global const float *c1_y, __global const float *c1_b -) +__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in) { const int x = get_global_id(0); const int y = get_global_id(1); const int xsize = get_global_size(0); const int ysize = get_global_size(1); - size_t ix = y * xsize + x; - const double ave[3] = { - (c0_x[ix] + c1_x[ix]) * 0.5, - (c0_y[ix] + c1_y[ix]) * 0.5, - (c0_b[ix] + c1_b[ix]) * 0.5, - }; - double sqr_max_diff = -1; - { - int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; - int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; - for (int dir = 0; dir < 4; ++dir) { - if (border[dir]) { - continue; - } - const int ix2 = ix + offset[dir]; - double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; - diff *= diff; - if (sqr_max_diff < diff) { - sqr_max_diff = diff; - } - } - } - const double kReductionX = 275.19165240059317; - const double kReductionY = 18599.41286306991; - const double kReductionZ = 410.8995306951065; - const double kChromaBalance = 106.95800948271017; - double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); - - const double mix[3] = { - chroma_scale * kReductionX / (sqr_max_diff + kReductionX), - kReductionY / (sqr_max_diff + kReductionY), - chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), - }; - // Interpolate lineraly between the average color and the actual - // color -- to reduce the importance of 
this pixel. - xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); - xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + if (x >= xsize - s || + y >= ysize - s) + { + return; + } - xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); - xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + const double mul1 = 24.8235314874; + out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; - xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); - xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); } __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out) @@ -642,47 +684,7 @@ __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int } } -#define Average5x5_w 0.679144890667f -__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w); -__kernel void clAverage5x5(__global float *img, __global const float *img_org) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); - - const int row0 = y * xsize; - if (x - 1 >= 0) { - img[row0 + x] += img_org[row0 + x - 1]; - } - if (x + 1 < xsize) { - img[row0 + x] += img_org[row0 + x + 1]; - } - - if (y > 0) { - const int rowd1 = row0 - xsize; - if (x - 1 >= 0) { - img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w; - } - img[row0 + x] += img_org[rowd1 + x]; - if (x + 1 < xsize) { - img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w; - } - } - - if (y + 1 < ysize) { - const int rowu1 = row0 + xsize; - if (x - 1 >= 0) { - img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w; - } - img[row0 + x] += img_org[rowu1 + x]; - if (x + 1 < xsize) { - img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w; - } - } - img[row0 + x] *= Average5x5_scale; -} void Butteraugli8x8CornerEdgeDetectorDiff( int pos_x, @@ -3138,7 +3140,6 @@ __kernel void 
clComputeBlockZeroingOrder( { const int block_x = get_global_id(0); const int block_y = get_global_id(1); -#define kComputeBlockSize (kBlockSize * 3) channel_info orig_channel[3]; orig_channel[0].coeff = orig_batch_0; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 67a7f2a0..3f5e46ff 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -49,7 +49,6 @@ ocl_args_d_t& getOcl(void) ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionX", &err); ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionY", &err); ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSample", &err); - ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "clDownSample", &err); ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImage", &err); ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMask", &err); ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImage", &err); @@ -313,12 +312,12 @@ void clConvolutionEx( cl_float clborder_ratio = border_ratio; cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); + clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&multipliers); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&clxstep); clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); clSetKernelArg(kernel, 7, sizeof(cl_float), 
(void*)&clborder_ratio); @@ -350,11 +349,11 @@ void clConvolutionX( cl_float clborder_ratio = border_ratio; cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&xstep); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep); clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset); clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); @@ -387,11 +386,11 @@ void clConvolutionY( cl_float clborder_ratio = border_ratio; cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&xstep); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep); clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset); clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); @@ -408,7 +407,7 @@ void clConvolutionY( } } -void clUpsampleEx2( +void clSquareSampleEx( cl_mem result/*out*/, const cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep) @@ -419,38 +418,8 @@ void 
clUpsampleEx2( cl_int clxstep = xstep; cl_int clystep = ystep; cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep); - - size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clUpsampleEx clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clUpsampleEx clFinish returned %s.\n", TranslateOpenCLError(err)); - } -} - -void clUpsampleEx( - cl_mem result/*out*/, - const cl_mem image, - const size_t xsize, const size_t ysize, - const size_t xstep, const size_t ystep) -{ - cl_int err = CL_SUCCESS; - ocl_args_d_t &ocl = getOcl(); - - cl_int clxstep = xstep; - cl_int clystep = ystep; - cl_kernel kernel = ocl.kernel[KERNEL_DOWNSAMPLE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&image); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep); clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep); @@ -537,7 +506,7 @@ void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, ocl.allocA(sizeof(cl_float) * xsize * ysize); clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - clUpsampleEx2(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + clSquareSampleEx(result ? result : image, result ? 
result : image, xsize, ysize, xstep, xstep); } else { @@ -563,7 +532,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); - cl_int clSize = xsize * ysize; cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g); @@ -571,7 +539,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&rgb_blurred.r); clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&rgb_blurred.g); clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b); - clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize); size_t globalWorkSize[1] = { xsize * ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -710,14 +677,14 @@ void clBlockDiffMapEx( cl_int clstep = step; cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), &rgb.r); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb.g); - clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb2.r); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2.g); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &block_diff_dc); - clSetKernelArg(kernel, 7, sizeof(cl_mem), &block_diff_ac); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_dc); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &block_diff_ac); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.r); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb.g); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb.b); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.r); + clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2.g); + clSetKernelArg(kernel, 7, sizeof(cl_mem), &rgb2.b); 
clSetKernelArg(kernel, 8, sizeof(cl_int), &clxsize); clSetKernelArg(kernel, 9, sizeof(cl_int), &clysize); clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep); @@ -802,15 +769,15 @@ void clDiffPrecomputeEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.x); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.y); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.x); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.y); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.x); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.y); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mask.b); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.x); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.y); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb0.x); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb0.y); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb0.b); + clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb1.x); + clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb1.y); + clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb1.b); size_t globalWorkSize[2] = { xsize, ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -833,8 +800,8 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) cl_double clscale = w; cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; - clSetKernelArg(kernel, 0, sizeof(cl_double), (void*)&clscale); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); + clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&clscale); size_t globalWorkSize[1] = { size }; err = 
clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -851,35 +818,35 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize) { - if (xsize < 4 || ysize < 4) { - // TODO: Make this work for small dimensions as well. - return; - } + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. + return; + } - cl_int err = CL_SUCCESS; - ocl_args_d_t &ocl = getOcl(); + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); - size_t len = xsize * ysize * sizeof(float); - ocl.allocA(len); - cl_mem tmp = ocl.srcA; + size_t len = xsize * ysize * sizeof(float); + ocl.allocA(len); + cl_mem img_org = ocl.srcA; - err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp, 0, 0, len, 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL); - cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp); + cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5]; + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img_org); - size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { + size_t globalWorkSize[2] = { xsize, ysize }; + err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + if (CL_SUCCESS != err) + { LogError("Error: clAverage5x5Ex() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { + } + err = clFinish(ocl.commandQueue); + if (CL_SUCCESS != err) + { LogError("Error: clAverage5x5Ex() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + } } void 
clMinSquareValEx( @@ -895,8 +862,8 @@ void clMinSquareValEx( ocl.allocA(sizeof(cl_float) * xsize * ysize); cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size); clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset); @@ -1105,20 +1072,20 @@ void clCombineChannelsEx( cl_int clstep = step; cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&block_diff_dc); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&block_diff_ac); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&edge_detector_map); - clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize); - clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clres_size); - clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clstep); - clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.r); + clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.g); + clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask.b); + clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.r); + clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.g); + clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask_dc.b); + 
clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clxsize); + clSetKernelArg(kernel, 8, sizeof(cl_int), (void*)&clysize); + clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&block_diff_dc); + clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&block_diff_ac); + clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&edge_detector_map); + clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_size); + clSetKernelArg(kernel, 13, sizeof(cl_int), (void*)&clstep); size_t globalWorkSize[2] = { work_xsize, work_ysize }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index f1aa6c22..de90c9b0 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -58,7 +58,7 @@ void clConvolutionY( const cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio); -void clUpsampleEx2( +void clSquareSampleEx( cl_mem result/*out*/, const cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep); diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 28ae9d1b..a19121c1 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -330,34 +330,6 @@ void tclConvolution(size_t xsize, size_t ysize, clReleaseMemObject(m); } -// chirsk todo -void tclUpsample(float* image, size_t xsize, size_t ysize, - size_t xstep, size_t ystep, - float* result) -{ - int dxsize = (xsize + xstep - 1) / xstep; - int dysize = (ysize + ystep - 1) / ystep; - size_t img_size = dxsize * dysize * sizeof(float); - size_t result_size = xsize * ysize * sizeof(float); - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - cl_mem img = ocl.allocMem(img_size, image); - ocl.allocA(result_size); - cl_mem r = ocl.srcA; - - clUpsampleEx(r, img, xsize, ysize, xstep, ystep); - - cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - - 
FLOAT_COMPARE(result, r_r, xsize * ysize); - - clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - - clReleaseMemObject(img); -} - // ian todo void tclDiffPrecompute( const const std::vector > &xyb0, diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index a84b94ac..b27c7942 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -72,7 +72,3 @@ void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_ void tclMinSquareVal(const float *img, size_t square_size, size_t offset, size_t xsize, size_t ysize, const float *result); - -void tclUpsample(const float* image, size_t xsize, size_t ysize, - size_t xstep, size_t ystep, - const float* result); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index ed2f1ee2..802ded26 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -50,7 +50,6 @@ enum KernelName { KERNEL_CONVOLUTIONX, KERNEL_CONVOLUTIONY, KERNEL_SQUARESAMPLE, - KERNEL_DOWNSAMPLE, KERNEL_OPSINDYNAMICSIMAGE, KERNEL_DOMASK, KERNEL_SCALEIMAGE, From f5fcd1bd458e6e105dd59fa1ffa78a2cd52fa9af Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 25 May 2017 13:37:50 +0800 Subject: [PATCH 112/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- clguetzli/clguetzli.cpp | 5 +++++ clguetzli/clguetzli_test.cpp | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 3f5e46ff..16d614d9 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -200,6 +200,9 @@ void clComputeBlockZeroingOrder( cl_int clFactor = factor; cl_int clMask = comp_mask; + clEnqueueWriteBuffer(ocl.commandQueue, mem_output_order_batch, CL_FALSE, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]); 
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]); @@ -443,6 +446,7 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, clBlurEx2(image, xsize, ysize, sigma, border_ratio, result); return; +/* double m = 2.25; // Accuracy increases when m is increased. const double scaler = -1.0 / (2 * sigma * sigma); // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} @@ -479,6 +483,7 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, } clReleaseMemObject(mem_expn); +*/ } void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index a19121c1..bbfdb970 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -332,8 +332,8 @@ void tclConvolution(size_t xsize, size_t ysize, // ian todo void tclDiffPrecompute( - const const std::vector > &xyb0, - const const std::vector > &xyb1, + const std::vector > &xyb0, + const std::vector > &xyb1, size_t xsize, size_t ysize, const std::vector > *mask_cmp) { From bb1e067909222e135e654eb12b7dbc9337a5b5db Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 25 May 2017 17:16:49 +0800 Subject: [PATCH 113/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=AD=A3=E5=8F=82=E6=95=B0=E4=BC=A0=E9=80=92?= =?UTF-8?q?=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 371 +++++++++++++++++++++------------------- clguetzli/clguetzli.cpp | 64 +++---- clguetzli/clguetzli.h | 10 +- clguetzli/ocl.h | 24 +-- 4 files changed, 240 insertions(+), 229 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 9639c018..644a009a 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -9,6 +9,19 @@ #define kBlockHalf (kBlockEdge * kBlockEdgeHalf) #define kComputeBlockSize (kBlockSize * 3) +// IntFloatPairÊÇΪÁËÄ£Äâoutput_order 
input_orderµÄvector +typedef struct __IntFloatPair +{ + int idx; + float err; +}IntFloatPair, DCTScoreData, CoeffData; + +typedef struct __IntFloatPairList +{ + int size; + IntFloatPair *pData; +}IntFloatPairList; + void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); double InterpolateClampNegative(__global const double *array, int size, double sx); void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, @@ -32,7 +45,25 @@ void Butteraugli8x8CornerEdgeDetectorDiff( __global const float *r2, __global const float* g2, __global const float *b2, double* diff_xyb); -__kernel void clConvolution( +int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order); + +double CompareBlockFactor(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height, + const int factor); + +void floatcopy(float *dst, const float *src, int size); +void coeffcopy(coeff_t *dst, const coeff_t *src, int size); +void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size); +int list_erase(IntFloatPairList* list, int idx); +int list_push_back(IntFloatPairList* list, int i, float f); + +__kernel void clConvolutionEx( __global float* result, __global const float* inp, const int xsize, __global const float* multipliers, const int len, @@ -73,7 +104,7 @@ __kernel void clConvolution( result[ox * ysize + y] = sum * scale; } -__kernel void clConvolutionX( +__kernel void clConvolutionXEx( __global float* result, __global const float* inp, __global const float* multipliers, const int len, @@ -114,7 +145,7 @@ __kernel void clConvolutionX( result[y * xsize + x] = sum * scale; } -__kernel void clConvolutionY( +__kernel void clConvolutionYEx( __global float* result, __global const float* inp, __global const float* 
multipliers, const int len, @@ -156,7 +187,7 @@ __kernel void clConvolutionY( result[y * xsize + x] = sum * scale; } -__kernel void clSquareSample( +__kernel void clSquareSampleEx( __global float* result, __global const float* image, const int xstep, const int ystep) @@ -175,7 +206,7 @@ __kernel void clSquareSample( result[y * xsize + x] = image[y_sample * xsize + x_sample]; } -__kernel void clOpsinDynamicsImage( +__kernel void clOpsinDynamicsImageEx( __global float *r, __global float *g, __global float *b, __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred) { @@ -203,7 +234,7 @@ __kernel void clOpsinDynamicsImage( b[i] = z; } -__kernel void clMaskHighIntensityChange( +__kernel void clMaskHighIntensityChangeEx( __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, __global const float *c0_x, __global const float *c0_y, __global const float *c0_b, @@ -260,7 +291,7 @@ __kernel void clMaskHighIntensityChange( xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); } -__kernel void clEdgeDetectorMap( +__kernel void clEdgeDetectorMapEx( __global float *result, __global const float *r, __global const float *g, __global const float* b, __global const float *r2, __global const float* g2, __global const float *b2, @@ -294,7 +325,7 @@ __kernel void clEdgeDetectorMap( } -__kernel void clBlockDiffMap( +__kernel void clBlockDiffMapEx( __global float* block_diff_dc, __global float* block_diff_ac, __global const float* r, __global const float* g, __global const float* b, __global const float* r2, __global const float* g2, __global const float* b2, @@ -352,7 +383,7 @@ __kernel void clBlockDiffMap( } } -__kernel void clEdgeDetectorLowFreq( +__kernel void clEdgeDetectorLowFreqEx( __global float *block_diff_ac, __global const float *r, __global const float *g, __global const float* b, __global const float *r2, __global const float* g2, 
__global const float *b2, @@ -416,7 +447,7 @@ __kernel void clEdgeDetectorLowFreq( block_diff_ac[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; } -__kernel void clDiffPrecompute( +__kernel void clDiffPrecomputeEx( __global float *mask_x, __global float *mask_y, __global float *mask_b, __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b, __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b) @@ -482,7 +513,7 @@ __kernel void clDiffPrecompute( mask_b[ix] = (float)(m); } -__kernel void clScaleImage(__global float *img, double scale) +__kernel void clScaleImageEx(__global float *img, double scale) { const int i = get_global_id(0); img[i] *= scale; @@ -490,7 +521,7 @@ __kernel void clScaleImage(__global float *img, double scale) #define Average5x5_w 0.679144890667f __constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w); -__kernel void clAverage5x5(__global float *img, __global const float *img_org) +__kernel void clAverage5x5Ex(__global float *img, __global const float *img_org) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -530,7 +561,7 @@ __kernel void clAverage5x5(__global float *img, __global const float *img_org) img[row0 + x] *= Average5x5_scale; } -__kernel void clMinSquareVal(__global float* result, __global const float* img, int square_size, int offset) +__kernel void clMinSquareValEx(__global float* result, __global const float* img, int square_size, int offset) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -557,7 +588,7 @@ __kernel void clMinSquareVal(__global float* result, __global const float* img, result[y * width + x] = minValue; } -__kernel void clDoMask( +__kernel void clDoMaskEx( __global float *mask_x, __global float *mask_y, __global float *mask_b, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, __global const double *lut_x, __global const double *lut_y, __global const double *lut_b, @@ 
-590,7 +621,7 @@ __kernel void clDoMask( } -__kernel void clCombineChannels( +__kernel void clCombineChannelsEx( __global float *result, __global const float *mask_x, __global const float *mask_y, __global const float *mask_b, __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b, @@ -622,7 +653,40 @@ __kernel void clCombineChannels( DotProduct(&edge_detector_map[3 * res_ix], mask)); } -__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out) +__kernel void clUpsampleSquareRootEx(__global float *diffmap_out, __global const float *diffmap, int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + const int pos_x = res_x * step; + const int pos_y = res_y * step; + + if (pos_y + 8 - step >= ysize) return; + if (pos_x + 8 - step >= xsize) return; + + int s2 = (8 - step) / 2; + + // Upsample and take square root. + float orig_val = diffmap[res_y * res_xsize + res_x]; + + const float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? 
kInitialSlope * orig_val + : sqrt(orig_val); + + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val; + } + } +} + +__kernel void clRemoveBorderEx(__global float *out, __global const float *in, int in_xsize, int s, int s2) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -633,7 +697,7 @@ __kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; } -__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in) +__kernel void clAddBorderEx(__global float *out, int s, int s2, __global const float *in) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -651,40 +715,131 @@ __kernel void clAddBorder(__global float *out, int s, int s2, __global const flo } -__kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out) +// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é +__kernel void clComputeBlockZeroingOrderEx( + __global const coeff_t *orig_batch_0, // ԭʼͼÏñϵÊý + __global const coeff_t *orig_batch_1, // ԭʼͼÏñϵÊý + __global const coeff_t *orig_batch_2, // ԭʼͼÏñϵÊý + __global const float *orig_image_batch, // ԭʼͼÏñpregamma + __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý + const int image_width, + const int image_height, + + __global const coeff_t *mayout_batch_0, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + __global const coeff_t *mayout_batch_1, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + __global const coeff_t *mayout_batch_2, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + __global const ushort *mayout_pixel_0, + __global const ushort *mayout_pixel_1, + __global const ushort *mayout_pixel_2, + + const channel_info mayout_channel_0, + const channel_info mayout_channel_1, + const channel_info mayout_channel_2, + const int factor, // µ±Ç°²ÎÓëÔËËãµÄfactor + const int comp_mask, // µ±Ç°²ÎÓëÔËËãµÄchannel + const float BlockErrorLimit, 
+ __global CoeffData *output_order_list/*out*/) { - const int res_x = get_global_id(0); - const int res_y = get_global_id(1); + const int block_x = get_global_id(0); + const int block_y = get_global_id(1); - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); + channel_info orig_channel[3]; + orig_channel[0].coeff = orig_batch_0; + orig_channel[1].coeff = orig_batch_1; + orig_channel[2].coeff = orig_batch_2; - const int pos_x = res_x * step; - const int pos_y = res_y * step; + channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 }; + mayout_channel[0].coeff = mayout_batch_0; + mayout_channel[1].coeff = mayout_batch_1; + mayout_channel[2].coeff = mayout_batch_2; + mayout_channel[0].pixel = mayout_pixel_0; + mayout_channel[1].pixel = mayout_pixel_1; + mayout_channel[2].pixel = mayout_pixel_2; - if (pos_y + 8 - step >= ysize) return; - if (pos_x + 8 - step >= xsize) return; + int block_idx = 0; // ¸ù¾ÝÏÂÃæmaskÃüÖеÄchannelÀ´¼ÆËãindx - int s2 = (8 - step) / 2; + coeff_t mayout_block[kComputeBlockSize] = { 0 }; + coeff_t orig_block[kComputeBlockSize] = { 0 }; - // Upsample and take square root. 
- float orig_val = diffmap[res_y * res_xsize + res_x]; + for (int c = 0; c < 3; c++) { + if (comp_mask & (1< 0) + { + float best_err = 1e17f; + int best_i = 0; + for (int i = 0; i < min(3, input_order.size); i++) + { + coeff_t candidate_block[kComputeBlockSize]; + coeffcopy(candidate_block, processed_block, kComputeBlockSize); + + const int idx = input_order.pData[i].idx; + candidate_block[idx] = 0; + + float max_err = CompareBlockFactor(mayout_channel, + candidate_block, + block_x, + block_y, + orig_image_batch, + mask_scale, + image_width, + image_height, + factor); + if (max_err < best_err) + { + best_err = max_err; + best_i = i; + } } + + int idx = input_order.pData[best_i].idx; + processed_block[idx] = 0; + list_erase(&input_order, best_i); + + list_push_back(&output_order, idx, best_err); + } + + // ×¢Òâoutput_orderÕâÀïµÄresize¾ÍÊǰÑβ²¿µÄÖÃλ0 + float min_err = 1e10; + for (int i = output_order.size - 1; i >= 0; --i) { + min_err = min(min_err, output_order.pData[i].err); + output_order.pData[i].err = min_err; } -} + __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; + int out_count = 0; + for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) + { + // ¹ýÂ˽ϴóµÄerr£¬Õⲿ·Ö½øÈëºó¶Ë¼ÆËãûÓÐÒâÒå + if (output_order.pData[i].err <= BlockErrorLimit) + { + output_block[out_count].idx = output_order.pData[i].idx; + output_block[out_count].err = output_order.pData[i].err; + out_count++; + } + } +} void Butteraugli8x8CornerEdgeDetectorDiff( int pos_x, @@ -1394,19 +1549,6 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double * *valz = b; } -// IntFloatPairÊÇΪÁËÄ£Äâoutput_order input_orderµÄvector£¬µ«ÊÇ´óС¹Ì¶¨Îª8x8 -typedef struct __IntFloatPair -{ - int idx; - float err; -}IntFloatPair, DCTScoreData, CoeffData; - -typedef struct __IntFloatPairList -{ - int size; - IntFloatPair *pData; -}IntFloatPairList; - // chrisk todo // return size int list_push_back(IntFloatPairList* list, int i, float f) @@ 
-3113,128 +3255,3 @@ double CompareBlockFactor(const channel_info mayout_channel[3], } } -// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é -__kernel void clComputeBlockZeroingOrder( - __global const coeff_t *orig_batch_0, // ԭʼͼÏñϵÊý - __global const coeff_t *orig_batch_1, // ԭʼͼÏñϵÊý - __global const coeff_t *orig_batch_2, // ԭʼͼÏñϵÊý - __global const float *orig_image_batch, // ԭʼͼÏñpregamma - __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý - const int image_width, - const int image_height, - - __global const coeff_t *mayout_batch_0, // Êä³ö±¸Ñ¡Í¼µÄϵÊý - __global const coeff_t *mayout_batch_1, // Êä³ö±¸Ñ¡Í¼µÄϵÊý - __global const coeff_t *mayout_batch_2, // Êä³ö±¸Ñ¡Í¼µÄϵÊý - __global const ushort *mayout_pixel_0, - __global const ushort *mayout_pixel_1, - __global const ushort *mayout_pixel_2, - - const channel_info mayout_channel_0, - const channel_info mayout_channel_1, - const channel_info mayout_channel_2, - const int factor, // µ±Ç°²ÎÓëÔËËãµÄfactor - const int comp_mask, // µ±Ç°²ÎÓëÔËËãµÄchannel - const float BlockErrorLimit, - __global CoeffData *output_order_list/*out*/) -{ - const int block_x = get_global_id(0); - const int block_y = get_global_id(1); - - channel_info orig_channel[3]; - orig_channel[0].coeff = orig_batch_0; - orig_channel[1].coeff = orig_batch_1; - orig_channel[2].coeff = orig_batch_2; - - channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 }; - mayout_channel[0].coeff = mayout_batch_0; - mayout_channel[1].coeff = mayout_batch_1; - mayout_channel[2].coeff = mayout_batch_2; - mayout_channel[0].pixel = mayout_pixel_0; - mayout_channel[1].pixel = mayout_pixel_1; - mayout_channel[2].pixel = mayout_pixel_2; - - int block_idx = 0; // ¸ù¾ÝÏÂÃæmaskÃüÖеÄchannelÀ´¼ÆËãindx - - coeff_t mayout_block[kComputeBlockSize] = { 0 }; - coeff_t orig_block[kComputeBlockSize] = { 0 }; - - for (int c = 0; c < 3; c++) { - if (comp_mask & (1< 0) - { - float best_err = 1e17f; - int best_i = 0; - for (int i = 0; i < min(3, 
input_order.size); i++) - { - coeff_t candidate_block[kComputeBlockSize]; - coeffcopy(candidate_block, processed_block, kComputeBlockSize); - - const int idx = input_order.pData[i].idx; - candidate_block[idx] = 0; - - float max_err = CompareBlockFactor(mayout_channel, - candidate_block, - block_x, - block_y, - orig_image_batch, - mask_scale, - image_width, - image_height, - factor); - if (max_err < best_err) - { - best_err = max_err; - best_i = i; - } - } - - int idx = input_order.pData[best_i].idx; - processed_block[idx] = 0; - list_erase(&input_order, best_i); - - list_push_back(&output_order, idx, best_err); - } - - // ×¢Òâoutput_orderÕâÀïµÄresize¾ÍÊǰÑβ²¿µÄÖÃλ0 - float min_err = 1e10; - for (int i = output_order.size - 1; i >= 0; --i) { - min_err = min(min_err, output_order.pData[i].err); - output_order.pData[i].err = min_err; - } - - __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; - - int out_count = 0; - for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) - { - // ¹ýÂ˽ϴóµÄerr£¬Õⲿ·Ö½øÈëºó¶Ë¼ÆËãûÓÐÒâÒå - if (output_order.pData[i].err <= BlockErrorLimit) - { - output_block[out_count].idx = output_order.pData[i].idx; - output_block[out_count].err = output_order.pData[i].err; - out_count++; - } - } -} \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 16d614d9..b58f7dc8 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -44,25 +44,25 @@ ocl_args_d_t& getOcl(void) LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); } } - ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareVal", &err); - ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolution", &err); - ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionX", &err); - ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionY", &err); - ocl.kernel[KERNEL_SQUARESAMPLE] = 
clCreateKernel(ocl.program, "clSquareSample", &err); - ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImage", &err); - ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMask", &err); - ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImage", &err); - ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannels", &err); - ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChange", &err); - ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecompute", &err); - ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRoot", &err); - ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorder", &err); - ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorder", &err); - ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5", &err); - ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err); - ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err); - ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err); - ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err); + ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err); + ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err); + ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err); + ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChangeEx", &err); + ocl.kernel[KERNEL_EDGEDETECTOR] = 
clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err); + ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err); + ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err); + ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err); + ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err); + ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err); + ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err); + ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err); + ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err); + ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err); + ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err); + ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err); return ocl; } @@ -1114,14 +1114,14 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi cl_int clysize = ysize; cl_int clstep = step; - cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float)); + cl_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap); - clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_diffmap); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap_out); + clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&diffmap); + 
clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&xsize); + clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&ysize); + clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&step); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -1133,7 +1133,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); } err = clFinish(ocl.commandQueue); - err = clEnqueueCopyBuffer(ocl.commandQueue, mem_diffmap, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); + err = clEnqueueCopyBuffer(ocl.commandQueue, diffmap_out, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); @@ -1144,7 +1144,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err)); } - clReleaseMemObject(mem_diffmap); + clReleaseMemObject(diffmap_out); } void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step) @@ -1156,11 +1156,11 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz cl_int cls2 = (8 - step) / 2; cl_int clxsize = xsize; cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), &in); - clSetKernelArg(kernel, 1, sizeof(cl_int), &clxsize); - clSetKernelArg(kernel, 2, sizeof(cl_int), &cls); - clSetKernelArg(kernel, 3, sizeof(cl_int), &cls2); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &out); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &out); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &in); + clSetKernelArg(kernel, 2, sizeof(cl_int), &clxsize); + clSetKernelArg(kernel, 3, sizeof(cl_int), &cls); + clSetKernelArg(kernel, 4, 
sizeof(cl_int), &cls2); size_t globalWorkSize[2] = { xsize - cls, ysize - cls}; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index de90c9b0..b5997fcd 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -46,13 +46,13 @@ void clConvolutionEx( const cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio); -void clConvolutionX( +void clConvolutionXEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio); -void clConvolutionY( +void clConvolutionYEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, @@ -63,12 +63,6 @@ void clSquareSampleEx( const cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep); -void clUpsampleEx( - cl_mem result/*out*/, - const cl_mem image, - const size_t xsize, const size_t ysize, - const size_t xstep, const size_t ystep); - void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, cl_mem result = nullptr/*out, opt*/); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 802ded26..04407f5c 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -45,25 +45,25 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); */ enum KernelName { - KERNEL_MINSQUAREVAL = 0, - KERNEL_CONVOLUTION, + KERNEL_CONVOLUTION = 0, KERNEL_CONVOLUTIONX, KERNEL_CONVOLUTIONY, KERNEL_SQUARESAMPLE, KERNEL_OPSINDYNAMICSIMAGE, - KERNEL_DOMASK, - KERNEL_SCALEIMAGE, + KERNEL_MASKHIGHINTENSITYCHANGE, + KERNEL_EDGEDETECTOR, + KERNEL_BLOCKDIFFMAP, + KERNEL_EDGEDETECTORLOWFREQ, + KERNEL_DIFFPRECOMPUTE, + KERNEL_SCALEIMAGE, + KERNEL_AVERAGE5X5, + KERNEL_MINSQUAREVAL, + KERNEL_DOMASK, KERNEL_COMBINECHANNELS, - KERNEL_MASKHIGHINTENSITYCHANGE, - KERNEL_DIFFPRECOMPUTE, KERNEL_UPSAMPLESQUAREROOT, + 
KERNEL_REMOVEBORDER, KERNEL_ADDBORDER, - KERNEL_REMOVEBORDER, - KERNEL_AVERAGE5X5, - KERNEL_EDGEDETECTOR, - KERNEL_BLOCKDIFFMAP, - KERNEL_EDGEDETECTORLOWFREQ, - KERNEL_COMPUTEBLOCKZEROINGORDER, + KERNEL_COMPUTEBLOCKZEROINGORDER, KERNEL_COUNT, }; From 34af91ddd9028f258799e580e2e40d63764c7c58 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 31 May 2017 15:10:07 +0800 Subject: [PATCH 114/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli Conflicts: clguetzli/clguetzli.cl.cpp --- clguetzli/clguetzli.cl | 17 +++++++++-------- clguetzli/clguetzli.cl.cpp | 5 +++-- clguetzli/ocl.cpp | 21 +++++++++++++++++++-- guetzli/processor.cc | 2 +- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 644a009a..cf7bca3e 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -387,23 +387,24 @@ __kernel void clEdgeDetectorLowFreqEx( __global float *block_diff_ac, __global const float *r, __global const float *g, __global const float* b, __global const float *r2, __global const float* g2, __global const float *b2, - int xsize, int ysize, int step) + int xsize, int ysize, int step_) { const int res_x = get_global_id(0); const int res_y = get_global_id(1); - if (res_x < 8 / step) return; + const int step = 8; + if (res_x < step / step_) return; const int res_xsize = get_global_size(0); const int res_ysize = get_global_size(1); - int pos_x = (res_x - (8 / step)) * step; - int pos_y = res_y * step; + int x = (res_x - (step / step_)) * step_; + int y = res_y * step_; - if (pos_x + 8 >= xsize) return; - if (pos_y + 8 >= ysize) return; + if (x + step >= xsize) return; + if (y + step >= ysize) return; - int ix = pos_y * xsize + pos_x; + int ix = y * xsize + x; double diff[4][3]; __global const float* blurred0[3] = { r, g, b }; @@ -423,7 +424,7 @@ __kernel void clEdgeDetectorLowFreqEx( ((blurred1[i][ix] - blurred0[i][ix]) + (blurred0[i][ix2] - blurred1[i][ix2])); ix2 = ix + 6 * xsize - 6; - 
diff[3][i] = pos_x < 8 ? 0 : + diff[3][i] = x < step ? 0 : ((blurred1[i][ix] - blurred0[i][ix]) + (blurred0[i][ix2] - blurred1[i][ix2])); } diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index 0a05b038..b3203fe9 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -96,6 +96,7 @@ namespace guetzli double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); +/* if (g_checkOpenCL) { channel_info mayout_channel[3]; @@ -118,12 +119,12 @@ namespace guetzli height_, factor_x_); - if (err != err2) + if (fabs(err - err2) > 0.001) { LogError("CompareBlock miss %s(%d) \r\n", __FUNCTION__, __LINE__); } } - +*/ return err; } } diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 594adeec..73a8d022 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -203,7 +203,10 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) { LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err)); } - if (mem && init) + if (!mem) return NULL; + + // init memory + if (init) { err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL); if (CL_SUCCESS != err) @@ -213,7 +216,21 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) err = clFinish(this->commandQueue); if (CL_SUCCESS != err) { - LogError("Error: allocMem() clFinish return %s.\n", TranslateOpenCLError(err)); + LogError("Error: allocMem() clEnqueueWriteBuffer/clFinish return %s.\n", TranslateOpenCLError(err)); + } + } + else + { + cl_char cc = 0; + err = clEnqueueFillBuffer(this->commandQueue, mem, &cc, sizeof(cc), 0, s / sizeof(cc), 0, NULL, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: allocMem() clEnqueueFillBuffer return %s.\n", TranslateOpenCLError(err)); + } + err = clFinish(this->commandQueue); + if (CL_SUCCESS != err) 
+ { + LogError("Error: allocMem() clEnqueueFillBuffer/clFinish return %s.\n", TranslateOpenCLError(err)); } } diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 2c9811a9..1666d4fa 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -649,7 +649,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co } if (count > 0) { - LogError("CHK %s(%d) %d:%d\r\n", __FUNCTION__, __LINE__, count, check_size); + LogError("CHK %s(%d) %d:%d\r\n", "SelectFrequencyMasking", __LINE__, count, check_size); } } From b47cb8ddc9c6ba699fd04e6b0c2e1ff0ca8bff61 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 31 May 2017 19:18:17 +0800 Subject: [PATCH 115/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0CUDA=E7=BC=96?= =?UTF-8?q?=E8=AF=91=EF=BC=8C=E8=AF=B7=E5=B0=8F=E5=BF=83=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=EF=BC=8C=E6=B2=A1=E5=AE=89=E8=A3=85cuda=E4=BC=9A=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E7=BC=96=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cu | 5 ++ clguetzli/clguetzli_test.cpp | 26 ++++-- clguetzli/ocu.cpp | 79 +++++++++++++++++ clguetzli/ocu.h | 19 +++++ compile.bat | 160 +++++++++++++++++++++++++++++++++++ guetzli.vcxproj | 74 +++++++++------- guetzli.vcxproj.filters | 9 ++ 7 files changed, 334 insertions(+), 38 deletions(-) create mode 100644 clguetzli/clguetzli.cu create mode 100644 clguetzli/ocu.cpp create mode 100644 clguetzli/ocu.h create mode 100644 compile.bat diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu new file mode 100644 index 00000000..b76a81e7 --- /dev/null +++ b/clguetzli/clguetzli.cu @@ -0,0 +1,5 @@ +__global__ void clScaleImageEx(float *img, double scale) +{ + const int i = blockIdx.x; + img[i] *= scale; +} \ No newline at end of file diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index bbfdb970..e98e6369 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -5,6 +5,7 @@ #include 
"clguetzli_test.h" #include "clguetzli.h" #include "ocl.h" +#include "ocu.h" #define FLOAT_COMPARE(a, b, c) floatCompare((a), (b), (c), __FUNCTION__, __LINE__ ) @@ -404,19 +405,26 @@ void tclMinSquareVal(const float *img, size_t square_size, size_t offset, void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length) { - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org); +/* + ocu_args_d_t &ocu = getOcu(); + CUdeviceptr m = ocu.allocMem(length * sizeof(float), result_org); + cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], + cuMemFree(m); + return; +*/ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org); - clScaleImageEx(mem_result_org, length, scale); + clScaleImageEx(mem_result_org, length, scale); - cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); - FLOAT_COMPARE(r_r, result_cmp, length); + FLOAT_COMPARE(r_r, result_cmp, length); - clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL); - clReleaseMemObject(mem_result_org); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL); + clReleaseMemObject(mem_result_org); } // strong todo diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp new file mode 100644 index 00000000..5b5da9b7 --- /dev/null +++ b/clguetzli/ocu.cpp @@ -0,0 +1,79 @@ + +#include +#include "ocu.h" + +ocu_args_d_t& getOcu(void) +{ + static bool bInit = false; + static ocu_args_d_t ocu; + + if (bInit == true) return ocu; + + cuInit(0); + + CUresult r; + CUcontext ctxt; + CUdevice dev = 0; + + cuCtxCreate(&ctxt, 
CU_CTX_SCHED_BLOCKING_SYNC, dev); + + char name[1024]; + int proc_count = 0; + int thread_count = 0; + int cap_major = 0, cap_minor = 0; + cuDeviceGetName(name, sizeof(name), dev); + cuDeviceGetAttribute(&cap_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); + cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); + cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); + cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); + LogError("CUDA Adapter:%s Ver%d.%d (%d x %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); + + CUmodule mod; + + char* source = nullptr; + size_t src_size = 0; + ReadSourceFromFile("clguetzli/clguetzli.cu.ptx30", &source, &src_size); + + CUjit_option jit_options[2]; + void *jit_optvals[2]; + jit_options[0] = CU_JIT_CACHE_MODE; + jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA; + cuModuleLoadDataEx(&mod, source, 1, jit_options, jit_optvals); + + delete[] source; + + cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); + + cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); + cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE); + + cuStreamCreate(&ocu.stream, 0); + + return ocu; +} + +ocu_args_d_t::ocu_args_d_t() +{ + +} + +ocu_args_d_t::~ocu_args_d_t() +{ + +} + +CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init) +{ + CUdeviceptr mem; + cuMemAlloc(&mem, s); + if (init) + { + cuMemcpyHtoDAsync(mem, init, s, this->stream); + } + else + { + cuMemsetD8(mem, 0, s); + } + + return mem; +} \ No newline at end of file diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h new file mode 100644 index 00000000..f33c856f --- /dev/null +++ b/clguetzli/ocu.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include "ocl.h" + +struct ocu_args_d_t; + +ocu_args_d_t& getOcu(void); + +struct ocu_args_d_t +{ + ocu_args_d_t(); + ~ocu_args_d_t(); + + CUdeviceptr allocMem(size_t s, const void *init); + 
+ CUfunction kernel[KERNEL_COUNT]; + CUstream stream; +}; \ No newline at end of file diff --git a/compile.bat b/compile.bat new file mode 100644 index 00000000..b27c9e49 --- /dev/null +++ b/compile.bat @@ -0,0 +1,160 @@ +@if "%1" == "" goto start +@setlocal +@set userinput=%1 +@if not "%1"=="store" @if not "%1"=="8.1" @if not "%userinput:~0,3%"=="10." goto usage +@endlocal + +:start +@call :GetVSCommonToolsDir +@if "%VS140COMNTOOLS%"=="" goto error_no_VS140COMNTOOLSDIR + +@call "%VS140COMNTOOLS%VCVarsQueryRegistry.bat" No32bit 64bit %1 %2 + +@if "%VSINSTALLDIR%"=="" goto error_no_VSINSTALLDIR +@if "%VCINSTALLDIR%"=="" goto error_no_VCINSTALLDIR +@if "%FrameworkDir64%"=="" goto error_no_FrameworkDIR64 +@if "%FrameworkVersion64%"=="" goto error_no_FrameworkVer64 +@if "%Framework40Version%"=="" goto error_no_Framework40Version + +@set FrameworkDir=%FrameworkDir64% +@set FrameworkVersion=%FrameworkVersion64% + +@if not "%WindowsSDK_ExecutablePath_x64%" == "" @set PATH=%WindowsSDK_ExecutablePath_x64%;%PATH% + +@rem +@rem Set Windows SDK include/lib path +@rem +@if not "%WindowsSdkDir%" == "" @set PATH=%WindowsSdkDir%bin\x64;%WindowsSdkDir%bin\x86;%PATH% +@if not "%WindowsSdkDir%" == "" @set INCLUDE=%WindowsSdkDir%include\%WindowsSDKVersion%shared;%WindowsSdkDir%include\%WindowsSDKVersion%um;%WindowsSdkDir%include\%WindowsSDKVersion%winrt;%INCLUDE% +@if not "%WindowsSdkDir%" == "" @set LIB=%WindowsSdkDir%lib\%WindowsSDKLibVersion%um\x64;%LIB% +@if not "%WindowsSdkDir%" == "" @set LIBPATH=%WindowsLibPath%;%ExtensionSDKDir%\Microsoft.VCLibs\14.0\References\CommonConfiguration\neutral;%LIBPATH% + +@REM Set NETFXSDK include/lib path +@if not "%NETFXSDKDir%" == "" @set INCLUDE=%NETFXSDKDir%include\um;%INCLUDE% +@if not "%NETFXSDKDir%" == "" @set LIB=%NETFXSDKDir%lib\um\x64;%LIB% + +@rem +@rem Set UniversalCRT include/lib path, the default is the latest installed version. 
+@rem +@if not "%UCRTVersion%" == "" @set INCLUDE=%UniversalCRTSdkDir%include\%UCRTVersion%\ucrt;%INCLUDE% +@if not "%UCRTVersion%" == "" @set LIB=%UniversalCRTSdkDir%lib\%UCRTVersion%\ucrt\x64;%LIB% + +@rem PATH +@rem ---- +@if exist "%VSINSTALLDIR%Team Tools\Performance Tools\x64" @set PATH=%VSINSTALLDIR%Team Tools\Performance Tools\x64;%VSINSTALLDIR%Team Tools\Performance Tools;%PATH% + +@if exist "%ProgramFiles%\HTML Help Workshop" set PATH=%ProgramFiles%\HTML Help Workshop;%PATH% +@if exist "%ProgramFiles(x86)%\HTML Help Workshop" set PATH=%ProgramFiles(x86)%\HTML Help Workshop;%PATH% +@if exist "%VSINSTALLDIR%Common7\Tools" set PATH=%VSINSTALLDIR%Common7\Tools;%PATH% +@if exist "%VSINSTALLDIR%Common7\IDE" set PATH=%VSINSTALLDIR%Common7\IDE;%PATH% +@if exist "%VCINSTALLDIR%VCPackages" set PATH=%VCINSTALLDIR%VCPackages;%PATH% +@if exist "%FrameworkDir%\%Framework40Version%" set PATH=%FrameworkDir%\%Framework40Version%;%PATH% +@if exist "%FrameworkDir%\%FrameworkVersion%" set PATH=%FrameworkDir%\%FrameworkVersion%;%PATH% +@if exist "%VCINSTALLDIR%BIN\amd64" set PATH=%VCINSTALLDIR%BIN\amd64;%PATH% + +@rem Add path to MSBuild Binaries +@if exist "%ProgramFiles%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles%\MSBuild\14.0\bin\amd64;%PATH% +@if exist "%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64;%PATH% + +@if exist "%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow" @set PATH=%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow;%PATH% + +@rem INCLUDE +@rem ------- +@if exist "%VCINSTALLDIR%ATLMFC\INCLUDE" set INCLUDE=%VCINSTALLDIR%ATLMFC\INCLUDE;%INCLUDE% +@if exist "%VCINSTALLDIR%INCLUDE" set INCLUDE=%VCINSTALLDIR%INCLUDE;%INCLUDE% + +@rem LIB +@rem --- +@if "%1" == "store" goto setstorelib +@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIB=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIB% +@if exist "%VCINSTALLDIR%LIB\amd64" set LIB=%VCINSTALLDIR%LIB\amd64;%LIB% +@goto setlibpath +:setstorelib +@if 
exist "%VCINSTALLDIR%LIB\store\amd64" set LIB=%VCINSTALLDIR%LIB\store\amd64;%LIB% + +:setlibpath +@rem LIBPATH +@rem ------- +@if "%1" == "store" goto setstorelibpath +@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIBPATH=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIBPATH% +@if exist "%VCINSTALLDIR%LIB\amd64" set LIBPATH=%VCINSTALLDIR%LIB\amd64;%LIBPATH% +@goto appendlibpath +:setstorelibpath +@if exist "%VCINSTALLDIR%LIB\store\amd64" set LIBPATH=%VCINSTALLDIR%LIB\store\amd64;%VCINSTALLDIR%LIB\store\references;%LIBPATH% +:appendlibpath +@if exist "%FrameworkDir%\%Framework40Version%" set LIBPATH=%FrameworkDir%\%Framework40Version%;%LIBPATH% +@if exist "%FrameworkDir%\%FrameworkVersion%" set LIBPATH=%FrameworkDir%\%FrameworkVersion%;%LIBPATH% + +@set Platform=X64 +@set CommandPromptType=Native + +@goto end + +@REM ----------------------------------------------------------------------- +:GetVSCommonToolsDir +@set VS140COMNTOOLS= +@call :GetVSCommonToolsDirHelper32 HKLM > nul 2>&1 +@if errorlevel 1 call :GetVSCommonToolsDirHelper32 HKCU > nul 2>&1 +@if errorlevel 1 call :GetVSCommonToolsDirHelper64 HKLM > nul 2>&1 +@if errorlevel 1 call :GetVSCommonToolsDirHelper64 HKCU > nul 2>&1 +@exit /B 0 + +:GetVSCommonToolsDirHelper32 +@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO ( + @if "%%i"=="14.0" ( + @SET VS140COMNTOOLS=%%k + ) +) +@if "%VS140COMNTOOLS%"=="" exit /B 1 +@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\ +@exit /B 0 + +:GetVSCommonToolsDirHelper64 +@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Wow6432Node\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO ( + @if "%%i"=="14.0" ( + @SET VS140COMNTOOLS=%%k + ) +) +@if "%VS140COMNTOOLS%"=="" exit /B 1 +@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\ +@exit /B 0 + +@REM ----------------------------------------------------------------------- +:error_no_VS140COMNTOOLSDIR +@echo ERROR: Cannot determine the location of the VS Common Tools folder. 
+@goto end + +:error_no_VSINSTALLDIR +@echo ERROR: Cannot determine the location of the VS installation. +@goto end + +:error_no_VCINSTALLDIR +@echo ERROR: Cannot determine the location of the VC installation. +@goto end + +:error_no_FrameworkDIR64 +@echo ERROR: Cannot determine the location of the .NET Framework 64bit installation. +@goto end + +:error_no_FrameworkVer64 +@echo ERROR: Cannot determine the version of the .NET Framework 64bit installation. +@goto end + +:error_no_Framework40Version +@echo ERROR: Cannot determine the .NET Framework 4.0 version. +@goto end + +:usage +echo Error in script usage. The correct usage is: +echo %0 +echo or +echo %0 store +echo or +echo %0 10.0.10240.0 +echo or +echo %0 store 10.0.10240.0 + +:end + + +nvcc -Xcompiler "/wd 4819" -arch=sm_30 -ptx -o clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu \ No newline at end of file diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 42a13971..7f3b26ca 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -99,24 +99,24 @@ $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) - - NotUsing - Level3 - .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - Full - true - true + + NotUsing + Level3 + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + Full + true + true false true ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions) - Console - true - true - OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + Console + true + true + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup - $(INTELOCLSDKROOT)lib\x64 + $(CUDA_PATH)\lib\x64 "$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" 
-output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current -bo=" " @@ -128,16 +128,19 @@ false + + + + - - + compile.bat NotUsing Level3 - .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) MaxSpeed true false @@ -149,38 +152,40 @@ Console true true - shlwapi.lib;OpenCL.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup __tcmalloc - $(INTELOCLSDKROOT)lib\x86 + $(CUDA_PATH)\lib\Win32 + + compile.bat + - - NotUsing - Level3 - .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + + NotUsing + Level3 + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions) Console - true - OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + true + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup - $(INTELOCLSDKROOT)lib\x64 + $(CUDA_PATH)\lib\x64 - - + compile.bat NotUsing Level3 - .;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + 
.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) @@ -188,11 +193,14 @@ Console true - shlwapi.lib;OpenCL.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup __tcmalloc - $(INTELOCLSDKROOT)lib\x86 + $(CUDA_PATH)\lib\Win32 + + compile.bat + @@ -200,6 +208,7 @@ + @@ -297,6 +306,7 @@ + @@ -388,6 +398,12 @@ + + Document + true + CUDA Code Builder + $(ProjectDir)clguetzli\compile.bat + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index fc895c38..bfdedbe0 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -312,6 +312,9 @@ clguetzli + + clguetzli + @@ -581,6 +584,9 @@ clguetzli + + clguetzli + @@ -598,6 +604,9 @@ third_party\zlib + + clguetzli + From 99631b87fea40e3e5faf6ef2dac39845b643a98c Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 1 Jun 2017 10:00:43 +0800 Subject: [PATCH 116/189] support cuda opt --- clguetzli/clbutter_comparator.cpp | 9 ++++++++- clguetzli/clguetzli.cpp | 22 ++++++++++++++++++++++ clguetzli/clguetzli.cu | 21 ++++++++++++++++++++- clguetzli/clguetzli.h | 3 +++ guetzli/guetzli.cc | 4 ++++ 5 files changed, 57 insertions(+), 2 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index c6b4ca0b..fd31632d 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -229,7 +229,14 @@ namespace butteraugli result_org = *result; } - _ScaleImage(scale, result); + if (g_useCuda) + { + cuScaleImage(result->data(), result->size(), scale); + } + else + { + _ScaleImage(scale, result); + } if (g_checkOpenCL && result->size() > 64) { diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index b58f7dc8..77aa68bb 100644 --- a/clguetzli/clguetzli.cpp +++ 
b/clguetzli/clguetzli.cpp @@ -2,8 +2,10 @@ #include #include #include "clguetzli.h" +#include "ocu.h" extern bool g_useOpenCL = false; +extern bool g_useCuda = false; extern bool g_checkOpenCL = false; ocl_args_d_t& getOcl(void) @@ -1225,3 +1227,23 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si clReleaseMemObject(blurred); } +void cuScaleImage(float *img, size_t length, double scale) +{ + ocu_args_d_t &ocu = getOcu(); + CUdeviceptr m = ocu.allocMem(length * sizeof(float), img); + + void *args[2] = { &m, &scale}; + + CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], + 1, 1, 1, + length, 1, 1, + 0, + ocu.stream, args, NULL); + + r = cuStreamSynchronize(ocu.stream); + + cuMemcpyDtoH(img, m, length * sizeof(float)); + + cuMemFree(m); + return; +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu index b76a81e7..3ed6d05e 100644 --- a/clguetzli/clguetzli.cu +++ b/clguetzli/clguetzli.cu @@ -1,5 +1,24 @@ +#ifdef __CUDACC__ +//#ifdef __OPENCL_VERSION__ +__device__ int get_global_id(int dim) +{ + switch (dim) + { + case 0: + return threadIdx.x; + case 1: + return threadIdx.y; + case 2: + return threadIdx.z; + default: + return threadIdx.x; + } +} +#endif + + __global__ void clScaleImageEx(float *img, double scale) { - const int i = blockIdx.x; + const int i = get_global_id(0); img[i] *= scale; } \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index b5997fcd..760677fd 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -7,6 +7,7 @@ #include "clguetzli.cl.h" extern bool g_useOpenCL; +extern bool g_useCuda; extern bool g_checkOpenCL; void clOpsinDynamicsImage( @@ -134,6 +135,8 @@ void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); +void cuScaleImage(float *img, size_t length, double scale); + class 
guetzli::OutputImage; namespace guetzli { diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 40544d90..d8937978 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -227,6 +227,7 @@ void Usage() { " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. Default limit is %d MB.\n" " --opencl - Use OpenCL\n" + " --cuda - Use CUDA\n" " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } @@ -262,6 +263,9 @@ int main(int argc, char** argv) { else if (!strcmp(argv[opt_idx], "--opencl")) { g_useOpenCL = true; } + else if (!strcmp(argv[opt_idx], "--cuda")) { + g_useCuda = true; + } else if (!strcmp(argv[opt_idx], "--checkcl")) { g_checkOpenCL = true; } From ef025dab8c43b3200cd616508efdd8cb817a73a2 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 1 Jun 2017 14:37:37 +0800 Subject: [PATCH 117/189] =?UTF-8?q?=E8=BF=90=E8=A1=8C=E6=9C=9F=E7=BC=96?= =?UTF-8?q?=E8=AF=91.cu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl.cpp | 2 +- clguetzli/clguetzli.cl.h | 15 ++++++++-- clguetzli/clguetzli.cpp | 6 ++-- clguetzli/clguetzli.cu | 8 ++++-- clguetzli/clguetzli_test.cpp | 7 ----- clguetzli/ocu.cpp | 54 ++++++++++++++++++++++++++---------- clguetzli/ocu.h | 3 ++ compile.bat | 2 +- guetzli.vcxproj | 18 ++++-------- 9 files changed, 71 insertions(+), 44 deletions(-) diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index b3203fe9..cafb0bf7 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -22,7 +22,7 @@ void set_global_size(int dim, int size){ g_sizevec[dim] = size; } -#define __opencl +#define __checkcl #define abs(exper) fabs((exper)) #include "clguetzli.h" #include "clguetzli.cl" diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 35b4ed3c..cf4d9212 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -2,6 +2,7 @@ #define 
__CLGUETZLI_CL_H__ #ifdef __cplusplus +#ifndef __CUDACC__ #define __kernel #define __private #define __global @@ -14,7 +15,7 @@ void set_global_id(int dim, int id); void set_global_size(int dim, int size); - #ifdef __opencl + #ifdef __checkcl typedef union ocl_channels_t { struct @@ -49,7 +50,10 @@ }; }ocl_channels; #endif -#else /*__cplusplus*/ +#endif +#endif /*__cplusplus*/ + +#ifdef __OPENCL_VERSION__ typedef union ocl_channels_t { struct @@ -65,7 +69,12 @@ }; }ocl_channels; -#endif /*__cplusplus*/ +#endif /*__OPENCL_VERSION__*/ + +#ifdef __CUDACC__ + #define __global + typedef unsigned short ushort; +#endif /*__CUDACC__*/ typedef short coeff_t; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 77aa68bb..0606793b 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1,7 +1,7 @@ +#include "clguetzli.h" #include #include #include -#include "clguetzli.h" #include "ocu.h" extern bool g_useOpenCL = false; @@ -1235,14 +1235,14 @@ void cuScaleImage(float *img, size_t length, double scale) void *args[2] = { &m, &scale}; CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], + length, 1, 1, 1, 1, 1, - length, 1, 1, 0, ocu.stream, args, NULL); r = cuStreamSynchronize(ocu.stream); - cuMemcpyDtoH(img, m, length * sizeof(float)); + r = cuMemcpyDtoH(img, m, length * sizeof(float)); cuMemFree(m); return; diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu index 3ed6d05e..17b65143 100644 --- a/clguetzli/clguetzli.cu +++ b/clguetzli/clguetzli.cu @@ -1,3 +1,5 @@ +#include "clguetzli\clguetzli.cl.h" + #ifdef __CUDACC__ //#ifdef __OPENCL_VERSION__ __device__ int get_global_id(int dim) @@ -17,8 +19,8 @@ __device__ int get_global_id(int dim) #endif -__global__ void clScaleImageEx(float *img, double scale) +extern "C" __global__ void clScaleImageEx(float * img, double scale) { const int i = get_global_id(0); - img[i] *= scale; -} \ No newline at end of file + img[i] = 0.0001; +} diff --git a/clguetzli/clguetzli_test.cpp 
b/clguetzli/clguetzli_test.cpp index e98e6369..15c1317b 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -405,13 +405,6 @@ void tclMinSquareVal(const float *img, size_t square_size, size_t offset, void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length) { -/* - ocu_args_d_t &ocu = getOcu(); - CUdeviceptr m = ocu.allocMem(length * sizeof(float), result_org); - cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], - cuMemFree(m); - return; -*/ cl_int err = 0; ocl_args_d_t &ocl = getOcl(); cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org); diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 5b5da9b7..7846afcb 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -1,5 +1,5 @@ - #include +#include #include "ocu.h" ocu_args_d_t& getOcu(void) @@ -9,13 +9,12 @@ ocu_args_d_t& getOcu(void) if (bInit == true) return ocu; - cuInit(0); - - CUresult r; - CUcontext ctxt; + CUresult r = cuInit(0); CUdevice dev = 0; + CUcontext ctxt; + CUstream stream; - cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev); + r = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev); char name[1024]; int proc_count = 0; @@ -26,28 +25,53 @@ ocu_args_d_t& getOcu(void) cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); - LogError("CUDA Adapter:%s Ver%d.%d (%d x %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); - - CUmodule mod; + LogError("CUDA Adapter:%s Ver%d.%d MP %d Core %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); char* source = nullptr; size_t src_size = 0; - ReadSourceFromFile("clguetzli/clguetzli.cu.ptx30", &source, &src_size); + ReadSourceFromFile("clguetzli/clguetzli.cu", &source, &src_size); + + nvrtcProgram prog; + const char *opts[] = { "-arch=compute_30", 
"--fmad=false" }; + nvrtcCreateProgram(&prog, source, "clguetzli.cu", 0, NULL, NULL); + nvrtcCompileProgram(prog, 2, opts); + // Obtain compilation log from the program. + size_t logSize = 0; + nvrtcGetProgramLogSize(prog, &logSize); + char *log = new char[logSize]; + nvrtcGetProgramLog(prog, log); + + // Obtain PTX from the program. + size_t ptxSize = 0; + nvrtcGetPTXSize(prog, &ptxSize); + char *ptx = new char[ptxSize]; + nvrtcGetPTX(prog, ptx); + + LogError("BuildInfo:\r\n%s\r\n", log); + + CUmodule mod; CUjit_option jit_options[2]; void *jit_optvals[2]; jit_options[0] = CU_JIT_CACHE_MODE; jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA; - cuModuleLoadDataEx(&mod, source, 1, jit_options, jit_optvals); + r = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals); delete[] source; + delete[] log; + delete[] ptx; - cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE); - cuStreamCreate(&ocu.stream, 0); + cuStreamCreate(&stream, 0); + + ocu.dev = dev; + ocu.stream = stream; + ocu.mod = mod; + ocu.ctxt = ctxt; return ocu; } @@ -59,7 +83,9 @@ ocu_args_d_t::ocu_args_d_t() ocu_args_d_t::~ocu_args_d_t() { - + cuModuleUnload(mod); + cuCtxDestroy(ctxt); + cuStreamDestroy(stream); } CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init) diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h index f33c856f..0ab97945 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -16,4 +16,7 @@ struct ocu_args_d_t CUfunction kernel[KERNEL_COUNT]; CUstream stream; + CUmodule mod; + CUcontext ctxt; + CUdevice dev; }; \ No newline at end of file diff --git a/compile.bat b/compile.bat index b27c9e49..05cdd472 100644 --- a/compile.bat +++ b/compile.bat @@ -157,4 +157,4 @@ echo %0 store 10.0.10240.0 :end -nvcc -Xcompiler "/wd 4819" -arch=sm_30 -ptx -o 
clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu \ No newline at end of file +nvcc -Xcompiler "/wd 4819" -I"./" -arch=sm_30 -ptx -o clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu \ No newline at end of file diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 7f3b26ca..e31abaff 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -114,7 +114,7 @@ Console true true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64 @@ -132,9 +132,7 @@ - - compile.bat - + @@ -173,13 +171,11 @@ Console true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64 - - compile.bat - + @@ -193,14 +189,12 @@ Console true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup __tcmalloc $(CUDA_PATH)\lib\Win32 - - compile.bat - + From a8bcf1f2df768730ce378f239da0ffb22cdc2512 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 1 Jun 2017 19:24:49 +0800 Subject: [PATCH 118/189] =?UTF-8?q?=E5=85=BC=E5=AE=B9CUDA=E7=BC=96?= =?UTF-8?q?=E8=AF=91=EF=BC=8C=E7=BC=96=E8=AF=91=E5=99=A8=E8=AF=AD=E6=B3=95?= =?UTF-8?q?=E6=A3=80=E6=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 141 ++++++++++++++++++++------------------- clguetzli/clguetzli.cl.h | 19 +++++- clguetzli/clguetzli.cpp | 4 +- clguetzli/clguetzli.cu | 11 +-- clguetzli/ocu.cpp | 19 ++++++ compile.bat | 3 +- guetzli.vcxproj | 18 +++-- guetzli.vcxproj.filters | 8 ++- 8 files changed, 132 insertions(+), 91 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index cf7bca3e..ec04630d 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -22,21 +22,21 @@ typedef struct __IntFloatPairList IntFloatPair *pData; 
}IntFloatPairList; -void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); -double InterpolateClampNegative(__global const double *array, int size, double sx); -void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, +__device__ void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); +__device__ double InterpolateClampNegative(__global const double *array, int size, double sx); +__device__ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, double r1, double g1, double b1, double factor, double res[3]); -double DotProduct(__global const float u[3], const double v[3]); -void OpsinAbsorbance(const double in[3], double out[3]); -void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz); -double Gamma(double v); -void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], +__device__ double DotProduct(__global const float u[3], const double v[3]); +__device__ void OpsinAbsorbance(const double in[3], double out[3]); +__device__ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz); +__device__ double Gamma(double v); +__device__ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], __private double xyb1[3 * kBlockSize], double diff_xyb_dc[3], double diff_xyb_ac[3], double diff_xyb_edge_dc[3]); -void Butteraugli8x8CornerEdgeDetectorDiff( +__device__ void Butteraugli8x8CornerEdgeDetectorDiff( int pos_x, int pos_y, int xsize, @@ -45,9 +45,9 @@ void Butteraugli8x8CornerEdgeDetectorDiff( __global const float *r2, __global const float* g2, __global const float *b2, double* diff_xyb); -int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order); +__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order); -double CompareBlockFactor(const channel_info mayout_channel[3], +__device__ double 
CompareBlockFactor(const channel_info mayout_channel[3], const coeff_t* candidate_block, const int block_x, const int block_y, @@ -57,11 +57,11 @@ double CompareBlockFactor(const channel_info mayout_channel[3], const int image_height, const int factor); -void floatcopy(float *dst, const float *src, int size); -void coeffcopy(coeff_t *dst, const coeff_t *src, int size); -void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size); -int list_erase(IntFloatPairList* list, int idx); -int list_push_back(IntFloatPairList* list, int i, float f); +__device__ void floatcopy(float *dst, const float *src, int size); +__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size); +__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size); +__device__ int list_erase(IntFloatPairList* list, int idx); +__device__ int list_push_back(IntFloatPairList* list, int i, float f); __kernel void clConvolutionEx( __global float* result, @@ -842,7 +842,7 @@ __kernel void clComputeBlockZeroingOrderEx( } } -void Butteraugli8x8CornerEdgeDetectorDiff( +__device__ void Butteraugli8x8CornerEdgeDetectorDiff( int pos_x, int pos_y, int xsize, @@ -898,11 +898,11 @@ void Butteraugli8x8CornerEdgeDetectorDiff( } } -double DotProduct(__global const float u[3], const double v[3]) { +__device__ double DotProduct(__global const float u[3], const double v[3]) { return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; } -double Interpolate(__constant const double *array, const int size, const double sx) { +__device__ double Interpolate(__constant_ex const double *array, const int size, const double sx) { double ix = fabs(sx); int baseix = (int)(ix); @@ -971,7 +971,7 @@ __constant double XybToVals_lut_y[21] = { XybToVals_off_y + 19 * XybToVals_inc_y, }; -void XybToVals( +__device__ void XybToVals( double x, double y, double z, double *valx, double *valy, double *valz) { @@ -1009,7 +1009,7 @@ __constant double XybLowFreqToVals_lut[21] = { 20 * XybLowFreqToVals_inc, }; -void 
XybLowFreqToVals(double x, double y, double z, +__device__ void XybLowFreqToVals(double x, double y, double z, double *valx, double *valy, double *valz) { const double xmul = 6.64482198135; const double ymul = 0.837846224276; @@ -1022,7 +1022,7 @@ void XybLowFreqToVals(double x, double y, double z, *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul); } -double InterpolateClampNegative(__global const double *array, +__device__ double InterpolateClampNegative(__global const double *array, int size, double sx) { if (sx < 0) { sx = 0; @@ -1041,7 +1041,7 @@ double InterpolateClampNegative(__global const double *array, return res; } -void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, +__device__ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, double r1, double g1, double b1, double factor, double res[3]) { double valx0, valy0, valz0; @@ -1072,7 +1072,7 @@ typedef struct __Complex }Complex; __constant double kSqrtHalf = 0.70710678118654752440084436210484903; -void RealFFT8(const double* in, Complex* out) { +__device__ void RealFFT8(const double* in, Complex* out) { double t1, t2, t3, t5, t6, t7, t8; t8 = in[6]; t5 = in[2] - t8; @@ -1145,7 +1145,7 @@ void RealFFT8(const double* in, Complex* out) { out[6] = tmp; } -void TransposeBlock(Complex data[kBlockSize]) { +__device__ void TransposeBlock(Complex data[kBlockSize]) { for (int i = 0; i < kBlockEdge; i++) { for (int j = 0; j < i; j++) { Complex tmp = data[kBlockEdge * i + j]; @@ -1156,7 +1156,7 @@ void TransposeBlock(Complex data[kBlockSize]) { } // D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements. -inline void FFT4(Complex* a) { +__device__ inline void FFT4(Complex* a) { double t1, t2, t3, t4, t5, t6, t7, t8; t5 = a[2].real; t1 = a[0].real - t5; @@ -1186,7 +1186,7 @@ inline void FFT4(Complex* a) { } // D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. 
-void FFT8(Complex* a) { +__device__ void FFT8(Complex* a) { const double kSqrtHalf = 0.70710678118654752440084436210484903; double t1, t2, t3, t4, t5, t6, t7, t8; @@ -1280,11 +1280,11 @@ void FFT8(Complex* a) { a[6] = tmp; } -double abssq(const Complex c) { +__device__ double abssq(const Complex c) { return c.real * c.real + c.imag * c.imag; } -void ButteraugliFFTSquared(__private double block[kBlockSize]) { +__device__ void ButteraugliFFTSquared(__private double block[kBlockSize]) { double global_mul = 0.000064; Complex block_c[kBlockSize]; @@ -1309,7 +1309,7 @@ void ButteraugliFFTSquared(__private double block[kBlockSize]) { } } -double RemoveRangeAroundZero(double v, double range) { +__device__ double RemoveRangeAroundZero(double v, double range) { if (v >= -range && v < range) { return 0; } @@ -1390,7 +1390,7 @@ __constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared // 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average // diff on the edges to diff_xyb_edge_dc. 
-void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], +__device__ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], __private double xyb1[3 * kBlockSize], double diff_xyb_dc[3], double diff_xyb_ac[3], @@ -1488,14 +1488,14 @@ __constant static float g_mix[12] = { 10.6524069248, }; -void OpsinAbsorbance(const double in[3], double out[3]) +__device__ void OpsinAbsorbance(const double in[3], double out[3]) { out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3]; out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7]; out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; } -double EvaluatePolynomial(const double x, __constant const double *coefficients, int n) +__device__ double EvaluatePolynomial(const double x, __constant_ex const double *coefficients, int n) { double b1 = 0.0; double b2 = 0.0; @@ -1526,7 +1526,7 @@ static __constant double g_gamma_q[5 + 1] = { 4.711532733641639, 0.899112889751053, 0.035662329617191, }; -double Gamma(double v) +__device__ double Gamma(double v) { const double min_value = 0.770000000000000; const double max_value = 274.579999999999984; @@ -1539,7 +1539,7 @@ double Gamma(double v) return (float)(yp / yq); } -void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) +__device__ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) { const double a0 = 1.01611726948; const double a1 = 0.982482243696; @@ -1552,7 +1552,7 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double * // chrisk todo // return size -int list_push_back(IntFloatPairList* list, int i, float f) +__device__ int list_push_back(IntFloatPairList* list, int i, float f) { list->pData[list->size].idx = i; list->pData[list->size].err = f; @@ -1561,7 +1561,7 @@ int list_push_back(IntFloatPairList* list, int i, float f) // chrisk todo // remove idx and return size -int list_erase(IntFloatPairList* list, 
int idx) +__device__ int list_erase(IntFloatPairList* list, int idx) { for (int i = idx; i < list->size - 1; i++) { @@ -1572,7 +1572,7 @@ int list_erase(IntFloatPairList* list, int idx) } // chrisk todo -int SortInputOrder(DCTScoreData* input_order, int size) +__device__ int SortInputOrder(DCTScoreData* input_order, int size) { int i, j; DCTScoreData tmp; @@ -1984,7 +1984,7 @@ __constant static float bias[192] = { // chrisk todo // return the count of Non-zero item -int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size) +__device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size) { int size = 0; for (int c = 0; c < 3; ++c) { @@ -2011,7 +2011,7 @@ __constant static int kIDCTMatrix[kDCTBlockSize] = { }; // Computes out[x] = sum{kIDCTMatrix[8*x+u]*in[u*stride]; for u in [0..7]} -void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { +__device__ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { int tmp0, tmp1, tmp2, tmp3, tmp4; tmp1 = kIDCTMatrix[0] * in[0]; @@ -2109,7 +2109,7 @@ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { out[7] -= tmp1; } -void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) +__device__ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) { coeff_t colidcts[kDCTBlockSize]; const int kColScale = 11; @@ -2136,7 +2136,7 @@ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) } } -void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) +__device__ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) { const int block_x = 0; const int block_y = 0; @@ -2154,7 +2154,7 @@ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) } } -void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, 
int block_y, int width_, int height_) +__device__ void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) { // Fill in the 10x10 pixel area in the subsampled image that will be the // basis of the upsampling. This area is enough to hold the 3x3 kernel of @@ -2223,7 +2223,7 @@ void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __glo } // out = [YUVYUV....YUVYUV] -void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/) +__device__ void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/) { const int stride = 3; @@ -2423,9 +2423,10 @@ __constant static uchar kRangeLimitLut[4 * 256] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }; -void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/) +__device__ void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/) { - __constant uchar* kRangeLimit = kRangeLimitLut + 384; + __constant_ex uchar* kRangeLimit = kRangeLimitLut + 384; + for (int i = 0; i < size; i++) { uchar *pixel = &pixelBlock[i * 3]; @@ -2698,12 +2699,12 @@ __constant static double kSrgb8ToLinearTable[256] = { 255.000000, }; - -void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/) +__device__ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/) { YUVToRGB(yuv, xsize * ysize); - const __constant double* lut = kSrgb8ToLinearTable; +#define lut kSrgb8ToLinearTable +// const __constant double* lut = kSrgb8ToLinearTable; for (int i = 0; i < xsize * ysize; i++) { @@ -2731,11 +2732,11 @@ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, in b[y * xsize + x] = b[idx]; } 
} +#undef lut } - // chrisk todo -void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) +__device__ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) { uchar idct[3][8 * 8]; CoeffToIDCT(&block[0], idct[0]); @@ -2782,7 +2783,7 @@ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8* } } -void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +__device__ void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) { uchar idct[8 * 8]; CoeffToIDCT(&block[0], &idct[0]); @@ -2793,7 +2794,7 @@ void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global PixelToYUV(pixels, yuv, 16, 16); } -void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +__device__ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) { coeff_t b[8 * 8]; for (int i = 0; i < 8 * 8; i++) @@ -2803,7 +2804,7 @@ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global CoeffToYUV16x16(b, yuv, pixel_orig, block_x, block_y, width_, height_); } -void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv) +__device__ void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv) { uchar idct[8 * 8]; CoeffToIDCT(&block[0], &idct[0]); @@ -2814,7 +2815,7 @@ void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv) PixelToYUV(pixels, yuv, 8, 8); } -void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv) +__device__ void CoeffToYUV8x8_g(__global const coeff_t 
block[8 * 8], uchar *yuv) { coeff_t b[8 * 8]; for (int i = 0; i < 8 * 8; i++) @@ -2825,7 +2826,7 @@ void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv) CoeffToYUV8x8(b, yuv); } -void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y) +__device__ void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y) { for (int y = 0; y < 8; y++) { @@ -2838,7 +2839,7 @@ void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], } } -void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y) +__device__ void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y) { for (int y = 0; y < 8; y++) { @@ -2851,7 +2852,7 @@ void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], } } -void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y) +__device__ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y) { for (int y = 0; y < 8; y++) { @@ -2866,7 +2867,7 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float } } -void Convolution(size_t xsize, size_t ysize, +__device__ void Convolution(size_t xsize, size_t ysize, int xstep, int len, int offset, const float* multipliers, const float* inp, @@ -2900,7 +2901,7 @@ void Convolution(size_t xsize, size_t ysize, // ian todo // ¼ÆËã½á¹ûÊä³öµ½output -void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) +__device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) { // ²Î¿¼clBlurEx2µÄʵÏÖ£¬sigma = 1.1£¬Õâʱstep¡¢diff¶¼½«ÌØ»¯Îª¹Ì¶¨Öµ const double sigma = 1.1; @@ -2925,7 +2926,7 @@ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double 
border_r } // ian todo -void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b, +__device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b, __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred, int size) { @@ -2955,7 +2956,7 @@ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private f } // chrisk todo -void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, +__device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, float *xyb1_x, float *xyb1_y, float *xyb1_b, const float *c0_x, const float *c0_y, const float *c0_b, const float *c1_x, const float *c1_y, const float *c1_b, @@ -3014,7 +3015,7 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, } } -void floatcopy(float *dst, const float *src, int size) +__device__ void floatcopy(float *dst, const float *src, int size) { for (int i = 0; i < size; i++) { @@ -3022,7 +3023,7 @@ void floatcopy(float *dst, const float *src, int size) } } -void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size) +__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size) { for (int i = 0; i < size; i++) { @@ -3030,7 +3031,7 @@ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size) } } -void coeffcopy(coeff_t *dst, const coeff_t *src, int size) +__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size) { for (int i = 0; i < size; i++) { @@ -3038,7 +3039,7 @@ void coeffcopy(coeff_t *dst, const coeff_t *src, int size) } } -void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) +__device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) { float rgb_blurred[3][kDCTBlockSize]; for (int i = 0; i < 3; i++) @@ -3048,7 +3049,7 @@ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) OpsinDynamicsImageBlock(rgb[0], rgb[1], 
rgb[2], rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); } -double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) +__device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) { // CalcOpsinDynamicsImage(rgb0_c); CalcOpsinDynamicsImage(rgb1_c); @@ -3093,7 +3094,7 @@ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private } // return the count of Non-zero item -int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order) +__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order) { const int block_size = 64; int size = 0; @@ -3110,7 +3111,7 @@ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8] return SortInputOrder(input_order->pData, size); } -int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], +__device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], const __global float *orig_image_batch, int width_, int height_, int block_x, int block_y, @@ -3135,7 +3136,7 @@ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], return block_ix; } -double CompareBlockFactor(const channel_info mayout_channel[3], +__device__ double CompareBlockFactor(const channel_info mayout_channel[3], const coeff_t* candidate_block, const int block_x, const int block_y, diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index cf4d9212..8287e341 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -7,6 +7,9 @@ #define __private #define __global #define __constant + #define __constant_ex + #define __device__ + typedef unsigned char uchar; typedef unsigned short ushort; @@ -50,10 +53,12 @@ }; }ocl_channels; #endif -#endif +#endif /*__CUDACC__*/ #endif /*__cplusplus*/ #ifdef 
__OPENCL_VERSION__ + #define __constant_ex __constant + #define __device__ typedef union ocl_channels_t { struct @@ -68,12 +73,22 @@ float *ch[3]; }; }ocl_channels; - #endif /*__OPENCL_VERSION__*/ #ifdef __CUDACC__ + #define __kernel extern "C" __global__ + #define __private #define __global + #define __constant __constant__ + #define __constant_ex + typedef unsigned char uchar; typedef unsigned short ushort; + + __device__ int get_global_id(int dim); + __device__ int get_global_size(int dim); + void set_global_id(int dim, int id); + void set_global_size(int dim, int size); + #endif /*__CUDACC__*/ typedef short coeff_t; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 0606793b..47cf78d1 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1235,8 +1235,8 @@ void cuScaleImage(float *img, size_t length, double scale) void *args[2] = { &m, &scale}; CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], - length, 1, 1, - 1, 1, 1, + 1, 1, 1, + length, 1, 1, 0, ocu.stream, args, NULL); diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu index 17b65143..d8591f5f 100644 --- a/clguetzli/clguetzli.cu +++ b/clguetzli/clguetzli.cu @@ -1,7 +1,5 @@ -#include "clguetzli\clguetzli.cl.h" +#include "clguetzli\clguetzli.cl" -#ifdef __CUDACC__ -//#ifdef __OPENCL_VERSION__ __device__ int get_global_id(int dim) { switch (dim) @@ -16,11 +14,8 @@ __device__ int get_global_id(int dim) return threadIdx.x; } } -#endif - -extern "C" __global__ void clScaleImageEx(float * img, double scale) +__device__ int get_global_size(int dim) { - const int i = get_global_id(0); - img[i] = 0.0001; + return 0; } diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 7846afcb..32578c36 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -48,6 +48,7 @@ ocu_args_d_t& getOcu(void) char *ptx = new char[ptxSize]; nvrtcGetPTX(prog, ptx); + nvrtcDestroyProgram(&prog); LogError("BuildInfo:\r\n%s\r\n", log); CUmodule mod; @@ -61,7 +62,25 @@ ocu_args_d_t& 
getOcu(void) delete[] log; delete[] ptx; + r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx"); r = cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, "clCombineChannelsEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx"); + r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, "clComputeBlockZeroingOrderEx"); cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE); diff --git a/compile.bat b/compile.bat index 05cdd472..8aa9430f 100644 --- a/compile.bat +++ b/compile.bat @@ -156,5 +156,4 @@ echo %0 
store 10.0.10240.0 :end - -nvcc -Xcompiler "/wd 4819" -I"./" -arch=sm_30 -ptx -o clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu \ No newline at end of file +nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine 64 -G -g -ptx -o clguetzli\clguetzli.cu.ptx64 clguetzli\clguetzli.cu \ No newline at end of file diff --git a/guetzli.vcxproj b/guetzli.vcxproj index e31abaff..a9154d35 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -158,6 +158,9 @@ compile.bat + + CUDA CU + @@ -392,12 +395,19 @@ - + Document - true + false CUDA Code Builder - $(ProjectDir)clguetzli\compile.bat - + $(ProjectDir)compile.bat + compile.bat + false + cu.ptx + $(ProjectDir)compile.bat + CUDA Code Builder + cu.ptx + false + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index bfdedbe0..17b8edf4 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -604,13 +604,15 @@ third_party\zlib - - clguetzli - clguetzli + + + clguetzli + + \ No newline at end of file From 4533a020130709fb5690637779f2f28e170f8b97 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 2 Jun 2017 12:24:52 +0800 Subject: [PATCH 119/189] =?UTF-8?q?cuScaleImage=E8=B7=91=E9=80=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl.h | 23 +++++++++++++++++++---- clguetzli/clguetzli.cpp | 26 ++++++++++++++++---------- clguetzli/clguetzli.cu | 3 ++- clguetzli/ocu.cpp | 32 ++++++++++++++++++-------------- guetzli.vcxproj | 5 ++++- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 8287e341..aeabed49 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -84,10 +84,25 @@ typedef unsigned char uchar; typedef unsigned short ushort; - __device__ int get_global_id(int dim); - __device__ int get_global_size(int dim); - void set_global_id(int dim, int id); - void set_global_size(int dim, int size); + __device__ int get_global_id(int 
dim) + { + switch (dim) + { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + default: return blockIdx.z; + } + } + + __device__ int get_global_size(int dim) + { + switch(dim) + { + case 0: return gridDim.x; + case 1: return gridDim.y; + default: return gridDim.z; + } + } #endif /*__CUDACC__*/ diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 47cf78d1..09158f55 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -339,7 +339,7 @@ void clConvolutionEx( } } -void clConvolutionX( +void clConvolutionXEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, @@ -375,7 +375,7 @@ void clConvolutionX( } } -void clConvolutionY( +void clConvolutionYEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, @@ -511,15 +511,15 @@ void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, if (xstep > 1) { ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); clSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); } else { ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? 
result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); } clReleaseMemObject(mem_expn); @@ -1227,6 +1227,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si clReleaseMemObject(blurred); } +////////////////////////////////////////////////////////////////////////////////////// void cuScaleImage(float *img, size_t length, double scale) { ocu_args_d_t &ocu = getOcu(); @@ -1235,10 +1236,10 @@ void cuScaleImage(float *img, size_t length, double scale) void *args[2] = { &m, &scale}; CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], - 1, 1, 1, - length, 1, 1, - 0, - ocu.stream, args, NULL); + length, 1, 1, + 1, 1, 1, + 0, + ocu.stream, args, NULL); r = cuStreamSynchronize(ocu.stream); @@ -1246,4 +1247,9 @@ void cuScaleImage(float *img, size_t length, double scale) cuMemFree(m); return; +} + +void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) +{ + } \ No newline at end of file diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu index d8591f5f..dbca9906 100644 --- a/clguetzli/clguetzli.cu +++ b/clguetzli/clguetzli.cu @@ -1,5 +1,5 @@ #include "clguetzli\clguetzli.cl" - +/* __device__ int get_global_id(int dim) { switch (dim) @@ -19,3 +19,4 @@ __device__ int get_global_size(int dim) { return 0; } +*/ diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 32578c36..40b37225 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -9,6 +9,8 @@ ocu_args_d_t& getOcu(void) if (bInit == true) return ocu; + bInit = true; + CUresult r = cuInit(0); CUdevice dev = 0; CUcontext ctxt; @@ -29,18 +31,24 @@ ocu_args_d_t& getOcu(void) char* source = nullptr; size_t src_size = 0; - ReadSourceFromFile("clguetzli/clguetzli.cu", &source, &src_size); + ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); nvrtcProgram prog; - const char *opts[] = { "-arch=compute_30", "--fmad=false" }; - nvrtcCreateProgram(&prog, source, "clguetzli.cu", 0, NULL, NULL); - 
nvrtcCompileProgram(prog, 2, opts); + const char *opts[] = { "-arch=compute_30", "-default-device", "-G", "-I\"./\"", "--fmad=false" }; + nvrtcCreateProgram(&prog, source, "clguetzli.cl", 0, NULL, NULL); + nvrtcResult compile_result = nvrtcCompileProgram(prog, 3, opts); + if (NVRTC_SUCCESS != compile_result) + { + // Obtain compilation log from the program. + size_t logSize = 0; + nvrtcGetProgramLogSize(prog, &logSize); + char *log = new char[logSize]; + nvrtcGetProgramLog(prog, log); - // Obtain compilation log from the program. - size_t logSize = 0; - nvrtcGetProgramLogSize(prog, &logSize); - char *log = new char[logSize]; - nvrtcGetProgramLog(prog, log); + LogError("BuildInfo:\r\n%s\r\n", log); + + delete[] log; + } // Obtain PTX from the program. size_t ptxSize = 0; @@ -48,9 +56,6 @@ ocu_args_d_t& getOcu(void) char *ptx = new char[ptxSize]; nvrtcGetPTX(prog, ptx); - nvrtcDestroyProgram(&prog); - LogError("BuildInfo:\r\n%s\r\n", log); - CUmodule mod; CUjit_option jit_options[2]; void *jit_optvals[2]; @@ -59,7 +64,6 @@ ocu_args_d_t& getOcu(void) r = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals); delete[] source; - delete[] log; delete[] ptx; r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx"); @@ -104,7 +108,7 @@ ocu_args_d_t::~ocu_args_d_t() { cuModuleUnload(mod); cuCtxDestroy(ctxt); - cuStreamDestroy(stream); +// cuStreamDestroy(stream); } CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index a9154d35..f2711884 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -394,10 +394,12 @@ Document + true + true Document - false + true CUDA Code Builder $(ProjectDir)compile.bat compile.bat @@ -407,6 +409,7 @@ CUDA Code Builder cu.ptx false + true From 6240acea8875a451b168e8b0962528a0ce330b0a Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 2 Jun 2017 14:47:39 +0800 Subject: [PATCH 120/189] =?UTF-8?q?cuOpsinDynamicsImage=20=E5=AE=8C?= =?UTF-8?q?=E6=88=90?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 8 ++ clguetzli/clguetzli.cl.h | 43 +++++++++ clguetzli/clguetzli.cpp | 144 +++++++++++++++++++++++++++++- clguetzli/clguetzli.h | 9 ++ clguetzli/ocl.cpp | 3 +- clguetzli/ocl.h | 2 +- clguetzli/ocu.cpp | 24 ++++- clguetzli/ocu.h | 4 +- 8 files changed, 229 insertions(+), 8 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index fd31632d..d3c1fdce 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -289,6 +289,14 @@ namespace butteraugli clOpsinDynamicsImage(r, g, b, xsize, ysize); } + else if (g_useCuda && xsize > 100 && ysize > 100) + { + float * r = rgb[0].data(); + float * g = rgb[1].data(); + float * b = rgb[2].data(); + + cuOpsinDynamicsImage(r, g, b, xsize, ysize); + } else { std::vector< std::vector> orig_rgb; diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index aeabed49..4e461399 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -1,6 +1,13 @@ #ifndef __CLGUETZLI_CL_H__ #define __CLGUETZLI_CL_H__ +#ifdef __cplusplus +#ifndef __CUDACC__ +#include "CL\cl.h" +#include "cuda.h" +#endif +#endif + #ifdef __cplusplus #ifndef __CUDACC__ #define __kernel @@ -32,6 +39,20 @@ float *ch[3]; }; }ocl_channels; + + typedef union ocu_channels_t + { + struct + { + float * r; + float * g; + float * b; + }; + union + { + float *ch[3]; + }; + }ocu_channels; #else typedef union ocl_channels_t { @@ -52,6 +73,26 @@ cl_mem ch[3]; }; }ocl_channels; + + typedef union ocu_channels_t + { + struct + { + CUdeviceptr r; + CUdeviceptr g; + CUdeviceptr b; + }; + struct + { + CUdeviceptr x; + CUdeviceptr y; + CUdeviceptr b_; + }; + union + { + CUdeviceptr ch[3]; + }; + }ocu_channels; #endif #endif /*__CUDACC__*/ #endif /*__cplusplus*/ @@ -59,6 +100,7 @@ #ifdef __OPENCL_VERSION__ #define __constant_ex __constant #define __device__ +/* typedef 
union ocl_channels_t { struct @@ -73,6 +115,7 @@ float *ch[3]; }; }ocl_channels; +*/ #endif /*__OPENCL_VERSION__*/ #ifdef __CUDACC__ diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 09158f55..02d4ed71 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -71,7 +71,7 @@ ocl_args_d_t& getOcl(void) void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { - cl_int channel_size = xsize * ysize * sizeof(float); + size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -379,8 +379,7 @@ void clConvolutionYEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio - ) + int xstep, int offset, double border_ratio) { cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -1233,7 +1232,7 @@ void cuScaleImage(float *img, size_t length, double scale) ocu_args_d_t &ocu = getOcu(); CUdeviceptr m = ocu.allocMem(length * sizeof(float), img); - void *args[2] = { &m, &scale}; + void *args[] = { &m, &scale}; CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], length, 1, 1, @@ -1251,5 +1250,142 @@ void cuScaleImage(float *img, size_t length, double scale) void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { + size_t channel_size = xsize * ysize * sizeof(float); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + + cuOpsinDynamicsImageEx(rgb, xsize, ysize); + + cuMemcpyDtoH(r, rgb.r, channel_size); + cuMemcpyDtoH(g, rgb.g, channel_size); + cuMemcpyDtoH(b, rgb.b, channel_size); + + ocu.releaseMemChannels(rgb); +} + +void cuConvolutionXEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio) +{ + CUresult err = 
CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + + const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuConvolutionYEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio) +{ + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + + const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuSquareSampleEx( + CUdeviceptr result/*out*/, + const CUdeviceptr image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep) +{ + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + + const void *args[] = { &result, &image, &xstep, &ystep}; + + err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + CUdeviceptr result/*out, opt*/) +{ + double m = 2.25; // Accuracy increases when m is increased. 
+ const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); + + if (xstep > 1) + { + CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + cuMemFree(srcA); + } + else + { + CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? 
result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuMemFree(srcA); + } + + cuMemFree(mem_expn); +} + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) +{ + static const double kSigma = 1.1; + + size_t channel_size = xsize * ysize * sizeof(float); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size); + + cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b}; + + CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], + xsize * ysize, 1, 1, + 1, 1, 1, + 0, + ocu.stream, args, NULL); + + r = cuStreamSynchronize(ocu.stream); + ocu.releaseMemChannels(rgb_blurred); } \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 760677fd..40717d4f 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -135,8 +135,17 @@ void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); +//////////////////////////////////////////////////////////////// +void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize); + void cuScaleImage(float *img, size_t length, double scale); +void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + CUdeviceptr result = NULL/*out, opt*/); + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); + class guetzli::OutputImage; namespace guetzli { diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 73a8d022..aecd900e 100644 --- a/clguetzli/ocl.cpp +++ 
b/clguetzli/ocl.cpp @@ -250,11 +250,12 @@ ocl_channels ocl_args_d_t::allocMemChannels(size_t s, const void *c0, const void return img; } -void ocl_args_d_t::releaseMemChannels(ocl_channels rgb) +void ocl_args_d_t::releaseMemChannels(ocl_channels &rgb) { for (int i = 0; i < 3; i++) { clReleaseMemObject(rgb.ch[i]); + rgb.ch[i] = NULL; } } diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 04407f5c..37679770 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -78,7 +78,7 @@ struct ocl_args_d_t cl_mem allocMem(size_t s, const void *init = NULL); ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); - void releaseMemChannels(ocl_channels rgb); + void releaseMemChannels(ocl_channels &rgb); // Regular OpenCL objects: cl_context context; // hold the context handler diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 40b37225..3a263c3d 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -125,4 +125,26 @@ CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init) } return mem; -} \ No newline at end of file +} + +ocu_channels ocu_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2) +{ + const void *c[3] = { c0, c1, c2 }; + + ocu_channels img; + for (int i = 0; i < 3; i++) + { + img.ch[i] = allocMem(s, c[i]); + } + + return img; +} + +void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb) +{ + for (int i = 0; i < 3; i++) + { + cuMemFree(rgb.ch[i]); + rgb.ch[i] = NULL; + } +} diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h index 0ab97945..63a4bb47 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -12,7 +12,9 @@ struct ocu_args_d_t ocu_args_d_t(); ~ocu_args_d_t(); - CUdeviceptr allocMem(size_t s, const void *init); + CUdeviceptr allocMem(size_t s, const void *init = NULL); + ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); + void releaseMemChannels(ocu_channels &rgb); CUfunction kernel[KERNEL_COUNT]; 
CUstream stream; From 49d74ab07e3d21b99803529cdff1daf09eecd177 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 2 Jun 2017 14:58:49 +0800 Subject: [PATCH 121/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=89=A9=E4=BD=99?= =?UTF-8?q?=E7=9A=84cu=E5=85=A5=E5=8F=A3=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 9 +---- clguetzli/clguetzli.cpp | 57 +++++++++++++++++++------------ clguetzli/clguetzli.h | 26 +++++++++++++- 3 files changed, 61 insertions(+), 31 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index d3c1fdce..fa0e3920 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -229,14 +229,7 @@ namespace butteraugli result_org = *result; } - if (g_useCuda) - { - cuScaleImage(result->data(), result->size(), scale); - } - else - { - _ScaleImage(scale, result); - } + _ScaleImage(scale, result); if (g_checkOpenCL && result->size() > 64) { diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 02d4ed71..00449722 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -104,7 +104,6 @@ void clDiffmapOpsinDynamicsImage( size_t xsize, size_t ysize, size_t step) { - const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -1227,27 +1226,6 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si } ////////////////////////////////////////////////////////////////////////////////////// -void cuScaleImage(float *img, size_t length, double scale) -{ - ocu_args_d_t &ocu = getOcu(); - CUdeviceptr m = ocu.allocMem(length * sizeof(float), img); - - void *args[] = { &m, &scale}; - - CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE], - length, 1, 1, - 1, 1, 1, - 0, - ocu.stream, args, NULL); - - r = cuStreamSynchronize(ocu.stream); - - r = cuMemcpyDtoH(img, m, length * sizeof(float)); - - 
cuMemFree(m); - return; -} - void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { size_t channel_size = xsize * ysize * sizeof(float); @@ -1265,6 +1243,41 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons ocu.releaseMemChannels(rgb); } +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step) +{ + +} + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) +{ + +} + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) +{ + +} + void cuConvolutionXEx( CUdeviceptr result/*out*/, const CUdeviceptr inp, size_t xsize, size_t ysize, diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 40717d4f..61743e3d 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -138,7 +138,31 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si //////////////////////////////////////////////////////////////// void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize); -void cuScaleImage(float *img, size_t length, double scale); +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step); + +void cuComputeBlockZeroingOrder( + 
guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit); + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2); void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, From 63ac0642b687ed4cd0ef16c8a577c6c6c6582a4c Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 2 Jun 2017 15:15:00 +0800 Subject: [PATCH 122/189] =?UTF-8?q?=E7=AE=80=E5=8C=96=E7=82=B9=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E5=96=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- appveyor.yml | 32 -- clguetzli/clbutter_comparator.cpp | 6 +- clguetzli/clguetzli.cl | 74 +-- clguetzli/clguetzli.cl.cpp | 2 +- clguetzli/clguetzli.cpp | 69 +-- clguetzli/clguetzli.h | 14 +- clguetzli/clguetzli_test.cpp | 6 +- clguetzli/ocl.cpp | 6 +- clguetzli/ocl.h | 1 + clguetzli/utils.h | 1 + guetzli.vcxproj | 568 +++++++++--------- guetzli.vcxproj.filters | 928 +++++++++++++++--------------- guetzli/butteraugli_comparator.cc | 2 +- guetzli/guetzli.cc | 3 +- guetzli/processor.cc | 8 +- guetzli/processor.h | 2 +- guetzli_static.vcxproj | 110 ++-- guetzli_static.vcxproj.filters | 312 +++++----- tests/golden_checksums.txt | 20 +- 19 files changed, 1054 insertions(+), 1110 deletions(-) delete mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 061ab6d0..00000000 --- a/appveyor.yml +++ /dev/null @@ -1,32 +0,0 @@ -version: '1.0.1#{build}' - -shallow_clone: true - -os: - - Visual Studio 2015 - -environment: - matrix: - - 
TOOLSET: vs2015 - -install: - - ps: Start-FileDownload 'https://github.com/premake/premake-core/releases/download/v5.0.0-alpha11/premake-5.0.0-alpha11-windows.zip' 'premake.zip' - - 7z x premake.zip - - premake5.exe %TOOLSET% - - git clone https://github.com/Microsoft/vcpkg - - md vcpkg\downloads\nuget-3.5.0 - - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe - - appveyor DownloadFile https://cmake.org/files/v3.8/cmake-3.8.0-rc1-win32-x86.zip -FileName %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - - 7z x %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - - cd vcpkg - - powershell -exec bypass -File scripts\bootstrap.ps1 - - vcpkg integrate install - - vcpkg install libpng - - cd .. - -configuration: - - Debug - - Release - -build: - project: guetzli.sln diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index fa0e3920..64c0a3dd 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -53,7 +53,7 @@ namespace butteraugli { tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize_, ysize_, step_, + xsize_, ysize_, step_, (*edge_detector_map).data()); } } @@ -104,7 +104,7 @@ namespace butteraugli } } - void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { std::vector img; if (g_checkOpenCL && xsize > 8 && ysize > 8) @@ -305,6 +305,6 @@ namespace butteraugli tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize, rgb[0].data(), rgb[1].data(), rgb[2].data()); } - } + } } } \ No newline at end of file diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index ec04630d..1e026fa9 100644 --- 
a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -16,7 +16,7 @@ typedef struct __IntFloatPair float err; }IntFloatPair, DCTScoreData, CoeffData; -typedef struct __IntFloatPairList +typedef struct __IntFloatPairList { int size; IntFloatPair *pData; @@ -48,9 +48,9 @@ __device__ void Butteraugli8x8CornerEdgeDetectorDiff( __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order); __device__ double CompareBlockFactor(const channel_info mayout_channel[3], - const coeff_t* candidate_block, - const int block_x, - const int block_y, + const coeff_t* candidate_block, + const int block_x, + const int block_y, __global const float *orig_image_batch, __global const float *mask_scale, const int image_width, @@ -65,7 +65,7 @@ __device__ int list_push_back(IntFloatPairList* list, int i, float f); __kernel void clConvolutionEx( __global float* result, - __global const float* inp, const int xsize, + __global const float* inp, const int xsize, __global const float* multipliers, const int len, const int xstep, const int offset, const float border_ratio) { @@ -107,7 +107,7 @@ __kernel void clConvolutionEx( __kernel void clConvolutionXEx( __global float* result, __global const float* inp, - __global const float* multipliers, const int len, + __global const float* multipliers, const int len, const int step, const int offset, const float border_ratio) { const int x = get_global_id(0); @@ -147,8 +147,8 @@ __kernel void clConvolutionXEx( __kernel void clConvolutionYEx( __global float* result, - __global const float* inp, - __global const float* multipliers, const int len, + __global const float* inp, + __global const float* multipliers, const int len, const int step, const int offset, const float border_ratio) { const int x = get_global_id(0); @@ -189,7 +189,7 @@ __kernel void clConvolutionYEx( __kernel void clSquareSampleEx( __global float* result, - __global const float* image, + __global const float* image, const 
int xstep, const int ystep) { const int x = get_global_id(0); @@ -528,7 +528,7 @@ __kernel void clAverage5x5Ex(__global float *img, __global const float *img_org) const int y = get_global_id(1); const int xsize = get_global_size(0); const int ysize = get_global_size(1); - + const int row0 = y * xsize; if (x - 1 >= 0) { img[row0 + x] += img_org[row0 + x - 1]; @@ -707,7 +707,7 @@ __kernel void clAddBorderEx(__global float *out, int s, int s2, __global const f if (x >= xsize - s || y >= ysize - s) - { + { return; } @@ -803,8 +803,8 @@ __kernel void clComputeBlockZeroingOrderEx( block_y, orig_image_batch, mask_scale, - image_width, - image_height, + image_width, + image_height, factor); if (max_err < best_err) { @@ -2868,12 +2868,12 @@ __device__ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * } __device__ void Convolution(size_t xsize, size_t ysize, - int xstep, int len, int offset, - const float* multipliers, - const float* inp, + int xstep, int len, int offset, + const float* multipliers, + const float* inp, float border_ratio, float* result) -{ +{ float weight_no_border = 0; for (size_t j = 0; j <= 2 * offset; ++j) { @@ -2909,17 +2909,17 @@ __device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, doub const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772 const int diff = 2; // when sigma=1.1, diff's value is 2. const int expn_size = 5; // when sigma=1.1, scaler is 5 - float expn[5] = { exp(scaler * (-diff) * (-diff)), - exp(scaler * (-diff + 1) * (-diff + 1)), + float expn[5] = { exp(scaler * (-diff) * (-diff)), + exp(scaler * (-diff + 1) * (-diff + 1)), exp(scaler * (-diff + 2) * (-diff + 2)), exp(scaler * (-diff + 3) * (-diff + 3)), - exp(scaler * (-diff + 4) * (-diff + 4))}; + exp(scaler * (-diff + 4) * (-diff + 4))}; const int xstep = 1; // when sigma=1.1, xstep is 1. 
const int ystep = xstep; int dxsize = (xsize + xstep - 1) / xstep; - float tmp[8*8] = { 0 }; + float tmp[8*8] = { 0 }; Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp); Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp, border_ratio, output); @@ -3050,7 +3050,7 @@ __device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) } __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) -{ +{ // CalcOpsinDynamicsImage(rgb0_c); CalcOpsinDynamicsImage(rgb1_c); @@ -3067,7 +3067,7 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], 8, 8); // ÕâÀïΪɶҪ°Ñfloatת³Édouble²ÅÄܼÌÐø×ö¼ÆË㣿 - double b0[3 * kDCTBlockSize]; // + double b0[3 * kDCTBlockSize]; // double b1[3 * kDCTBlockSize]; for (int c = 0; c < 3; ++c) { for (int ix = 0; ix < kDCTBlockSize; ++ix) { @@ -3107,14 +3107,14 @@ __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_b } } } - + return SortInputOrder(input_order->pData, size); } __device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], - const __global float *orig_image_batch, + const __global float *orig_image_batch, int width_, int height_, - int block_x, int block_y, + int block_x, int block_y, int factor, int off_x, int off_y) { @@ -3137,9 +3137,9 @@ __device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], } __device__ double CompareBlockFactor(const channel_info mayout_channel[3], - const coeff_t* candidate_block, - const int block_x, - const int block_y, + const coeff_t* candidate_block, + const int block_x, + const int block_y, __global const float *orig_image_batch, __global const float *mask_scale, const int image_width, @@ -3183,7 +3183,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], } } } - else { + else { if (factor == 1) { int block_xx = block_x / mayout_channel[c].factor; int block_yy = block_y / 
mayout_channel[c].factor; @@ -3192,9 +3192,9 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; - - CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], - mayout_channel[c].pixel, block_xx, block_yy, + + CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_xx, block_yy, image_width, image_height); @@ -3203,9 +3203,9 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], } else { const coeff_t * coeff_block = candidate_channel[c]; - CoeffToYUV16x16(coeff_block, &yuv16x16[c], - mayout_channel[c].pixel, block_x, block_y, - image_width, + CoeffToYUV16x16(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_x, block_y, + image_width, image_height); } } @@ -3243,7 +3243,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], { continue; } - + float rgb0_c[3][kDCTBlockSize]; int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy); diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index cafb0bf7..a18cd110 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -92,7 +92,7 @@ namespace guetzli imgOpsinDynamicsBlockList.clear(); imgMaskXyzScaleBlockList.clear(); } - + double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 00449722..7b2cd995 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -79,21 +79,11 @@ void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons clOpsinDynamicsImageEx(rgb, xsize, ysize); - cl_float 
*result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - + clEnqueueReadBuffer(ocl.commandQueue, rgb.r, false, 0, channel_size, r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, rgb.g, false, 0, channel_size, g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, rgb.b, false, 0, channel_size, b, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - memcpy(r, result_r, channel_size); - memcpy(g, result_g, channel_size); - memcpy(b, result_b, channel_size); - - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL); - clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL); - clFinish(ocl.commandQueue); - ocl.releaseMemChannels(rgb); } @@ -138,12 +128,8 @@ void clDiffmapOpsinDynamicsImage( clCalculateDiffmapEx(mem_result, xsize, ysize, step); - cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - memcpy(result, result_r, channel_size); - - clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL); - clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb1); ocl.releaseMemChannels(xyb0); @@ -238,11 +224,7 @@ void clComputeBlockZeroingOrder( LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); } - CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, 
output_order_batch_size, 0, NULL, NULL, &err); - err = clFinish(ocl.commandQueue); - memcpy(output_order_batch, result, output_order_batch_size); - - clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mem_output_order_batch, false, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL); clFinish(ocl.commandQueue); for (int c = 0; c < 3; c++) @@ -277,21 +259,14 @@ void clMask( clMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); - cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); - cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + clEnqueueReadBuffer(ocl.commandQueue, mask.r, false, 0, channel_size, mask_r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask.g, false, 0, channel_size, mask_g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask.b, false, 0, channel_size, mask_b, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL); err = clFinish(ocl.commandQueue); - memcpy(mask_r, r0_r, channel_size); - 
memcpy(mask_g, r0_g, channel_size); - memcpy(mask_b, r0_b, channel_size); - memcpy(maskdc_r, r1_r, channel_size); - memcpy(maskdc_g, r1_g, channel_size); - memcpy(maskdc_b, r1_b, channel_size); - ocl.releaseMemChannels(rgb); ocl.releaseMemChannels(rgb2); ocl.releaseMemChannels(mask); @@ -614,7 +589,7 @@ void clMaskHighIntensityChangeEx( void clEdgeDetectorMapEx( cl_mem result/*out*/, - const ocl_channels &rgb, const ocl_channels &rgb2, + const ocl_channels &rgb, const ocl_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { cl_int channel_size = xsize * ysize * sizeof(float); @@ -625,7 +600,7 @@ void clEdgeDetectorMapEx( ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); - static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; for (int i = 0; i < 3; i++) { @@ -669,7 +644,7 @@ void clEdgeDetectorMapEx( } void clBlockDiffMapEx( - cl_mem block_diff_dc/*out*/, + cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/, const ocl_channels &rgb, const ocl_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) @@ -767,7 +742,7 @@ void clEdgeDetectorLowFreqEx( void clDiffPrecomputeEx( ocl_channels &mask/*out*/, - const ocl_channels &xyb0, const ocl_channels &xyb1, + const ocl_channels &xyb0, const ocl_channels &xyb1, const size_t xsize, const size_t ysize) { cl_int err = CL_SUCCESS; @@ -855,8 +830,8 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize } void clMinSquareValEx( - cl_mem img/*in,out*/, - const size_t xsize, const size_t ysize, + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, const size_t square_size, const size_t offset) { cl_int err = CL_SUCCESS; @@ -1056,12 +1031,12 @@ void clMaskEx( void clCombineChannelsEx( cl_mem result/*out*/, - const ocl_channels &mask, - const ocl_channels &mask_dc, + const ocl_channels &mask, + const ocl_channels 
&mask_dc, const size_t xsize, const size_t ysize, - const cl_mem block_diff_dc, - const cl_mem block_diff_ac, - const cl_mem edge_detector_map, + const cl_mem block_diff_dc, + const cl_mem block_diff_ac, + const cl_mem edge_detector_map, const size_t res_xsize, const size_t step) { diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 61743e3d..8407a1c5 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -11,7 +11,7 @@ extern bool g_useCuda; extern bool g_checkOpenCL; void clOpsinDynamicsImage( - float *r, float *g, float *b, + float *r, float *g, float *b, const size_t xsize, const size_t ysize); void clDiffmapOpsinDynamicsImage( @@ -36,7 +36,7 @@ void clComputeBlockZeroingOrder( void clMask( float* mask_r, float* mask_g, float* mask_b, - float* maskdc_r, float* maskdc_g, float* maskdc_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, const size_t xsize, const size_t ysize, const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2); @@ -48,7 +48,7 @@ void clConvolutionEx( int xstep, int offset, double border_ratio); void clConvolutionXEx( - cl_mem result/*out*/, + cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio); @@ -85,7 +85,7 @@ void clEdgeDetectorMapEx( const size_t xsize, const size_t ysize, const size_t step); void clBlockDiffMapEx( - cl_mem block_diff_dc/*out*/, + cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/, const ocl_channels &rgb, const ocl_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step); @@ -97,7 +97,7 @@ void clEdgeDetectorLowFreqEx( void clDiffPrecomputeEx( ocl_channels &mask/*out*/, - const ocl_channels &xyb0, const ocl_channels &xyb1, + const ocl_channels &xyb0, const ocl_channels &xyb1, const size_t xsize, const size_t ysize); void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); @@ -105,8 +105,8 @@ void clScaleImageEx(cl_mem 
img/*in, out*/, size_t size, double w); void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize); void clMinSquareValEx( - cl_mem img/*in,out*/, - const size_t xsize, const size_t ysize, + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, const size_t square_size, const size_t offset); void clMaskEx( diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 15c1317b..2cadfb85 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -90,7 +90,7 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b, err = clFinish(ocl.commandQueue); FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3); - + clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); @@ -114,7 +114,7 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b, ocl_args_d_t &ocl = getOcl(); ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - + cl_mem block_diff_dc = ocl.allocMem(reschannel_size); cl_mem block_diff_ac = ocl.allocMem(reschannel_size); @@ -187,7 +187,7 @@ void tclMask(const float* r, const float* g, const float* b, ocl_channels mask = ocl.allocMemChannels(channel_size); ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - + clMaskEx(mask/*out*/, mask_dc/*out*/, rgb, rgb2, xsize, ysize); cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index aecd900e..d92fb1a4 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -184,7 +184,7 @@ void* ocl_args_d_t::allocC(size_t s) cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; outputC = _aligned_malloc(optimizedSize, 4096); lenC = s; - + cl_int err = 0; dstMem = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, outputC, &err); if (CL_SUCCESS != err) 
@@ -204,7 +204,7 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err)); } if (!mem) return NULL; - + // init memory if (init) { @@ -323,7 +323,7 @@ const char* TranslateOpenCLError(cl_int errorCode) case CL_INVALID_LINKER_OPTIONS: return "CL_INVALID_LINKER_OPTIONS"; //-67 case CL_INVALID_DEVICE_PARTITION_COUNT: return "CL_INVALID_DEVICE_PARTITION_COUNT"; //-68 // case CL_INVALID_PIPE_SIZE: return "CL_INVALID_PIPE_SIZE"; //-69 - // case CL_INVALID_DEVICE_QUEUE: return "CL_INVALID_DEVICE_QUEUE"; //-70 + // case CL_INVALID_DEVICE_QUEUE: return "CL_INVALID_DEVICE_QUEUE"; //-70 default: return "UNKNOWN ERROR CODE"; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 37679770..fd7e78e7 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -104,3 +104,4 @@ struct ocl_args_d_t void* outputC; size_t lenC; }; + diff --git a/clguetzli/utils.h b/clguetzli/utils.h index fc68fec5..71d8d7a1 100644 --- a/clguetzli/utils.h +++ b/clguetzli/utils.h @@ -29,3 +29,4 @@ void LogError(const char* str, ...); // Read OpenCL source code from fileName and store it in source. 
The number of read bytes returns in sourceSize int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize); + diff --git a/guetzli.vcxproj b/guetzli.vcxproj index f2711884..b8798eb2 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -51,7 +51,7 @@ - + @@ -79,8 +79,8 @@ obj\x86\Release\guetzli\ guetzli .exe - $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) - $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) true @@ -95,121 +95,121 @@ obj\x86\Debug\guetzli\ guetzli .exe - $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) - $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) - - NotUsing - Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - Full - true - true + + NotUsing + Level3 + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + Full + true + true false true - ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions) + ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions) - Console - true - true - nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup - $(CUDA_PATH)\lib\x64 - - - "$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU 
-simd=default -targetos=current -bo=" " - - - OpenCL Code Builder - - - false - - - - - - - - - - - NotUsing - Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - MaxSpeed - true - false - false - true - PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) - - - Console - true - true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup - __tcmalloc - $(CUDA_PATH)\lib\Win32 - - - compile.bat - - - CUDA CU - - - - - NotUsing - Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - EditAndContinue - Disabled - ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions) - - - Console - true - nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup - $(CUDA_PATH)\lib\x64 - - - - - - NotUsing - Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) - EditAndContinue - Disabled - PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) - - - Console - true - nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) - mainCRTStartup - __tcmalloc - $(CUDA_PATH)\lib\Win32 - - - - - - - - - - - - - - + Console + true + true + nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + mainCRTStartup + $(CUDA_PATH)\lib\x64 + + + "$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current -bo=" " + + + OpenCL Code Builder + + + false + + + + + + + + + + + NotUsing + Level3 + 
.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + MaxSpeed + true + false + false + true + PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) + + + Console + true + true + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + mainCRTStartup + __tcmalloc + $(CUDA_PATH)\lib\Win32 + + + compile.bat + + + CUDA CU + + + + + NotUsing + Level3 + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + EditAndContinue + Disabled + ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions) + + + Console + true + nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + mainCRTStartup + $(CUDA_PATH)\lib\x64 + + + + + + NotUsing + Level3 + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + EditAndContinue + Disabled + PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) + + + Console + true + nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + mainCRTStartup + __tcmalloc + $(CUDA_PATH)\lib\Winocument - - - true - true - - - Document - true - CUDA Code Builder - $(ProjectDir)compile.bat - compile.bat - false - cu.ptx - $(ProjectDir)compile.bat - CUDA Code Builder - cu.ptx - false - true - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Document + + + true + true + + + Document + true + CUDA Code Builder + $(ProjectDir)compile.bat + compile.bat + false + cu.ptx + $(ProjectDir)compile.bat + CUDA Code Builder + cu.ptx + false + true + + + + + + - + \ 
No newline at end of file diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 17b8edf4..07f56763 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -13,21 +13,21 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} - - {40be58d6-6dfc-45a3-8ca1-7d1b14051ddc} - - - {cb89c1ac-8399-4814-88f2-4b69576bc9f9} - - - {f2b475de-6219-478e-9e5e-08f07ef25dbc} - - - {64847a89-ca39-4556-ba0e-d6875c4d39ca} - - - - + + {40be58d6-6dfc-45a3-8ca1-7d1b14051ddc} + + + {cb89c1ac-8399-4814-88f2-4b69576bc9f9} + + + {f2b475de-6219-478e-9e5e-08f07ef25dbc} + + + {64847a89-ca39-4556-ba0e-d6875c4d39ca} + + + + guetzli @@ -105,219 +105,219 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - 
third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - - + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + 
third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + + guetzli @@ -380,239 +380,239 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - 
third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - clguetzli - - - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - - - clguetzli - - - - - clguetzli - - + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + 
third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + third_party\tcmalloc_minimal + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + + + clguetzli + + + + + clguetzli + + \ No newline at end of file diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index 124aea8d..02256e95 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -97,7 +97,7 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y, } double ButteraugliComparator::CompareBlock(const OutputImage& img, - int off_x, int off_y, + int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { int block_x = block_x_ * factor_x_ + off_x; diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index d8937978..587c06d4 
100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -227,7 +226,7 @@ void Usage() { " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. Default limit is %d MB.\n" " --opencl - Use OpenCL\n" - " --cuda - Use CUDA\n" + " --cuda - Use CUDA\n" " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 1666d4fa..35783e41 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -545,7 +545,7 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace -void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, +void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, bool stop_early) { const int width = img->width(); @@ -660,7 +660,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { CoeffData * p = &output_order[block_ix * kBlockSize]; - + candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); for (int i = 0; i < kBlockSize; i++) { @@ -682,9 +682,9 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co } -void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, +void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, - const double target_mul, + const double target_mul, bool stop_early, std::vector &candidate_coeff_offsets, std::vector& candidate_coeffs, diff --git a/guetzli/processor.h b/guetzli/processor.h index b36b184e..9f2c0c61 100644 --- a/guetzli/processor.h +++ b/guetzli/processor.h @@ -30,7 +30,7 @@ struct CoeffData { int 
idx; float block_err; }; - + struct Params { float butteraugli_target = 1.0; bool clear_metadata = true; diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj index 44a911b2..1d4d4e3f 100644 --- a/guetzli_static.vcxproj +++ b/guetzli_static.vcxproj @@ -93,7 +93,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories) Full true true @@ -110,7 +110,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) Full true true @@ -127,7 +127,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled @@ -140,7 +140,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) EditAndContinue Disabled @@ -176,20 +176,20 @@ - - - - - - - - - - - - - - + + + + + + + + + + + + + + @@ -212,43 +212,43 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters index 
9362cd94..37876e3d 100644 --- a/guetzli_static.vcxproj.filters +++ b/guetzli_static.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -13,12 +13,12 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} - - {61f0e3eb-c213-49c5-883a-060bdaf927bb} - - - {ba7b6163-a7d1-4f14-b4b3-3d35f296563a} - + + {61f0e3eb-c213-49c5-883a-060bdaf927bb} + + + {ba7b6163-a7d1-4f14-b4b3-3d35f296563a} + @@ -99,48 +99,48 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + @@ -203,112 +203,112 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - + + third_party\libpng + + 
+ third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + + + third_party\libpng + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + + + third_party\zlib + \ No newline at end of file diff --git a/tests/golden_checksums.txt b/tests/golden_checksums.txt index 531d0b21..5c09ef45 100644 --- a/tests/golden_checksums.txt +++ b/tests/golden_checksums.txt @@ -18,17 +18,17 @@ a9439e530c365a62c965e2d858c8de8fbd3f44e6e9c6ade1c2248ad373fb8755 bicycles.png.g 9ccdf1be8f0d121d4b6888e47d187e8b6378b63d5592a2b9f001a4d017506bde blue-rose.jpg.guetzli.jpg 6c75b537c2d603aa51c9fbc8f7f6b1bff413de269b13eec5982d24cfcf1e0d08 blue-rose.png.guetzli.jpg 9ccdf1be8f0d121d4b6888e47d187e8b6378b63d5592a2b9f001a4d017506bde blue-rose-progressive.jpg.guetzli.jpg -2763977af200403d57c7b9d9d33cfb76947076413c9531b16171c0a5fbc83339 brake-light-420.jpg.guetzli.jpg -a84703948373a12cc1db1edd32616c983db08ff07f550a3be7b111bf9e4ba06f brake-light.jpg.guetzli.jpg +ac979358ac843082f2c4c094dbd757d91816710853f4c6dfaaa8b828cc03db3c brake-light-420.jpg.guetzli.jpg +1c1e1754f7a3304eb8e1aada4cf923af1226bfccadc6d69643af6fa7f457ec71 brake-light.jpg.guetzli.jpg eda4f537c54ca55eddc03097b6aa57db61c57f80e276306ff4c73b72123e5402 brake-light.png.guetzli.jpg -a84703948373a12cc1db1edd32616c983db08ff07f550a3be7b111bf9e4ba06f brake-light-progressive.jpg.guetzli.jpg 
+1c1e1754f7a3304eb8e1aada4cf923af1226bfccadc6d69643af6fa7f457ec71 brake-light-progressive.jpg.guetzli.jpg c25be9699cad0796f1c663d8a7d92c2e99dfb95bed4efd0d58c15d52bb4fe049 cloth-420.jpg.guetzli.jpg 6bf2ef7d27a5a8614db0f698b1f977cd0f486fdc2d4260b90f0d64d653383c35 cloth.jpg.guetzli.jpg -446850f4decd68d77c8a5b09d925fdc9075f44fdfe6dfb2e60a1064677678a3f cloth.png.guetzli.jpg +4c98a84823bb60753dbee01e4719215edb326b4b1db51e74f5c1602fe4b51656 cloth.png.guetzli.jpg 6bf2ef7d27a5a8614db0f698b1f977cd0f486fdc2d4260b90f0d64d653383c35 cloth-progressive.jpg.guetzli.jpg e6407f3d38f70dde51584ee174ae29f53cc2d2d7e63812a3405d20079d67a45c geranium2-420.jpg.guetzli.jpg 14fb9aab1ebd6d7b8779566665fe2f94b07158b277f29296ef2f9fee71c3c4a4 geranium2.jpg.guetzli.jpg -b7ded98029eca0ecf75bb01d5ea54a833fa13377136b4e7e404ae35503f82eb8 geranium2.png.guetzli.jpg +d857d546a49e9e1c59f86be86656a87fcbfbbf77adc2abe39ca504c4a674e7b7 geranium2.png.guetzli.jpg 14fb9aab1ebd6d7b8779566665fe2f94b07158b277f29296ef2f9fee71c3c4a4 geranium2-progressive.jpg.guetzli.jpg 9eea5d54068ccaacfb1839c67c401685646c73edfdc38bac8e1e3a084e268f0d geranium-420.jpg.guetzli.jpg 4f249d42280d6f982fa093343236b318be887dc0f2241e125fcf6d4c913305e3 geranium.jpg.guetzli.jpg @@ -39,12 +39,12 @@ e1fbdb05fe74f2d78cf6547621d99afea6e72069d8de68da274d66530b5dcdd7 green.jpg.guet 3df6e963406121db078b99653d7c4e49ce2affe99b31212b026239d082748291 green.png.guetzli.jpg e1fbdb05fe74f2d78cf6547621d99afea6e72069d8de68da274d66530b5dcdd7 green-progressive.jpg.guetzli.jpg c2fcd25260b5c52871def4a7ef0136be7e7e7f63f836a974c51a4681a651d7c7 green-rose-420.jpg.guetzli.jpg -90998e98318bb62538fe64f3b60d3100230474bf57dda72cd737eeee8ea482ae green-rose.jpg.guetzli.jpg +8cf041993b4ba59d5dd478ce4171a48ec8335301400de34286541e5a1769622c green-rose.jpg.guetzli.jpg 513e03accb79e60e9c8a2e9832bdbf9f1af8b23c6905cdb476e0626e1a7009d2 green-rose.png.guetzli.jpg -90998e98318bb62538fe64f3b60d3100230474bf57dda72cd737eeee8ea482ae green-rose-progressive.jpg.guetzli.jpg 
+8cf041993b4ba59d5dd478ce4171a48ec8335301400de34286541e5a1769622c green-rose-progressive.jpg.guetzli.jpg f1279ca9177e0aea7451bafa4abcd8ecfdf8a939ae97c974fbc802b668d8a56b hand-420.jpg.guetzli.jpg 8d2d8f4a95deea2dca8539a0c12ba8186ea93e7d9bcff9cb2c0bb9eab5504d1f hand.jpg.guetzli.jpg -4d156e4dbec82cb2f8fa324ea9f9142327d63853379cf3332775fdd40bbafc2f hand.png.guetzli.jpg +a71c821561c30d55fc4d83cfc7cb55f77ba6095749eafa38491b619c30b43571 hand.png.guetzli.jpg 8d2d8f4a95deea2dca8539a0c12ba8186ea93e7d9bcff9cb2c0bb9eab5504d1f hand-progressive.jpg.guetzli.jpg c3a3f86da0eeacc015504139181c19f874e9631336bb5f90fbf8a367058ec95f lichen-420.jpg.guetzli.jpg 44db143ce962b2eb45fcc1a79468d96ca7c677cba864efd9a984e3f86de5a0a5 lichen.jpg.guetzli.jpg @@ -70,7 +70,7 @@ aceb338115241d9984510fd2e8a2bf46b3c5fc431e827a2d1efe496dff038675 port-420.jpg.g 02aac145b6df57db2913952d3d46c8d456e2f000cff1ff4bfc574b27175335e0 port.jpg.guetzli.jpg 157a25812bcf7bce343fe6c6a88932ee49117b46b5d3ba0aa3421edc8f2f1a09 port.png.guetzli.jpg 02aac145b6df57db2913952d3d46c8d456e2f000cff1ff4bfc574b27175335e0 port-progressive.jpg.guetzli.jpg -f41f613ecfae42d050115b785c1591724fcb7937c361b9c5d2a3248b7580953f rainbow-420.jpg.guetzli.jpg +c98795b77f49833c4e75bd58f78d1e58c0b7ca95990f8006247feb6e230c7f5d rainbow-420.jpg.guetzli.jpg 74d94a13c52b0d582c50d6bc70cecb6762c08740db6c234dff9b0e1c04fccbb5 rainbow.jpg.guetzli.jpg 657efb5cfa742fbdfd6304703b131a63c2ddf8b686600840a800e7d94b4da0eb rainbow.png.guetzli.jpg 74d94a13c52b0d582c50d6bc70cecb6762c08740db6c234dff9b0e1c04fccbb5 rainbow-progressive.jpg.guetzli.jpg @@ -86,7 +86,7 @@ c5499fdc97b3ae02d77ea12140d6da8ad645406e66adc88250a3c980bb70fe7d red-rose-420.j f9a97e475af9127ea6d6d4d41fec52330ca075aae707185d90910fe198695e8d red-rose.jpg.guetzli.jpg 22f21955e7078745d03c1eb1985b8c5ffbd0b615870071a821102c44bd94af97 red-rose.png.guetzli.jpg f9a97e475af9127ea6d6d4d41fec52330ca075aae707185d90910fe198695e8d red-rose-progressive.jpg.guetzli.jpg 
-4df6d9b244c2d02cacff35ada998da3be13d1c9f5e42d4a2ab9b4725fc78dfa5 rgb-420.jpg.guetzli.jpg +22cc4f5431c339e67958870a07c6ebc12fefdc849038230c9b8b98eac7f384ba rgb-420.jpg.guetzli.jpg 19256b30557be9dc6a7effe6418f2c1ba6e624940ef1f41c0ca71e356963014c rgb.jpg.guetzli.jpg c1f8e4161a8b6baddea1d279f4d490670560d9c5d1161b66ee101c4250d8dd48 rgb.png.guetzli.jpg 19256b30557be9dc6a7effe6418f2c1ba6e624940ef1f41c0ca71e356963014c rgb-progressive.jpg.guetzli.jpg From e5efe988858480e2ed144e57f35e0b69e8393728 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 2 Jun 2017 17:29:50 +0800 Subject: [PATCH 123/189] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E6=B5=81=E7=A8=8B=EF=BC=8C=E4=BD=86=E8=AE=A1=E7=AE=97=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=E8=BF=98=E9=9C=80=E8=A6=81=E6=A0=A1=E6=AD=A3=20cuDiff?= =?UTF-8?q?mapOpsinDynamicsImage=20cuComputeBlockZeroingOrder=20cuMask?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 23 ++ clguetzli/clguetzli.cpp | 634 +++++++++++++++++++++++++++++- clguetzli/ocu.cpp | 2 +- guetzli/processor.cc | 38 +- 4 files changed, 669 insertions(+), 28 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 64c0a3dd..53cd89fb 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -20,6 +20,12 @@ namespace butteraugli clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); } + else if (g_useCuda && xsize_ > 100 && ysize_ > 100) + { + result.resize(xsize_ * ysize_); + clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); + } else { ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result); @@ -171,6 +177,23 @@ namespace butteraugli ); return; } + else if (g_useCuda && xsize > 
100 && ysize > 100) + { + mask->resize(3); + mask_dc->resize(3); + for (int i = 0; i < 3; i++) + { + (*mask)[i].resize(xsize * ysize); + (*mask_dc)[i].resize(xsize * ysize); + } + cuMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data() + ); + return; + } _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 7b2cd995..b0e9fefe 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -97,8 +97,8 @@ void clDiffmapOpsinDynamicsImage( const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - cl_int channel_size = xsize * ysize * sizeof(float); - cl_int channel_step_size = res_xsize * res_ysize * sizeof(float); + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -180,16 +180,13 @@ void clComputeBlockZeroingOrder( cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; - cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size); + cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); cl_float clBlockErrorLimit = BlockErrorLimit; cl_int clWidth = image_width; cl_int clHeight = image_height; cl_int clFactor = factor; cl_int clMask = comp_mask; - clEnqueueWriteBuffer(ocl.commandQueue, mem_output_order_batch, CL_FALSE, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); - cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]); 
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]); @@ -250,7 +247,7 @@ void clMask( cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); - cl_int channel_size = xsize * ysize * sizeof(float); + size_t channel_size = xsize * ysize * sizeof(float); ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); @@ -502,7 +499,7 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t { static const double kSigma = 1.1; - cl_int channel_size = xsize * ysize * sizeof(float); + size_t channel_size = xsize * ysize * sizeof(float); cl_int err = 0; ocl_args_d_t &ocl = getOcl(); @@ -541,7 +538,7 @@ void clMaskHighIntensityChangeEx( ocl_channels &xyb1/*in,out*/, const size_t xsize, const size_t ysize) { - cl_int channel_size = xsize * ysize * sizeof(float); + size_t channel_size = xsize * ysize * sizeof(float); cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -592,7 +589,7 @@ void clEdgeDetectorMapEx( const ocl_channels &rgb, const ocl_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { - cl_int channel_size = xsize * ysize * sizeof(float); + size_t channel_size = xsize * ysize * sizeof(float); cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); @@ -690,7 +687,7 @@ void clEdgeDetectorLowFreqEx( const ocl_channels &rgb, const ocl_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { - cl_int channel_size = xsize * ysize * sizeof(float); + size_t channel_size = xsize * ysize * sizeof(float); static const double kSigma = 14; @@ -885,9 +882,6 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); - cl_int clxsize = xsize; - cl_int clysize = ysize; - double extmul = 0.975741017749; double extoff = -4.25328244168; double offset = 0.454909521427; @@ -1218,6 +1212,490 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, 
const size_t xsize, cons ocu.releaseMemChannels(rgb); } +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocl = getOcu(); + + ocu_channels c0 = ocl.allocMemChannels(channel_size); + ocu_channels c1 = ocl.allocMemChannels(channel_size); + + cuMemcpyDtoD(c0.r, xyb0.r, channel_size); + cuMemcpyDtoD(c0.g, xyb0.g, channel_size); + cuMemcpyDtoD(c0.b, xyb0.b, channel_size); + cuMemcpyDtoD(c1.r, xyb1.r, channel_size); + cuMemcpyDtoD(c1.g, xyb1.g, channel_size); + cuMemcpyDtoD(c1.b, xyb1.b, channel_size); + + const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(c0); + ocl.releaseMemChannels(c1); +} + +void cuEdgeDetectorMapEx( + CUdeviceptr result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocl = getOcu(); + + ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } + + const void *args[] = { &result, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = 
cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void cuBlockDiffMapEx( + CUdeviceptr block_diff_dc/*out*/, + CUdeviceptr block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &block_diff_dc, &block_diff_ac, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuEdgeDetectorLowFreqEx( + CUdeviceptr block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + static const double kSigma = 14; + + ocu_args_d_t &ocl = getOcu(); + ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } + + const void *args[] = { &block_diff_ac, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err 
= cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &mask.x, &mask.y, &mask.b, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &img, &w }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE], + size, 1, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize) +{ + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + + ocu_args_d_t &ocl = getOcu(); + + size_t len = xsize * ysize * sizeof(float); + CUdeviceptr img_org = ocl.allocMem(len); + + cuMemcpyDtoD(img_org, img, len); + + const void *args[] = { &img, &img_org}; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemFree(img_org); +} + +void cuMinSquareValEx( + CUdeviceptr img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset) +{ + ocu_args_d_t &ocl = getOcu(); + + CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize); + + const void *args[] = { &srcA, &img, &square_size, &offset}; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize); + cuMemFree(srcA); +} + +void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) +{ + ocu_args_d_t &ocl = getOcu(); + + double extmul = 0.975741017749; + double extoff = -4.25328244168; + double offset = 0.454909521427; + double scaler = 0.0738288224836; + double mul = 20.8029176447; + static double lut_x[512]; + static bool lutx_init = false; + if (!lutx_init) + { + lutx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + } + + extmul = 0.373995618954; + extoff = 1.5307267433; + offset = 0.911952641929; + scaler = 1.1731667845; + mul = 16.2447033988; + static double lut_y[512]; + static bool luty_init = false; + if (!luty_init) + { + luty_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + } + + extmul = 0.61582234137; + extoff = -4.25376118646; + offset = 1.05105070921; + scaler = 0.47434643535; + mul = 31.1444967089; + static double lut_b[512]; + static bool lutb_init = false; + if (!lutb_init) + { + lutb_init = true; 
+ MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + } + + extmul = 1.79116943438; + extoff = -3.86797479189; + offset = 0.670960225853; + scaler = 0.486575865525; + mul = 20.4563479139; + static double lut_dcx[512]; + static bool lutdcx_init = false; + if (!lutdcx_init) + { + lutdcx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + } + + extmul = 0.212223514236; + extoff = -3.65647120524; + offset = 1.73396799447; + scaler = 0.170392660501; + mul = 21.6566724788; + static double lut_dcy[512]; + static bool lutdcy_init = false; + if (!lutdcy_init) + { + lutdcy_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + } + + extmul = 0.349376011816; + extoff = -0.894711072781; + offset = 0.901647926679; + scaler = 0.380086095024; + mul = 18.0373825149; + static double lut_dcb[512]; + static bool lutdcb_init = false; + if (!lutdcb_init) + { + lutdcb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + } + + size_t channel_size = 512 * 3 * sizeof(double); + ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); + + const void *args[] = { &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xyb.x, &xyb.y, &xyb.b, + &xyb_dc.x, &xyb_dc.y, &xyb_dc.b}; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(xyb); + ocl.releaseMemChannels(xyb_dc); +} + +void cuMaskEx( + ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize) +{ + cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); + for (int i = 0; i < 3; i++) + { + cuAverage5x5Ex(mask.ch[i], xsize, ysize); + cuMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0); + + static const double sigma[3] = { + 
9.65781083553, + 14.2644604355, + 4.53358927369, + }; + + cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); + } + + cuDoMask(mask, mask_dc, xsize, ysize); + + for (int i = 0; i < 3; i++) + { + cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + } +} + +void cuCombineChannelsEx( + CUdeviceptr result/*out*/, + const ocu_channels &mask, + const ocu_channels &mask_dc, + const size_t xsize, const size_t ysize, + const CUdeviceptr block_diff_dc, + const CUdeviceptr block_diff_ac, + const CUdeviceptr edge_detector_map, + const size_t res_xsize, + const size_t step) +{ + ocu_args_d_t &ocl = getOcu(); + + const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + + const void *args[] = { &result, + &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xsize, &ysize, + &block_diff_dc, &block_diff_ac, &edge_detector_map, + &res_xsize, + &step }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS], + work_xsize, work_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocl = getOcu(); + + CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); + + const void *args[] = { &diffmap_out, + &diffmap, + &xsize, &ysize, + &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); + + cuMemFree(diffmap_out); +} + +void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr 
in, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocl = getOcu(); + + int cls = 8 - step; + int cls2 = (8 - step) / 2; + + const void *args[] = { &out, + &in, + &xsize, + &cls, + &cls2 }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER], + xsize - cls, ysize - cls, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in) +{ + ocu_args_d_t &ocl = getOcu(); + + int cls = 8 - step; + int cls2 = (8 - step) / 2; + + const void *args[] = { &out, + &cls, + &cls2, + &in}; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) +{ + cuUpsampleSquareRootEx(diffmap, xsize, ysize, step); + + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + + const int s = 8 - step; + int s2 = (8 - step) / 2; + + ocu_args_d_t &ocl = getOcu(); + CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step); + + static const double border_ratio = 0.03027655136; + cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); + + cuAddBorderEx(diffmap, xsize, ysize, step, blurred); + cuScaleImageEx(diffmap, xsize * ysize, scale); + + cuMemFree(blurred); +} + void cuDiffmapOpsinDynamicsImage( float* result, const float* r, const float* g, const float* b, @@ -1225,7 +1703,50 @@ void cuDiffmapOpsinDynamicsImage( const size_t xsize, const size_t ysize, const size_t step) { + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * 
sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocl = getOcu(); + ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + CUdeviceptr mem_result = ocl.allocMem(channel_size, result); + + CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size); + CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size); + CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size); + + cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocu_channels mask = ocl.allocMemChannels(channel_size); + ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } + cuCalculateDiffmapEx(mem_result, xsize, ysize, step); + + cuMemcpyDtoH(result, mem_result, channel_size); + + ocl.releaseMemChannels(xyb1); + ocl.releaseMemChannels(xyb0); + + cuMemFree(edge_detector_map); + cuMemFree(block_diff_dc); + cuMemFree(block_diff_ac); + + cuMemFree(mem_result); } void cuComputeBlockZeroingOrder( @@ -1240,7 +1761,67 @@ void cuComputeBlockZeroingOrder( const int comp_mask, const float BlockErrorLimit) { + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + + cl_int err = 0; + ocu_args_d_t &ocl = 
getOcu(); + + CUdeviceptr mem_orig_coeff[3]; + CUdeviceptr mem_mayout_coeff[3]; + CUdeviceptr mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + + block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + + mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + } + CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); + + const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], + &mem_orig_image, &mem_orig_image, &mem_mask_scale, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + &mem_output_order_batch}; + + err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], + blockf_width, blockf_height, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); + + for (int c = 0; c < 3; c++) + { + cuMemFree(mem_orig_coeff[c]); + cuMemFree(mem_mayout_coeff[c]); + cuMemFree(mem_mayout_pixel[c]); + + } + cuMemFree(mem_orig_image); + 
cuMemFree(mem_mask_scale); + cuMemFree(mem_output_order_batch); } void cuMask( @@ -1250,7 +1831,29 @@ void cuMask( const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2) { + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocl = getOcu(); + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + ocu_channels mask = ocl.allocMemChannels(channel_size); + ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + + cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); + + cuMemcpyDtoH(mask_r, mask.r, channel_size); + cuMemcpyDtoH(mask_g, mask.r, channel_size); + cuMemcpyDtoH(mask_b, mask.r, channel_size); + cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size); + cuMemcpyDtoH(maskdc_g, mask_dc.r, channel_size); + cuMemcpyDtoH(maskdc_b, mask_dc.r, channel_size); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); } void cuConvolutionXEx( @@ -1259,12 +1862,11 @@ void cuConvolutionXEx( const CUdeviceptr multipliers, size_t len, int xstep, int offset, double border_ratio) { - CUresult err = CUDA_SUCCESS; ocu_args_d_t &ocu = getOcu(); const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], + CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], xsize, ysize, 1, 1, 1, 1, 0, diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 3a263c3d..c99d6ea9 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -117,7 +117,7 @@ CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init) cuMemAlloc(&mem, s); if (init) { - cuMemcpyHtoDAsync(mem, init, s, this->stream); + cuMemcpyHtoD(mem, init, s); } else { diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 35783e41..4690aff1 100644 --- a/guetzli/processor.cc +++ 
b/guetzli/processor.cc @@ -567,7 +567,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co CoeffData * output_order = NULL; ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; - if (g_useOpenCL || g_checkOpenCL) + if (g_useOpenCL || g_useCuda || g_checkOpenCL) { channel_info orig_channel[3]; channel_info mayout_channel[3]; @@ -588,16 +588,32 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co output_order_gpu.resize(num_blocks * kBlockSize); output_order = output_order_gpu.data(); - clComputeBlockZeroingOrder(output_order, - orig_channel, - comp->imgOpsinDynamicsBlockList.data(), - comp->imgMaskXyzScaleBlockList.data(), - width, - height, - mayout_channel, - factor_x, - comp_mask, - comp->BlockErrorLimit()); + if (g_useCuda) + { + clComputeBlockZeroingOrder(output_order, + orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit()); + } + else + { + clComputeBlockZeroingOrder(output_order, + orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit()); + } } if (!g_useOpenCL || g_checkOpenCL) From 9a6a17cc54374b31fbe952d411771980cd94a27c Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 2 Jun 2017 17:32:41 +0800 Subject: [PATCH 124/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20cuMask=20=E8=AE=A1?= =?UTF-8?q?=E7=AE=97=E7=BB=93=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index b0e9fefe..4a49ef1e 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1844,11 +1844,11 @@ void cuMask( cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, 
ysize); cuMemcpyDtoH(mask_r, mask.r, channel_size); - cuMemcpyDtoH(mask_g, mask.r, channel_size); - cuMemcpyDtoH(mask_b, mask.r, channel_size); + cuMemcpyDtoH(mask_g, mask.g, channel_size); + cuMemcpyDtoH(mask_b, mask.b, channel_size); cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size); - cuMemcpyDtoH(maskdc_g, mask_dc.r, channel_size); - cuMemcpyDtoH(maskdc_b, mask_dc.r, channel_size); + cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size); + cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size); ocl.releaseMemChannels(rgb); ocl.releaseMemChannels(rgb2); From 3345026b39685108d524b4d51c68e20f34f80e28 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 3 Jun 2017 00:04:54 +0800 Subject: [PATCH 125/189] =?UTF-8?q?=E8=B0=83=E6=95=B4cu=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 790 +-------------------------------------- clguetzli/clguetzli.h | 37 +- clguetzli/cuguetzli.cpp | 801 ++++++++++++++++++++++++++++++++++++++++ clguetzli/cuguetzli.h | 37 ++ guetzli.vcxproj | 2 + guetzli.vcxproj.filters | 6 + guetzli/processor.cc | 2 +- 7 files changed, 850 insertions(+), 825 deletions(-) create mode 100644 clguetzli/cuguetzli.cpp create mode 100644 clguetzli/cuguetzli.h diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 4a49ef1e..f50ce17c 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -2,7 +2,6 @@ #include #include #include -#include "ocu.h" extern bool g_useOpenCL = false; extern bool g_useCuda = false; @@ -73,7 +72,6 @@ void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons { size_t channel_size = xsize * ysize * sizeof(float); - cl_int err = 0; ocl_args_d_t &ocl = getOcl(); ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); @@ -82,7 +80,7 @@ void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons clEnqueueReadBuffer(ocl.commandQueue, rgb.r, false, 0, 
channel_size, r, 0, NULL, NULL); clEnqueueReadBuffer(ocl.commandQueue, rgb.g, false, 0, channel_size, g, 0, NULL, NULL); clEnqueueReadBuffer(ocl.commandQueue, rgb.b, false, 0, channel_size, b, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); } @@ -1193,789 +1191,3 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si clReleaseMemObject(blurred); } - -////////////////////////////////////////////////////////////////////////////////////// -void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) -{ - size_t channel_size = xsize * ysize * sizeof(float); - - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); - - cuOpsinDynamicsImageEx(rgb, xsize, ysize); - - cuMemcpyDtoH(r, rgb.r, channel_size); - cuMemcpyDtoH(g, rgb.g, channel_size); - cuMemcpyDtoH(b, rgb.b, channel_size); - - ocu.releaseMemChannels(rgb); -} - -void cuMaskHighIntensityChangeEx( - ocu_channels &xyb0/*in,out*/, - ocu_channels &xyb1/*in,out*/, - const size_t xsize, const size_t ysize) -{ - size_t channel_size = xsize * ysize * sizeof(float); - - ocu_args_d_t &ocl = getOcu(); - - ocu_channels c0 = ocl.allocMemChannels(channel_size); - ocu_channels c1 = ocl.allocMemChannels(channel_size); - - cuMemcpyDtoD(c0.r, xyb0.r, channel_size); - cuMemcpyDtoD(c0.g, xyb0.g, channel_size); - cuMemcpyDtoD(c0.b, xyb0.b, channel_size); - cuMemcpyDtoD(c1.r, xyb1.r, channel_size); - cuMemcpyDtoD(c1.g, xyb1.g, channel_size); - cuMemcpyDtoD(c1.b, xyb1.b, channel_size); - - const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, - &xyb1.r, &xyb1.g, &xyb1.b, - &c0.r, &c0.g, &c0.b, - &c1.r, &c1.g, &c1.b }; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - ocl.releaseMemChannels(c0); - 
ocl.releaseMemChannels(c1); -} - -void cuEdgeDetectorMapEx( - CUdeviceptr result/*out*/, - const ocu_channels &rgb, const ocu_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step) -{ - size_t channel_size = xsize * ysize * sizeof(float); - - ocu_args_d_t &ocl = getOcu(); - - ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); - ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); - - static const double kSigma[3] = { 1.5, 0.586, 0.4 }; - - for (int i = 0; i < 3; i++) - { - cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); - cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); - } - - const void *args[] = { &result, - &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, - &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, - &xsize, &ysize, &step }; - - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR], - res_xsize, res_ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - ocl.releaseMemChannels(rgb_blured); - ocl.releaseMemChannels(rgb2_blured); -} - -void cuBlockDiffMapEx( - CUdeviceptr block_diff_dc/*out*/, - CUdeviceptr block_diff_ac/*out*/, - const ocu_channels &rgb, const ocu_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step) -{ - ocu_args_d_t &ocl = getOcu(); - - const void *args[] = { &block_diff_dc, &block_diff_ac, - &rgb.r, &rgb.g, &rgb.b, - &rgb2.r, &rgb2.g, &rgb2.b, - &xsize, &ysize, &step }; - - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP], - res_xsize, res_ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); -} - -void cuEdgeDetectorLowFreqEx( - CUdeviceptr block_diff_ac/*in,out*/, - const ocu_channels &rgb, 
const ocu_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step) -{ - size_t channel_size = xsize * ysize * sizeof(float); - - static const double kSigma = 14; - - ocu_args_d_t &ocl = getOcu(); - ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); - ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); - - for (int i = 0; i < 3; i++) - { - cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); - cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); - } - - const void *args[] = { &block_diff_ac, - &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, - &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, - &xsize, &ysize, &step }; - - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ], - res_xsize, res_ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - ocl.releaseMemChannels(rgb_blured); - ocl.releaseMemChannels(rgb2_blured); -} - -void cuDiffPrecomputeEx( - ocu_channels &mask/*out*/, - const ocu_channels &xyb0, const ocu_channels &xyb1, - const size_t xsize, const size_t ysize) -{ - ocu_args_d_t &ocl = getOcu(); - - const void *args[] = { &mask.x, &mask.y, &mask.b, - &xyb0.x, &xyb0.y, &xyb0.b, - &xyb1.x, &xyb1.y, &xyb1.b }; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); -} - -void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w) -{ - ocu_args_d_t &ocl = getOcu(); - - const void *args[] = { &img, &w }; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE], - size, 1, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); -} - -void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize) -{ - if (xsize < 4 || 
ysize < 4) { - // TODO: Make this work for small dimensions as well. - return; - } - - ocu_args_d_t &ocl = getOcu(); - - size_t len = xsize * ysize * sizeof(float); - CUdeviceptr img_org = ocl.allocMem(len); - - cuMemcpyDtoD(img_org, img, len); - - const void *args[] = { &img, &img_org}; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - cuMemFree(img_org); -} - -void cuMinSquareValEx( - CUdeviceptr img/*in,out*/, - const size_t xsize, const size_t ysize, - const size_t square_size, const size_t offset) -{ - ocu_args_d_t &ocl = getOcu(); - - CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize); - - const void *args[] = { &srcA, &img, &square_size, &offset}; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize); - cuMemFree(srcA); -} - -void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) -{ - ocu_args_d_t &ocl = getOcu(); - - double extmul = 0.975741017749; - double extoff = -4.25328244168; - double offset = 0.454909521427; - double scaler = 0.0738288224836; - double mul = 20.8029176447; - static double lut_x[512]; - static bool lutx_init = false; - if (!lutx_init) - { - lutx_init = true; - MakeMask(extmul, extoff, mul, offset, scaler, lut_x); - } - - extmul = 0.373995618954; - extoff = 1.5307267433; - offset = 0.911952641929; - scaler = 1.1731667845; - mul = 16.2447033988; - static double lut_y[512]; - static bool luty_init = false; - if (!luty_init) - { - luty_init = true; - MakeMask(extmul, extoff, mul, offset, scaler, lut_y); - } - - extmul = 0.61582234137; - extoff = -4.25376118646; - offset = 1.05105070921; - scaler = 0.47434643535; - mul = 31.1444967089; - static double lut_b[512]; - 
static bool lutb_init = false; - if (!lutb_init) - { - lutb_init = true; - MakeMask(extmul, extoff, mul, offset, scaler, lut_b); - } - - extmul = 1.79116943438; - extoff = -3.86797479189; - offset = 0.670960225853; - scaler = 0.486575865525; - mul = 20.4563479139; - static double lut_dcx[512]; - static bool lutdcx_init = false; - if (!lutdcx_init) - { - lutdcx_init = true; - MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); - } - - extmul = 0.212223514236; - extoff = -3.65647120524; - offset = 1.73396799447; - scaler = 0.170392660501; - mul = 21.6566724788; - static double lut_dcy[512]; - static bool lutdcy_init = false; - if (!lutdcy_init) - { - lutdcy_init = true; - MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); - } - - extmul = 0.349376011816; - extoff = -0.894711072781; - offset = 0.901647926679; - scaler = 0.380086095024; - mul = 18.0373825149; - static double lut_dcb[512]; - static bool lutdcb_init = false; - if (!lutdcb_init) - { - lutdcb_init = true; - MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); - } - - size_t channel_size = 512 * 3 * sizeof(double); - ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); - ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); - - const void *args[] = { &mask.r, &mask.g, &mask.b, - &mask_dc.r, &mask_dc.g, &mask_dc.b, - &xyb.x, &xyb.y, &xyb.b, - &xyb_dc.x, &xyb_dc.y, &xyb_dc.b}; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - ocl.releaseMemChannels(xyb); - ocl.releaseMemChannels(xyb_dc); -} - -void cuMaskEx( - ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, - const ocu_channels &rgb, const ocu_channels &rgb2, - const size_t xsize, const size_t ysize) -{ - cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); - for (int i = 0; i < 3; i++) - { - cuAverage5x5Ex(mask.ch[i], xsize, ysize); - cuMinSquareValEx(mask.ch[i], 
xsize, ysize, 4, 0); - - static const double sigma[3] = { - 9.65781083553, - 14.2644604355, - 4.53358927369, - }; - - cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); - } - - cuDoMask(mask, mask_dc, xsize, ysize); - - for (int i = 0; i < 3; i++) - { - cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); - cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); - } -} - -void cuCombineChannelsEx( - CUdeviceptr result/*out*/, - const ocu_channels &mask, - const ocu_channels &mask_dc, - const size_t xsize, const size_t ysize, - const CUdeviceptr block_diff_dc, - const CUdeviceptr block_diff_ac, - const CUdeviceptr edge_detector_map, - const size_t res_xsize, - const size_t step) -{ - ocu_args_d_t &ocl = getOcu(); - - const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; - const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; - - const void *args[] = { &result, - &mask.r, &mask.g, &mask.b, - &mask_dc.r, &mask_dc.g, &mask_dc.b, - &xsize, &ysize, - &block_diff_dc, &block_diff_ac, &edge_detector_map, - &res_xsize, - &step }; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS], - work_xsize, work_ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); -} - -void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step) -{ - ocu_args_d_t &ocl = getOcu(); - - CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); - - const void *args[] = { &diffmap_out, - &diffmap, - &xsize, &ysize, - &step }; - - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT], - res_xsize, res_ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); - - cuMemFree(diffmap_out); -} 
- -void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step) -{ - ocu_args_d_t &ocl = getOcu(); - - int cls = 8 - step; - int cls2 = (8 - step) / 2; - - const void *args[] = { &out, - &in, - &xsize, - &cls, - &cls2 }; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER], - xsize - cls, ysize - cls, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); -} - -void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in) -{ - ocu_args_d_t &ocl = getOcu(); - - int cls = 8 - step; - int cls2 = (8 - step) / 2; - - const void *args[] = { &out, - &cls, - &cls2, - &in}; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); -} - -void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) -{ - cuUpsampleSquareRootEx(diffmap, xsize, ysize, step); - - static const double kSigma = 8.8510880283; - static const double mul1 = 24.8235314874; - static const double scale = 1.0 / (1.0 + mul1); - - const int s = 8 - step; - int s2 = (8 - step) / 2; - - ocu_args_d_t &ocl = getOcu(); - CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); - cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step); - - static const double border_ratio = 0.03027655136; - cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); - - cuAddBorderEx(diffmap, xsize, ysize, step, blurred); - cuScaleImageEx(diffmap, xsize * ysize, scale); - - cuMemFree(blurred); -} - -void cuDiffmapOpsinDynamicsImage( - float* result, - const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2, - const size_t xsize, const size_t ysize, - const size_t step) -{ - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + 
step - 1) / step; - - size_t channel_size = xsize * ysize * sizeof(float); - size_t channel_step_size = res_xsize * res_ysize * sizeof(float); - - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocl = getOcu(); - ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); - ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - - CUdeviceptr mem_result = ocl.allocMem(channel_size, result); - - CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size); - CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size); - CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size); - - cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - - cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); - cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); - cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); - { - ocu_channels mask = ocl.allocMemChannels(channel_size); - ocu_channels mask_dc = ocl.allocMemChannels(channel_size); - cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); - cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); - - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); - } - - cuCalculateDiffmapEx(mem_result, xsize, ysize, step); - - cuMemcpyDtoH(result, mem_result, channel_size); - - ocl.releaseMemChannels(xyb1); - ocl.releaseMemChannels(xyb0); - - cuMemFree(edge_detector_map); - cuMemFree(block_diff_dc); - cuMemFree(block_diff_ac); - - cuMemFree(mem_result); -} - -void cuComputeBlockZeroingOrder( - guetzli::CoeffData *output_order_batch, - const channel_info orig_channel[3], - const float *orig_image_batch, - const float *mask_scale, - const int image_width, - const int image_height, - const channel_info mayout_channel[3], - const int factor, - const int comp_mask, - const float BlockErrorLimit) -{ - const int block8_width = (image_width + 8 - 1) / 8; - const int 
block8_height = (image_height + 8 - 1) / 8; - const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); - const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); - - using namespace guetzli; - - cl_int err = 0; - ocu_args_d_t &ocl = getOcu(); - - CUdeviceptr mem_orig_coeff[3]; - CUdeviceptr mem_mayout_coeff[3]; - CUdeviceptr mem_mayout_pixel[3]; - for (int c = 0; c < 3; c++) - { - int block_count = orig_channel[c].block_width * orig_channel[c].block_height; - mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); - - block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; - mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); - - mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); - } - CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); - CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); - - int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; - CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); - - const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], - &mem_orig_image, &mem_orig_image, &mem_mask_scale, - &image_width, &image_height, - &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], - &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], - &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], - &factor, - &comp_mask, - &BlockErrorLimit, - &mem_output_order_batch}; - - err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], - blockf_width, blockf_height, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - 
cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); - - for (int c = 0; c < 3; c++) - { - cuMemFree(mem_orig_coeff[c]); - cuMemFree(mem_mayout_coeff[c]); - cuMemFree(mem_mayout_pixel[c]); - - } - - cuMemFree(mem_orig_image); - cuMemFree(mem_mask_scale); - cuMemFree(mem_output_order_batch); -} - -void cuMask( - float* mask_r, float* mask_g, float* mask_b, - float* maskdc_r, float* maskdc_g, float* maskdc_b, - const size_t xsize, const size_t ysize, - const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2) -{ - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocl = getOcu(); - - size_t channel_size = xsize * ysize * sizeof(float); - - ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); - ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); - ocu_channels mask = ocl.allocMemChannels(channel_size); - ocu_channels mask_dc = ocl.allocMemChannels(channel_size); - - cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); - - cuMemcpyDtoH(mask_r, mask.r, channel_size); - cuMemcpyDtoH(mask_g, mask.g, channel_size); - cuMemcpyDtoH(mask_b, mask.b, channel_size); - cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size); - cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size); - cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size); - - ocl.releaseMemChannels(rgb); - ocl.releaseMemChannels(rgb2); - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); -} - -void cuConvolutionXEx( - CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio) -{ - ocu_args_d_t &ocu = getOcu(); - - const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - - CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocu.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocu.stream); -} - -void cuConvolutionYEx( - 
CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio) -{ - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - - const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - - err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocu.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocu.stream); -} - -void cuSquareSampleEx( - CUdeviceptr result/*out*/, - const CUdeviceptr image, size_t xsize, size_t ysize, - size_t xstep, size_t ystep) -{ - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - - const void *args[] = { &result, &image, &xstep, &ystep}; - - err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocu.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocu.stream); -} - -void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, - const double sigma, const double border_ratio, - CUdeviceptr result/*out, opt*/) -{ - double m = 2.25; // Accuracy increases when m is increased. - const double scaler = -1.0 / (2 * sigma * sigma); - // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} - const int diff = std::max(1, m * fabs(sigma)); - const int expn_size = 2 * diff + 1; - std::vector expn(expn_size); - for (int i = -diff; i <= diff; ++i) { - expn[i + diff] = static_cast(exp(scaler * i * i)); - } - - const int xstep = std::max(1, int(sigma / 3)); - - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); - - if (xstep > 1) - { - CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); - cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuConvolutionYEx(result ? 
result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); - cuMemFree(srcA); - } - else - { - CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); - cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuMemFree(srcA); - } - - cuMemFree(mem_expn); -} - -void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) -{ - static const double kSigma = 1.1; - - size_t channel_size = xsize * ysize * sizeof(float); - - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size); - - cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); - cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); - cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); - - void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b}; - - CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], - xsize * ysize, 1, 1, - 1, 1, 1, - 0, - ocu.stream, args, NULL); - - r = cuStreamSynchronize(ocu.stream); - - ocu.releaseMemChannels(rgb_blurred); -} \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 8407a1c5..279884d6 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -6,6 +6,8 @@ #include "ocl.h" #include "clguetzli.cl.h" +#include "cuguetzli.h" + extern bool g_useOpenCL; extern bool g_useCuda; extern bool g_checkOpenCL; @@ -135,41 +137,6 @@ void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); -//////////////////////////////////////////////////////////////// -void 
cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize); - -void cuDiffmapOpsinDynamicsImage( - float* result, - const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2, - const size_t xsize, const size_t ysize, - const size_t step); - -void cuComputeBlockZeroingOrder( - guetzli::CoeffData *output_order_batch, - const channel_info orig_channel[3], - const float *orig_image_batch, - const float *mask_scale, - const int image_width, - const int image_height, - const channel_info mayout_channel[3], - const int factor, - const int comp_mask, - const float BlockErrorLimit); - -void cuMask( - float* mask_r, float* mask_g, float* mask_b, - float* maskdc_r, float* maskdc_g, float* maskdc_b, - const size_t xsize, const size_t ysize, - const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2); - -void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, - const double sigma, const double border_ratio, - CUdeviceptr result = NULL/*out, opt*/); - -void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); - class guetzli::OutputImage; namespace guetzli { diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp new file mode 100644 index 00000000..1bb85334 --- /dev/null +++ b/clguetzli/cuguetzli.cpp @@ -0,0 +1,801 @@ +#include "cuguetzli.h" +#include +#include "ocu.h" + +void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + + cuOpsinDynamicsImageEx(rgb, xsize, ysize); + + cuMemcpyDtoH(r, rgb.r, channel_size); + cuMemcpyDtoH(g, rgb.g, channel_size); + cuMemcpyDtoH(b, rgb.b, channel_size); + + ocu.releaseMemChannels(rgb); +} + +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + 
ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocl = getOcu(); + + ocu_channels c0 = ocl.allocMemChannels(channel_size); + ocu_channels c1 = ocl.allocMemChannels(channel_size); + + cuMemcpyDtoD(c0.r, xyb0.r, channel_size); + cuMemcpyDtoD(c0.g, xyb0.g, channel_size); + cuMemcpyDtoD(c0.b, xyb0.b, channel_size); + cuMemcpyDtoD(c1.r, xyb1.r, channel_size); + cuMemcpyDtoD(c1.g, xyb1.g, channel_size); + cuMemcpyDtoD(c1.b, xyb1.b, channel_size); + + const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(c0); + ocl.releaseMemChannels(c1); +} + +void cuEdgeDetectorMapEx( + CUdeviceptr result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocl = getOcu(); + + ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } + + const void *args[] = { &result, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = 
cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void cuBlockDiffMapEx( + CUdeviceptr block_diff_dc/*out*/, + CUdeviceptr block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &block_diff_dc, &block_diff_ac, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuEdgeDetectorLowFreqEx( + CUdeviceptr block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + static const double kSigma = 14; + + ocu_args_d_t &ocl = getOcu(); + ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } + + const void *args[] = { &block_diff_ac, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void 
cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &mask.x, &mask.y, &mask.b, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &img, &w }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE], + size, 1, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize) +{ + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. + return; + } + + ocu_args_d_t &ocl = getOcu(); + + size_t len = xsize * ysize * sizeof(float); + CUdeviceptr img_org = ocl.allocMem(len); + + cuMemcpyDtoD(img_org, img, len); + + const void *args[] = { &img, &img_org }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemFree(img_org); +} + +void cuMinSquareValEx( + CUdeviceptr img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset) +{ + ocu_args_d_t &ocl = getOcu(); + + CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize); + + const void *args[] = { &srcA, &img, &square_size, &offset }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize); + cuMemFree(srcA); +} + +static 
void MakeMask(double extmul, double extoff, + double mul, double offset, + double scaler, double *result) +{ + for (size_t i = 0; i < 512; ++i) { + const double c = mul / ((0.01 * scaler * i) + offset); + result[i] = 1.0 + extmul * (c + extoff); + result[i] *= result[i]; + } +} + +static const double kInternalGoodQualityThreshold = 14.921561160295326; +static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) +{ + ocu_args_d_t &ocl = getOcu(); + + double extmul = 0.975741017749; + double extoff = -4.25328244168; + double offset = 0.454909521427; + double scaler = 0.0738288224836; + double mul = 20.8029176447; + static double lut_x[512]; + static bool lutx_init = false; + if (!lutx_init) + { + lutx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + } + + extmul = 0.373995618954; + extoff = 1.5307267433; + offset = 0.911952641929; + scaler = 1.1731667845; + mul = 16.2447033988; + static double lut_y[512]; + static bool luty_init = false; + if (!luty_init) + { + luty_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + } + + extmul = 0.61582234137; + extoff = -4.25376118646; + offset = 1.05105070921; + scaler = 0.47434643535; + mul = 31.1444967089; + static double lut_b[512]; + static bool lutb_init = false; + if (!lutb_init) + { + lutb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + } + + extmul = 1.79116943438; + extoff = -3.86797479189; + offset = 0.670960225853; + scaler = 0.486575865525; + mul = 20.4563479139; + static double lut_dcx[512]; + static bool lutdcx_init = false; + if (!lutdcx_init) + { + lutdcx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + } + + extmul = 0.212223514236; + extoff = -3.65647120524; + offset = 1.73396799447; + scaler = 0.170392660501; + mul = 21.6566724788; + static double lut_dcy[512]; + static bool lutdcy_init = false; + if 
(!lutdcy_init) + { + lutdcy_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + } + + extmul = 0.349376011816; + extoff = -0.894711072781; + offset = 0.901647926679; + scaler = 0.380086095024; + mul = 18.0373825149; + static double lut_dcb[512]; + static bool lutdcb_init = false; + if (!lutdcb_init) + { + lutdcb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + } + + size_t channel_size = 512 * 3 * sizeof(double); + ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); + + const void *args[] = { &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xyb.x, &xyb.y, &xyb.b, + &xyb_dc.x, &xyb_dc.y, &xyb_dc.b }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(xyb); + ocl.releaseMemChannels(xyb_dc); +} + +void cuMaskEx( + ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize) +{ + cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); + for (int i = 0; i < 3; i++) + { + cuAverage5x5Ex(mask.ch[i], xsize, ysize); + cuMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0); + + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + + cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); + } + + cuDoMask(mask, mask_dc, xsize, ysize); + + for (int i = 0; i < 3; i++) + { + cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + } +} + +void cuCombineChannelsEx( + CUdeviceptr result/*out*/, + const ocu_channels &mask, + const ocu_channels &mask_dc, + const size_t xsize, const size_t ysize, + const CUdeviceptr block_diff_dc, + const CUdeviceptr block_diff_ac, 
+ const CUdeviceptr edge_detector_map, + const size_t res_xsize, + const size_t step) +{ + ocu_args_d_t &ocl = getOcu(); + + const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + + const void *args[] = { &result, + &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xsize, &ysize, + &block_diff_dc, &block_diff_ac, &edge_detector_map, + &res_xsize, + &step }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS], + work_xsize, work_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocl = getOcu(); + + CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); + + const void *args[] = { &diffmap_out, + &diffmap, + &xsize, &ysize, + &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); + + cuMemFree(diffmap_out); +} + +void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocl = getOcu(); + + int cls = 8 - step; + int cls2 = (8 - step) / 2; + + const void *args[] = { &out, + &in, + &xsize, + &cls, + &cls2 }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER], + xsize - cls, ysize - cls, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in) +{ + ocu_args_d_t &ocl = getOcu(); + + int cls = 8 - step; + int cls2 = 
(8 - step) / 2; + + const void *args[] = { &out, + &cls, + &cls2, + &in }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) +{ + cuUpsampleSquareRootEx(diffmap, xsize, ysize, step); + + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + + const int s = 8 - step; + int s2 = (8 - step) / 2; + + ocu_args_d_t &ocl = getOcu(); + CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step); + + static const double border_ratio = 0.03027655136; + cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); + + cuAddBorderEx(diffmap, xsize, ysize, step, blurred); + cuScaleImageEx(diffmap, xsize * ysize, scale); + + cuMemFree(blurred); +} + +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step) +{ + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocl = getOcu(); + ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + CUdeviceptr mem_result = ocl.allocMem(channel_size, result); + + CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size); + CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size); + CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size); + + 
cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocu_channels mask = ocl.allocMemChannels(channel_size); + ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } + + cuCalculateDiffmapEx(mem_result, xsize, ysize, step); + + cuMemcpyDtoH(result, mem_result, channel_size); + + ocl.releaseMemChannels(xyb1); + ocl.releaseMemChannels(xyb0); + + cuMemFree(edge_detector_map); + cuMemFree(block_diff_dc); + cuMemFree(block_diff_ac); + + cuMemFree(mem_result); +} + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) +{ + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + + cl_int err = 0; + ocu_args_d_t &ocl = getOcu(); + + CUdeviceptr mem_orig_coeff[3]; + CUdeviceptr mem_mayout_coeff[3]; + CUdeviceptr mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + + block_count = 
mayout_channel[c].block_width * mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + + mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + } + CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); + + const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], + &mem_orig_image, &mem_orig_image, &mem_mask_scale, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + &mem_output_order_batch }; + + err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], + blockf_width, blockf_height, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); + + for (int c = 0; c < 3; c++) + { + cuMemFree(mem_orig_coeff[c]); + cuMemFree(mem_mayout_coeff[c]); + cuMemFree(mem_mayout_pixel[c]); + + } + + cuMemFree(mem_orig_image); + cuMemFree(mem_mask_scale); + cuMemFree(mem_output_order_batch); +} + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) +{ + CUresult err = CUDA_SUCCESS; + 
ocu_args_d_t &ocl = getOcu(); + + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + ocu_channels mask = ocl.allocMemChannels(channel_size); + ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + + cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); + + cuMemcpyDtoH(mask_r, mask.r, channel_size); + cuMemcpyDtoH(mask_g, mask.g, channel_size); + cuMemcpyDtoH(mask_b, mask.b, channel_size); + cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size); + cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size); + cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + +void cuConvolutionXEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio) +{ + ocu_args_d_t &ocu = getOcu(); + + const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuConvolutionYEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio) +{ + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + + const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuSquareSampleEx( + CUdeviceptr result/*out*/, + const CUdeviceptr image, size_t xsize, size_t ysize, + size_t xstep, 
size_t ystep) +{ + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + + const void *args[] = { &result, &image, &xstep, &ystep }; + + err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + CUdeviceptr result/*out, opt*/) +{ + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); + + if (xstep > 1) + { + CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + cuMemFree(srcA); + } + else + { + CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? 
result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuMemFree(srcA); + } + + cuMemFree(mem_expn); +} + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) +{ + static const double kSigma = 1.1; + + size_t channel_size = xsize * ysize * sizeof(float); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size); + + cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; + + CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], + xsize * ysize, 1, 1, + 1, 1, 1, + 0, + ocu.stream, args, NULL); + + r = cuStreamSynchronize(ocu.stream); + + ocu.releaseMemChannels(rgb_blurred); +} diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h new file mode 100644 index 00000000..14c607cc --- /dev/null +++ b/clguetzli/cuguetzli.h @@ -0,0 +1,37 @@ +#pragma once +#include "guetzli/processor.h" +#include "clguetzli.cl.h" + +void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize); + +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step); + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit); + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const 
size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2); + +void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + CUdeviceptr result = NULL/*out, opt*/); + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); \ No newline at end of file diff --git a/guetzli.vcxproj b/guetzli.vcxproj index b8798eb2..fc36b9a0 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -204,6 +204,7 @@ + @@ -302,6 +303,7 @@ + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 07f56763..38921bde 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -315,6 +315,9 @@ clguetzli + + clguetzli + @@ -587,6 +590,9 @@ clguetzli + + clguetzli + diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 4690aff1..63ebb609 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -567,7 +567,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co CoeffData * output_order = NULL; ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; - if (g_useOpenCL || g_useCuda || g_checkOpenCL) + if (g_useOpenCL || g_checkOpenCL) { channel_info orig_channel[3]; channel_info mayout_channel[3]; From 5d49f244ed2209d791bc409a298215019f8f3ca0 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 3 Jun 2017 02:08:39 +0800 Subject: [PATCH 126/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 73 +--- clguetzli/clguetzli.h | 10 +- clguetzli/cuguetzli.cpp | 804 ++++++++++++++++++++-------------------- clguetzli/cuguetzli.h | 82 +++- 4 files changed, 492 insertions(+), 477 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index f50ce17c..0774a074 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ 
-89,8 +89,8 @@ void clDiffmapOpsinDynamicsImage( float* result, const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, - size_t xsize, size_t ysize, - size_t step) + const size_t xsize, const size_t ysize, + const size_t step) { const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -98,7 +98,6 @@ void clDiffmapOpsinDynamicsImage( size_t channel_size = xsize * ysize * sizeof(float); size_t channel_step_size = res_xsize * res_ysize * sizeof(float); - cl_int err = 0; ocl_args_d_t &ocl = getOcl(); ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); @@ -127,7 +126,7 @@ void clDiffmapOpsinDynamicsImage( clCalculateDiffmapEx(mem_result, xsize, ysize, step); clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_int err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(xyb1); ocl.releaseMemChannels(xyb0); @@ -242,7 +241,6 @@ void clMask( const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); size_t channel_size = xsize * ysize * sizeof(float); @@ -260,7 +258,7 @@ void clMask( clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL); clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL); clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_int err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); ocl.releaseMemChannels(rgb2); @@ -410,54 +408,7 @@ void clSquareSampleEx( } void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, - const double sigma, const double border_ratio, - cl_mem result/*out, opt*/) -{ - 
clBlurEx2(image, xsize, ysize, sigma, border_ratio, result); - - return; -/* - double m = 2.25; // Accuracy increases when m is increased. - const double scaler = -1.0 / (2 * sigma * sigma); - // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} - const int diff = std::max(1, m * fabs(sigma)); - const int expn_size = 2 * diff + 1; - std::vector expn(expn_size); - for (int i = -diff; i <= diff; ++i) { - expn[i + diff] = static_cast(exp(scaler * i * i)); - } - - const int xstep = std::max(1, int(sigma / 3)); - const int ystep = xstep; - int dxsize = (xsize + xstep - 1) / xstep; - int dysize = (ysize + ystep - 1) / ystep; - - cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); - - if (xstep > 1) - { - ocl.allocA(sizeof(cl_float) * dxsize * ysize); - ocl.allocB(sizeof(cl_float) * dxsize * dysize); - - clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - clConvolutionEx(ocl.srcB, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio); - clUpsampleEx(result ? result : image, ocl.srcB, xsize, ysize, xstep, ystep); - } - else - { - ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - clConvolutionEx(result ? result : image, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio); - } - - clReleaseMemObject(mem_expn); -*/ -} - -void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, - double sigma, double border_ratio, + const double sigma, const double border_ratio, cl_mem result/*out, opt*/) { double m = 2.25; // Accuracy increases when m is increased. 
@@ -538,7 +489,6 @@ void clMaskHighIntensityChangeEx( { size_t channel_size = xsize * ysize * sizeof(float); - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); ocl_channels c0 = ocl.allocMemChannels(channel_size); @@ -550,7 +500,7 @@ void clMaskHighIntensityChangeEx( clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL); clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL); clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL); - err = clFinish(ocl.commandQueue); + cl_int err = clFinish(ocl.commandQueue); cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r); @@ -588,8 +538,6 @@ void clEdgeDetectorMapEx( const size_t xsize, const size_t ysize, const size_t step) { size_t channel_size = xsize * ysize * sizeof(float); - - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); @@ -623,7 +571,7 @@ void clEdgeDetectorMapEx( const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize}; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clEdgeDetectorMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); @@ -644,7 +592,6 @@ void clBlockDiffMapEx( const ocl_channels &rgb, const ocl_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_int clxsize = xsize; @@ -668,7 +615,7 @@ void clBlockDiffMapEx( const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, 
NULL); + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clBlockDiffMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); @@ -688,8 +635,6 @@ void clEdgeDetectorLowFreqEx( size_t channel_size = xsize * ysize * sizeof(float); static const double kSigma = 14; - - cl_int err = 0; ocl_args_d_t &ocl = getOcl(); ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); @@ -720,7 +665,7 @@ void clEdgeDetectorLowFreqEx( const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clEdgeDetectorLowFreqEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 279884d6..31b10e36 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,6 +1,5 @@ #pragma once #include -#include "CL/cl.h" #include "guetzli/processor.h" #include "guetzli/butteraugli_comparator.h" #include "ocl.h" @@ -33,8 +32,7 @@ void clComputeBlockZeroingOrder( const channel_info mayout_channel[3], const int factor, const int comp_mask, - const float BlockErrorLimit - ); + const float BlockErrorLimit); void clMask( float* mask_r, float* mask_g, float* mask_b, @@ -70,10 +68,6 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, cl_mem result = nullptr/*out, opt*/); -void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize, - double sigma, double border_ratio, - cl_mem result = NULL/*out, opt*/); - void clOpsinDynamicsImageEx(ocl_channels &rgb, const 
size_t xsize, const size_t ysize); void clMaskHighIntensityChangeEx( @@ -129,8 +123,6 @@ void clCombineChannelsEx( void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step); -void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step); - void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step); void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int step, const cl_mem in); diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 1bb85334..5eaba7e5 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -6,189 +6,472 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons { size_t channel_size = xsize * ysize * sizeof(float); - ocu_args_d_t &ocu = getOcu(); - ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + ocu_args_d_t &ocl = getOcu(); + ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); cuOpsinDynamicsImageEx(rgb, xsize, ysize); cuMemcpyDtoH(r, rgb.r, channel_size); cuMemcpyDtoH(g, rgb.g, channel_size); - cuMemcpyDtoH(b, rgb.b, channel_size); + cuMemcpyDtoH(b, rgb.b, channel_size); - ocu.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb); } -void cuMaskHighIntensityChangeEx( - ocu_channels &xyb0/*in,out*/, - ocu_channels &xyb1/*in,out*/, - const size_t xsize, const size_t ysize) +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step) { + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); ocu_args_d_t &ocl = getOcu(); + ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocu_channels 
xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - ocu_channels c0 = ocl.allocMemChannels(channel_size); - ocu_channels c1 = ocl.allocMemChannels(channel_size); + CUdeviceptr mem_result = ocl.allocMem(channel_size, result); - cuMemcpyDtoD(c0.r, xyb0.r, channel_size); - cuMemcpyDtoD(c0.g, xyb0.g, channel_size); - cuMemcpyDtoD(c0.b, xyb0.b, channel_size); - cuMemcpyDtoD(c1.r, xyb1.r, channel_size); - cuMemcpyDtoD(c1.g, xyb1.g, channel_size); - cuMemcpyDtoD(c1.b, xyb1.b, channel_size); + CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size); + CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size); + CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size); - const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, - &xyb1.r, &xyb1.g, &xyb1.b, - &c0.r, &c0.g, &c0.b, - &c1.r, &c1.g, &c1.b }; + cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); + cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocu_channels mask = ocl.allocMemChannels(channel_size); + ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); - err = cuStreamSynchronize(ocl.stream); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } - ocl.releaseMemChannels(c0); - ocl.releaseMemChannels(c1); + cuCalculateDiffmapEx(mem_result, xsize, ysize, step); + + cuMemcpyDtoH(result, mem_result, channel_size); + + ocl.releaseMemChannels(xyb1); + ocl.releaseMemChannels(xyb0); + + cuMemFree(edge_detector_map); + cuMemFree(block_diff_dc); + 
cuMemFree(block_diff_ac); + + cuMemFree(mem_result); } -void cuEdgeDetectorMapEx( - CUdeviceptr result/*out*/, - const ocu_channels &rgb, const ocu_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step) + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) { - size_t channel_size = xsize * ysize * sizeof(float); + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + cl_int err = 0; ocu_args_d_t &ocl = getOcu(); - ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); - ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + CUdeviceptr mem_orig_coeff[3]; + CUdeviceptr mem_mayout_coeff[3]; + CUdeviceptr mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); - static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); - for (int i = 0; i < 3; i++) - { - cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); - cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); } + CUdeviceptr mem_orig_image = 
ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); - const void *args[] = { &result, - &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, - &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, - &xsize, &ysize, &step }; + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; + const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], + &mem_orig_image, &mem_orig_image, &mem_mask_scale, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + &mem_output_order_batch }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR], - res_xsize, res_ysize, 1, + err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], + blockf_width, blockf_height, 1, 1, 1, 1, 0, ocl.stream, (void**)args, NULL); err = cuStreamSynchronize(ocl.stream); - ocl.releaseMemChannels(rgb_blured); - ocl.releaseMemChannels(rgb2_blured); -} - -void cuBlockDiffMapEx( - CUdeviceptr block_diff_dc/*out*/, - CUdeviceptr block_diff_ac/*out*/, - const ocu_channels &rgb, const ocu_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step) -{ - ocu_args_d_t &ocl = getOcu(); - - const void *args[] = { &block_diff_dc, &block_diff_ac, - &rgb.r, &rgb.g, &rgb.b, - &rgb2.r, &rgb2.g, &rgb2.b, - &xsize, &ysize, &step }; + cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); - const size_t res_xsize = (xsize + step - 1) / step; - 
const size_t res_ysize = (ysize + step - 1) / step; + for (int c = 0; c < 3; c++) + { + cuMemFree(mem_orig_coeff[c]); + cuMemFree(mem_mayout_coeff[c]); + cuMemFree(mem_mayout_pixel[c]); - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP], - res_xsize, res_ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); + } - err = cuStreamSynchronize(ocl.stream); + cuMemFree(mem_orig_image); + cuMemFree(mem_mask_scale); + cuMemFree(mem_output_order_batch); } -void cuEdgeDetectorLowFreqEx( - CUdeviceptr block_diff_ac/*in,out*/, - const ocu_channels &rgb, const ocu_channels &rgb2, - const size_t xsize, const size_t ysize, const size_t step) +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) { - size_t channel_size = xsize * ysize * sizeof(float); - - static const double kSigma = 14; - ocu_args_d_t &ocl = getOcu(); - ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); - ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); - - for (int i = 0; i < 3; i++) - { - cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); - cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); - } - const void *args[] = { &block_diff_ac, - &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, - &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, - &xsize, &ysize, &step }; + size_t channel_size = xsize * ysize * sizeof(float); - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; + ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + ocu_channels mask = ocl.allocMemChannels(channel_size); + ocu_channels mask_dc = ocl.allocMemChannels(channel_size); - CUresult err = 
cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ], - res_xsize, res_ysize, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); + cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); - err = cuStreamSynchronize(ocl.stream); + cuMemcpyDtoH(mask_r, mask.r, channel_size); + cuMemcpyDtoH(mask_g, mask.g, channel_size); + cuMemcpyDtoH(mask_b, mask.b, channel_size); + cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size); + cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size); + cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size); - ocl.releaseMemChannels(rgb_blured); - ocl.releaseMemChannels(rgb2_blured); + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); } -void cuDiffPrecomputeEx( - ocu_channels &mask/*out*/, - const ocu_channels &xyb0, const ocu_channels &xyb1, - const size_t xsize, const size_t ysize) +void cuConvolutionXEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - const void *args[] = { &mask.x, &mask.y, &mask.b, - &xyb0.x, &xyb0.y, &xyb0.b, - &xyb1.x, &xyb1.y, &xyb1.b }; + const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE], + CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocu.stream, (void**)args, NULL); - err = cuStreamSynchronize(ocl.stream); + err = cuStreamSynchronize(ocu.stream); } -void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w) +void cuConvolutionYEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio) { - ocu_args_d_t &ocl = getOcu(); + CUresult err = CUDA_SUCCESS; + 
ocu_args_d_t &ocu = getOcu(); - const void *args[] = { &img, &w }; + const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE], - size, 1, 1, + err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY], + xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuSquareSampleEx( + CUdeviceptr result/*out*/, + const CUdeviceptr image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep) +{ + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + + const void *args[] = { &result, &image, &xstep, &ystep }; + + err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocu.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocu.stream); +} + +void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + CUdeviceptr result/*out, opt*/) +{ + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocu = getOcu(); + CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); + + if (xstep > 1) + { + CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuSquareSampleEx(result ? result : image, result ? 
result : image, xsize, ysize, xstep, xstep); + cuMemFree(srcA); + } + else + { + CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuMemFree(srcA); + } + + cuMemFree(mem_expn); +} + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) +{ + static const double kSigma = 1.1; + + size_t channel_size = xsize * ysize * sizeof(float); + + CUresult err = CUDA_SUCCESS; + ocu_args_d_t &ocl = getOcu(); + ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size); + + cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; + + CUresult r = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE], + xsize * ysize, 1, 1, + 1, 1, 1, + 0, + ocl.stream, args, NULL); + + r = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(rgb_blurred); +} + +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocl = getOcu(); + + ocu_channels c0 = ocl.allocMemChannels(channel_size); + ocu_channels c1 = ocl.allocMemChannels(channel_size); + + cuMemcpyDtoD(c0.r, xyb0.r, channel_size); + cuMemcpyDtoD(c0.g, xyb0.g, channel_size); + cuMemcpyDtoD(c0.b, xyb0.b, channel_size); + cuMemcpyDtoD(c1.r, xyb1.r, channel_size); + cuMemcpyDtoD(c1.g, xyb1.g, channel_size); + cuMemcpyDtoD(c1.b, xyb1.b, channel_size); + + const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b }; + + CUresult err = 
cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(c0); + ocl.releaseMemChannels(c1); +} + +void cuEdgeDetectorMapEx( + CUdeviceptr result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocl = getOcu(); + + ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } + + const void *args[] = { &result, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void cuBlockDiffMapEx( + CUdeviceptr block_diff_dc/*out*/, + CUdeviceptr block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &block_diff_dc, &block_diff_ac, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP], + res_xsize, 
res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuEdgeDetectorLowFreqEx( + CUdeviceptr block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + static const double kSigma = 14; + + ocu_args_d_t &ocl = getOcu(); + ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } + + const void *args[] = { &block_diff_ac, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ], + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &mask.x, &mask.y, &mask.b, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE], + xsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + +void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w) +{ + ocu_args_d_t &ocl = getOcu(); + + const void *args[] = { &img, &w }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE], + size, 1, 
1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); err = cuStreamSynchronize(ocl.stream); } @@ -516,286 +799,3 @@ void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, con cuMemFree(blurred); } -void cuDiffmapOpsinDynamicsImage( - float* result, - const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2, - const size_t xsize, const size_t ysize, - const size_t step) -{ - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - - size_t channel_size = xsize * ysize * sizeof(float); - size_t channel_step_size = res_xsize * res_ysize * sizeof(float); - - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocl = getOcu(); - ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); - ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - - CUdeviceptr mem_result = ocl.allocMem(channel_size, result); - - CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size); - CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size); - CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size); - - cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - - cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); - cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); - cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); - { - ocu_channels mask = ocl.allocMemChannels(channel_size); - ocu_channels mask_dc = ocl.allocMemChannels(channel_size); - cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); - cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); - - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); - } - - cuCalculateDiffmapEx(mem_result, xsize, ysize, step); - - cuMemcpyDtoH(result, mem_result, channel_size); - - ocl.releaseMemChannels(xyb1); - ocl.releaseMemChannels(xyb0); - - 
cuMemFree(edge_detector_map); - cuMemFree(block_diff_dc); - cuMemFree(block_diff_ac); - - cuMemFree(mem_result); -} - -void cuComputeBlockZeroingOrder( - guetzli::CoeffData *output_order_batch, - const channel_info orig_channel[3], - const float *orig_image_batch, - const float *mask_scale, - const int image_width, - const int image_height, - const channel_info mayout_channel[3], - const int factor, - const int comp_mask, - const float BlockErrorLimit) -{ - const int block8_width = (image_width + 8 - 1) / 8; - const int block8_height = (image_height + 8 - 1) / 8; - const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); - const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); - - using namespace guetzli; - - cl_int err = 0; - ocu_args_d_t &ocl = getOcu(); - - CUdeviceptr mem_orig_coeff[3]; - CUdeviceptr mem_mayout_coeff[3]; - CUdeviceptr mem_mayout_pixel[3]; - for (int c = 0; c < 3; c++) - { - int block_count = orig_channel[c].block_width * orig_channel[c].block_height; - mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); - - block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; - mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); - - mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); - } - CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); - CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); - - int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; - CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); - - const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], - &mem_orig_image, &mem_orig_image, 
&mem_mask_scale, - &image_width, &image_height, - &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], - &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], - &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], - &factor, - &comp_mask, - &BlockErrorLimit, - &mem_output_order_batch }; - - err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], - blockf_width, blockf_height, 1, - 1, 1, 1, - 0, - ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - - cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); - - for (int c = 0; c < 3; c++) - { - cuMemFree(mem_orig_coeff[c]); - cuMemFree(mem_mayout_coeff[c]); - cuMemFree(mem_mayout_pixel[c]); - - } - - cuMemFree(mem_orig_image); - cuMemFree(mem_mask_scale); - cuMemFree(mem_output_order_batch); -} - -void cuMask( - float* mask_r, float* mask_g, float* mask_b, - float* maskdc_r, float* maskdc_g, float* maskdc_b, - const size_t xsize, const size_t ysize, - const float* r, const float* g, const float* b, - const float* r2, const float* g2, const float* b2) -{ - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocl = getOcu(); - - size_t channel_size = xsize * ysize * sizeof(float); - - ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); - ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); - ocu_channels mask = ocl.allocMemChannels(channel_size); - ocu_channels mask_dc = ocl.allocMemChannels(channel_size); - - cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); - - cuMemcpyDtoH(mask_r, mask.r, channel_size); - cuMemcpyDtoH(mask_g, mask.g, channel_size); - cuMemcpyDtoH(mask_b, mask.b, channel_size); - cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size); - cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size); - cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size); - - ocl.releaseMemChannels(rgb); - ocl.releaseMemChannels(rgb2); - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); -} - -void cuConvolutionXEx( - 
CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio) -{ - ocu_args_d_t &ocu = getOcu(); - - const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - - CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocu.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocu.stream); -} - -void cuConvolutionYEx( - CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio) -{ - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - - const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - - err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocu.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocu.stream); -} - -void cuSquareSampleEx( - CUdeviceptr result/*out*/, - const CUdeviceptr image, size_t xsize, size_t ysize, - size_t xstep, size_t ystep) -{ - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - - const void *args[] = { &result, &image, &xstep, &ystep }; - - err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], - xsize, ysize, 1, - 1, 1, 1, - 0, - ocu.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocu.stream); -} - -void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, - const double sigma, const double border_ratio, - CUdeviceptr result/*out, opt*/) -{ - double m = 2.25; // Accuracy increases when m is increased. 
- const double scaler = -1.0 / (2 * sigma * sigma); - // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} - const int diff = std::max(1, m * fabs(sigma)); - const int expn_size = 2 * diff + 1; - std::vector expn(expn_size); - for (int i = -diff; i <= diff; ++i) { - expn[i + diff] = static_cast(exp(scaler * i * i)); - } - - const int xstep = std::max(1, int(sigma / 3)); - - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); - - if (xstep > 1) - { - CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); - cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); - cuMemFree(srcA); - } - else - { - CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); - cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuConvolutionYEx(result ? 
result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuMemFree(srcA); - } - - cuMemFree(mem_expn); -} - -void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) -{ - static const double kSigma = 1.1; - - size_t channel_size = xsize * ysize * sizeof(float); - - CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); - ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size); - - cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); - cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); - cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); - - void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; - - CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], - xsize * ysize, 1, 1, - 1, 1, 1, - 0, - ocu.stream, args, NULL); - - r = cuStreamSynchronize(ocu.stream); - - ocu.releaseMemChannels(rgb_blurred); -} diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h index 14c607cc..0783796a 100644 --- a/clguetzli/cuguetzli.h +++ b/clguetzli/cuguetzli.h @@ -2,7 +2,9 @@ #include "guetzli/processor.h" #include "clguetzli.cl.h" -void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize); +void cuOpsinDynamicsImage( + float *r, float *g, float *b, + const size_t xsize, const size_t ysize); void cuDiffmapOpsinDynamicsImage( float* result, @@ -30,8 +32,84 @@ void cuMask( const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2); +void cuConvolutionXEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio); + +void cuConvolutionYEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, double border_ratio); + +void cuSquareSampleEx( + CUdeviceptr 
result/*out*/, + const CUdeviceptr image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep); + void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, CUdeviceptr result = NULL/*out, opt*/); -void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); \ No newline at end of file +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); + +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize); + +void cuEdgeDetectorMapEx( + CUdeviceptr result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuBlockDiffMapEx( + CUdeviceptr block_diff_dc/*out*/, + CUdeviceptr block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuEdgeDetectorLowFreqEx( + CUdeviceptr block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize); + +void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w); + +void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize); + +void cuMinSquareValEx( + CUdeviceptr img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset); + +void cuMaskEx( + ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize); + +void cuCombineChannelsEx( + CUdeviceptr result/*out*/, + const ocu_channels &mask, + const ocu_channels &mask_dc, + const size_t xsize, const size_t ysize, + 
const CUdeviceptr block_diff_dc, + const CUdeviceptr block_diff_ac, + const CUdeviceptr edge_detector_map, + const size_t res_xsize, + const size_t step); + +void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step); + +void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step); + +void cuAddBorderEx(CUdeviceptr out, const size_t xsize, const size_t ysize, const int step, const CUdeviceptr in); + +void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); From b4d0ffe9dde8b2ee45ccfe043d08de7043ba957e Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 3 Jun 2017 02:49:23 +0800 Subject: [PATCH 127/189] =?UTF-8?q?=E7=AE=80=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 311 ++++++++++------------------------- clguetzli/clguetzli_test.cpp | 6 +- clguetzli/cuguetzli.cpp | 16 +- clguetzli/ocl.cpp | 133 +-------------- clguetzli/ocl.h | 20 +-- 5 files changed, 104 insertions(+), 382 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 0774a074..c6c5eb4f 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -16,10 +16,7 @@ ocl_args_d_t& getOcl(void) bInit = true; cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); - if (CL_SUCCESS != err) - { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); char* source = nullptr; size_t src_size = 0; @@ -30,21 +27,18 @@ ocl_args_d_t& getOcl(void) delete[] source; err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); - - if (err == CL_BUILD_PROGRAM_FAILURE) - { - size_t log_size = 0; - clGetProgramBuildInfo(ocl.program, 
ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + LOG_CL_RESULT(err); + if (CL_BUILD_PROGRAM_FAILURE == err) + { + size_t log_size = 0; + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - std::vector build_log(log_size); - clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); + std::vector build_log(log_size); + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); - LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); - } - } + LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); + } + ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err); ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err); ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err); @@ -208,15 +202,9 @@ void clComputeBlockZeroingOrder( size_t globalWorkSize[2] = { blockf_width, blockf_height }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); clEnqueueReadBuffer(ocl.commandQueue, mem_output_order_batch, false, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL); clFinish(ocl.commandQueue); @@ -226,7 +214,6 @@ void clComputeBlockZeroingOrder( clReleaseMemObject(mem_orig_coeff[c]); clReleaseMemObject(mem_mayout_coeff[c]); clReleaseMemObject(mem_mayout_pixel[c]); - } clReleaseMemObject(mem_orig_image); @@ -258,7 +245,7 @@ void clMask( 
clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL); clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL); clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL); - cl_int err = clFinish(ocl.commandQueue); + clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); ocl.releaseMemChannels(rgb2); @@ -272,7 +259,6 @@ void clConvolutionEx( const cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); size_t oxsize = (xsize + xstep - 1) / xstep; @@ -294,16 +280,10 @@ void clConvolutionEx( clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); size_t globalWorkSize[2] = { oxsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clConvolutionXEx( @@ -312,7 +292,6 @@ void clConvolutionXEx( const cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_int clxstep = xstep; @@ -330,16 +309,10 @@ void clConvolutionXEx( clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", 
TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clConvolutionYEx( @@ -348,7 +321,6 @@ void clConvolutionYEx( const cl_mem multipliers, size_t len, int xstep, int offset, double border_ratio) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_int clxstep = xstep; @@ -366,16 +338,10 @@ void clConvolutionYEx( clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clSquareSampleEx( @@ -383,7 +349,6 @@ void clSquareSampleEx( const cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_int clxstep = xstep; @@ -395,16 +360,10 @@ void clSquareSampleEx( clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clUpsampleEx clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, 
NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clUpsampleEx clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, @@ -423,23 +382,24 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, const int xstep = std::max(1, int(sigma / 3)); - cl_int err = 0; ocl_args_d_t &ocl = getOcl(); cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); if (xstep > 1) { - ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - clConvolutionYEx(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); clSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + clReleaseMemObject(m); } else { - ocl.allocA(sizeof(cl_float) * xsize * ysize); - clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - clConvolutionYEx(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - } + cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? 
result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clReleaseMemObject(m); + } clReleaseMemObject(mem_expn); } @@ -450,7 +410,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t size_t channel_size = xsize * ysize * sizeof(float); - cl_int err = 0; ocl_args_d_t &ocl = getOcl(); ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size); @@ -467,21 +426,14 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b); size_t globalWorkSize[1] = { xsize * ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clOpsinDynamicsImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clOpsinDynamicsImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); ocl.releaseMemChannels(rgb_blurred); } - void clMaskHighIntensityChangeEx( ocl_channels &xyb0/*in,out*/, ocl_channels &xyb1/*in,out*/, @@ -500,7 +452,7 @@ void clMaskHighIntensityChangeEx( clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL); clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL); clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL); - cl_int err = clFinish(ocl.commandQueue); + clFinish(ocl.commandQueue); cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r); @@ -517,16 +469,10 @@ void clMaskHighIntensityChangeEx( clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&c1.b); size_t globalWorkSize[2] = { xsize, ysize }; - 
err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clMaskHighIntensityChangeEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clMaskHighIntensityChangeEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); ocl.releaseMemChannels(c0); ocl.releaseMemChannels(c1); @@ -572,15 +518,9 @@ void clEdgeDetectorMapEx( size_t globalWorkSize[2] = { res_xsize, res_ysize}; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clEdgeDetectorMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clEdgeDetectorMapEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); ocl.releaseMemChannels(rgb_blured); ocl.releaseMemChannels(rgb2_blured); @@ -616,15 +556,9 @@ void clBlockDiffMapEx( size_t globalWorkSize[2] = { res_xsize, res_ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clBlockDiffMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clBlockDiffMapEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clEdgeDetectorLowFreqEx( @@ -666,15 +600,9 @@ void clEdgeDetectorLowFreqEx( size_t globalWorkSize[2] = { res_xsize, res_ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, 
kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clEdgeDetectorLowFreqEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clEdgeDetectorLowFreqEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); ocl.releaseMemChannels(rgb_blured); ocl.releaseMemChannels(rgb2_blured); @@ -685,7 +613,6 @@ void clDiffPrecomputeEx( const ocl_channels &xyb0, const ocl_channels &xyb1, const size_t xsize, const size_t ysize) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; @@ -700,21 +627,14 @@ void clDiffPrecomputeEx( clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb1.b); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clDiffPrecomputeEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clDiffPrecomputeEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_double clscale = w; @@ -724,16 +644,10 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&clscale); size_t globalWorkSize[1] = { size }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", 
TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize) @@ -743,30 +657,24 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize return; } - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); size_t len = xsize * ysize * sizeof(float); - ocl.allocA(len); - cl_mem img_org = ocl.srcA; + cl_mem img_org = ocl.allocMem(len); - err = clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL); cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5]; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img_org); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clAverage5x5Ex() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clAverage5x5Ex() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); + + clReleaseMemObject(img_org); } void clMinSquareValEx( @@ -774,36 +682,26 @@ void clMinSquareValEx( const size_t xsize, const size_t ysize, const size_t square_size, const size_t offset) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_int cloffset = offset; cl_int clsquare_size = square_size; - 
ocl.allocA(sizeof(cl_float) * xsize * ysize); + cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize); cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA); + clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size); clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clMinSquareValEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } - - err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.srcA, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clMinSquareValEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clMinSquareValEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); + clReleaseMemObject(result); } static void MakeMask(double extmul, double extoff, @@ -822,7 +720,6 @@ static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); double extmul = 0.975741017749; @@ -922,16 +819,10 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz clSetKernelArg(kernel, 11, sizeof(cl_mem), 
(void*)&xyb_dc.b); size_t globalWorkSize[2] = { xsize, ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clDoMask() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clDoMask() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); ocl.releaseMemChannels(xyb); ocl.releaseMemChannels(xyb_dc); @@ -977,7 +868,6 @@ void clCombineChannelsEx( const size_t res_xsize, const size_t step) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; @@ -1005,21 +895,14 @@ void clCombineChannelsEx( clSetKernelArg(kernel, 13, sizeof(cl_int), (void*)&clstep); size_t globalWorkSize[2] = { work_xsize, work_ysize }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clCombineChannelsEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clCombineChannelsEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step) { - cl_int err = CL_SUCCESS; ocl_args_d_t &ocl = getOcl(); cl_int clxsize = xsize; @@ -1039,29 +922,18 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize 
}; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } - err = clFinish(ocl.commandQueue); + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clEnqueueCopyBuffer(ocl.commandQueue, diffmap_out, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); clReleaseMemObject(diffmap_out); } void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step) { - cl_int err = 0; ocl_args_d_t &ocl = getOcl(); cl_int cls = 8 - step; @@ -1075,21 +947,14 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz clSetKernelArg(kernel, 4, sizeof(cl_int), &cls2); size_t globalWorkSize[2] = { xsize - cls, ysize - cls}; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clCalculateDiffmapGetBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clCalculateDiffmapGetBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) { - cl_int err = 
0; ocl_args_d_t &ocl = getOcl(); cl_int cls = 8 - step; @@ -1101,16 +966,10 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in); size_t globalWorkSize[2] = { xsize, ysize}; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: clGetDiffmapFromBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err)); - } + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: clGetDiffmapFromBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 2cadfb85..9cb4007d 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -311,9 +311,8 @@ void tclConvolution(size_t xsize, size_t ysize, size_t inp_size = xsize * ysize * sizeof(float); size_t multipliers_size = len * sizeof(float); cl_int err = 0; - ocl_args_d_t &ocl = getOcl(); - ocl.allocA(result_size); - cl_mem r = ocl.srcA; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(result_size); cl_mem i = ocl.allocMem(inp_size, inp); cl_mem m = ocl.allocMem(multipliers_size, multipliers); @@ -327,6 +326,7 @@ void tclConvolution(size_t xsize, size_t ysize, clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); err = clFinish(ocl.commandQueue); + clReleaseMemObject(r); clReleaseMemObject(i); clReleaseMemObject(m); } diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 5eaba7e5..a445d930 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -183,17 +183,17 @@ void cuConvolutionXEx( const 
CUdeviceptr multipliers, size_t len, int xstep, int offset, double border_ratio) { - ocu_args_d_t &ocu = getOcu(); + ocu_args_d_t &ocl = getOcu(); const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX], + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONX], xsize, ysize, 1, 1, 1, 1, 0, - ocu.stream, (void**)args, NULL); + ocl.stream, (void**)args, NULL); - err = cuStreamSynchronize(ocu.stream); + err = cuStreamSynchronize(ocl.stream); } void cuConvolutionYEx( @@ -203,17 +203,17 @@ void cuConvolutionYEx( int xstep, int offset, double border_ratio) { CUresult err = CUDA_SUCCESS; - ocu_args_d_t &ocu = getOcu(); + ocu_args_d_t &ocl = getOcu(); const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY], + err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY], xsize, ysize, 1, 1, 1, 1, 0, - ocu.stream, (void**)args, NULL); + ocl.stream, (void**)args, NULL); - err = cuStreamSynchronize(ocu.stream); + err = cuStreamSynchronize(ocl.stream); } void cuSquareSampleEx( diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index d92fb1a4..5218ce9b 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -15,16 +15,7 @@ ocl_args_d_t::ocl_args_d_t() : program(NULL), platformVersion(OPENCL_VERSION_1_2), deviceVersion(OPENCL_VERSION_1_2), - compilerVersion(OPENCL_VERSION_1_2), - srcA(NULL), - srcB(NULL), - dstMem(NULL), - inputA(NULL), - lenA(0), - inputB(NULL), - lenB(0), - outputC(NULL), - lenC(0) + compilerVersion(OPENCL_VERSION_1_2) { for (int i = 0; i < KERNEL_COUNT; i++) { @@ -72,30 +63,6 @@ ocl_args_d_t::~ocl_args_d_t() LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err)); } } - if (srcA) - { - err = clReleaseMemObject(srcA); - if (CL_SUCCESS != err) - { - LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); - } - } - if (srcB) 
- { - err = clReleaseMemObject(srcB); - if (CL_SUCCESS != err) - { - LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); - } - } - if (dstMem) - { - err = clReleaseMemObject(dstMem); - if (CL_SUCCESS != err) - { - LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); - } - } if (commandQueue) { err = clReleaseCommandQueue(commandQueue); @@ -120,118 +87,30 @@ ocl_args_d_t::~ocl_args_d_t() LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err)); } } - - /* - * Note there is no procedure to deallocate platform - * because it was not created at the startup, - * but just queried from OpenCL runtime. - */ - - if (inputA) _aligned_free(inputA); - if (inputB) _aligned_free(inputB); - if (outputC) _aligned_free(outputC); -} - -void* ocl_args_d_t::allocA(size_t s) -{ - if (s <= lenA) return inputA; - lenA = 0; - _aligned_free(inputA); - clReleaseMemObject(srcA); - - cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; - inputA = _aligned_malloc(optimizedSize, 4096); - lenA = s; - - cl_int err = 0; - srcA = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputA, &err); - if (CL_SUCCESS != err) - { - LogError("Error: allocA() for buffer returned %s.\n", TranslateOpenCLError(err)); - } - - return inputA; -} - -void* ocl_args_d_t::allocB(size_t s) -{ - if (s <= lenB) return inputB; - lenB = 0; - _aligned_free(inputB); - clReleaseMemObject(srcB); - - cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64; - inputB = _aligned_malloc(optimizedSize, 4096); - lenB = s; - - cl_int err = 0; - srcB = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputB, &err); - if (CL_SUCCESS != err) - { - LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err)); - } - - return inputB; -} - -void* ocl_args_d_t::allocC(size_t s) -{ - if (s <= lenC) return outputC; - lenC = 0; - _aligned_free(outputC); - clReleaseMemObject(dstMem); - - cl_uint 
optimizedSize = ((s - 1) / 64 + 1) * 64; - outputC = _aligned_malloc(optimizedSize, 4096); - lenC = s; - - cl_int err = 0; - dstMem = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, outputC, &err); - if (CL_SUCCESS != err) - { - LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err)); - } - - return outputC; } cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) { cl_int err = 0; cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); - if (CL_SUCCESS != err) - { - LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); if (!mem) return NULL; // init memory if (init) { err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: allocMem() clEnqueueWriteBuffer return %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); err = clFinish(this->commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: allocMem() clEnqueueWriteBuffer/clFinish return %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } else { cl_char cc = 0; err = clEnqueueFillBuffer(this->commandQueue, mem, &cc, sizeof(cc), 0, s / sizeof(cc), 0, NULL, NULL); - if (CL_SUCCESS != err) - { - LogError("Error: allocMem() clEnqueueFillBuffer return %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); err = clFinish(this->commandQueue); - if (CL_SUCCESS != err) - { - LogError("Error: allocMem() clEnqueueFillBuffer/clFinish return %s.\n", TranslateOpenCLError(err)); - } + LOG_CL_RESULT(err); } return mem; diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index fd7e78e7..13eb232b 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -67,15 +67,13 @@ enum KernelName { KERNEL_COUNT, }; +#define LOG_CL_RESULT(e) if (CL_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateOpenCLError((e)));} + struct ocl_args_d_t { ocl_args_d_t(); 
~ocl_args_d_t(); - void* allocA(size_t s); - void* allocB(size_t s); - void* allocC(size_t s); - cl_mem allocMem(size_t s, const void *init = NULL); ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); void releaseMemChannels(ocl_channels &rgb); @@ -89,19 +87,5 @@ struct ocl_args_d_t float platformVersion; // hold the OpenCL platform version (default 1.2) float deviceVersion; // hold the OpenCL device version (default. 1.2) float compilerVersion; // hold the device OpenCL C version (default. 1.2) - - // Objects that are specific for algorithm implemented in this sample - cl_mem srcA; // hold first source buffer - cl_mem srcB; // hold second source buffer - cl_mem dstMem; // hold destination buffer - - void* inputA; - size_t lenA; - - void* inputB; - size_t lenB; - - void* outputC; - size_t lenC; }; From 18f9672660fbcebd91f4a56ba02095129f140bd9 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 3 Jun 2017 14:39:44 +0800 Subject: [PATCH 128/189] =?UTF-8?q?=E8=B0=83=E6=95=B4cu=E7=BC=96=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- compile.bat | 159 +--------------------------------------------------- 1 file changed, 2 insertions(+), 157 deletions(-) diff --git a/compile.bat b/compile.bat index 8aa9430f..05a3a361 100644 --- a/compile.bat +++ b/compile.bat @@ -1,159 +1,4 @@ -@if "%1" == "" goto start -@setlocal -@set userinput=%1 -@if not "%1"=="store" @if not "%1"=="8.1" @if not "%userinput:~0,3%"=="10." 
goto usage -@endlocal - -:start -@call :GetVSCommonToolsDir -@if "%VS140COMNTOOLS%"=="" goto error_no_VS140COMNTOOLSDIR - -@call "%VS140COMNTOOLS%VCVarsQueryRegistry.bat" No32bit 64bit %1 %2 - -@if "%VSINSTALLDIR%"=="" goto error_no_VSINSTALLDIR -@if "%VCINSTALLDIR%"=="" goto error_no_VCINSTALLDIR -@if "%FrameworkDir64%"=="" goto error_no_FrameworkDIR64 -@if "%FrameworkVersion64%"=="" goto error_no_FrameworkVer64 -@if "%Framework40Version%"=="" goto error_no_Framework40Version - -@set FrameworkDir=%FrameworkDir64% -@set FrameworkVersion=%FrameworkVersion64% - -@if not "%WindowsSDK_ExecutablePath_x64%" == "" @set PATH=%WindowsSDK_ExecutablePath_x64%;%PATH% - -@rem -@rem Set Windows SDK include/lib path -@rem -@if not "%WindowsSdkDir%" == "" @set PATH=%WindowsSdkDir%bin\x64;%WindowsSdkDir%bin\x86;%PATH% -@if not "%WindowsSdkDir%" == "" @set INCLUDE=%WindowsSdkDir%include\%WindowsSDKVersion%shared;%WindowsSdkDir%include\%WindowsSDKVersion%um;%WindowsSdkDir%include\%WindowsSDKVersion%winrt;%INCLUDE% -@if not "%WindowsSdkDir%" == "" @set LIB=%WindowsSdkDir%lib\%WindowsSDKLibVersion%um\x64;%LIB% -@if not "%WindowsSdkDir%" == "" @set LIBPATH=%WindowsLibPath%;%ExtensionSDKDir%\Microsoft.VCLibs\14.0\References\CommonConfiguration\neutral;%LIBPATH% - -@REM Set NETFXSDK include/lib path -@if not "%NETFXSDKDir%" == "" @set INCLUDE=%NETFXSDKDir%include\um;%INCLUDE% -@if not "%NETFXSDKDir%" == "" @set LIB=%NETFXSDKDir%lib\um\x64;%LIB% - -@rem -@rem Set UniversalCRT include/lib path, the default is the latest installed version. 
-@rem -@if not "%UCRTVersion%" == "" @set INCLUDE=%UniversalCRTSdkDir%include\%UCRTVersion%\ucrt;%INCLUDE% -@if not "%UCRTVersion%" == "" @set LIB=%UniversalCRTSdkDir%lib\%UCRTVersion%\ucrt\x64;%LIB% - -@rem PATH -@rem ---- -@if exist "%VSINSTALLDIR%Team Tools\Performance Tools\x64" @set PATH=%VSINSTALLDIR%Team Tools\Performance Tools\x64;%VSINSTALLDIR%Team Tools\Performance Tools;%PATH% - -@if exist "%ProgramFiles%\HTML Help Workshop" set PATH=%ProgramFiles%\HTML Help Workshop;%PATH% -@if exist "%ProgramFiles(x86)%\HTML Help Workshop" set PATH=%ProgramFiles(x86)%\HTML Help Workshop;%PATH% -@if exist "%VSINSTALLDIR%Common7\Tools" set PATH=%VSINSTALLDIR%Common7\Tools;%PATH% -@if exist "%VSINSTALLDIR%Common7\IDE" set PATH=%VSINSTALLDIR%Common7\IDE;%PATH% -@if exist "%VCINSTALLDIR%VCPackages" set PATH=%VCINSTALLDIR%VCPackages;%PATH% -@if exist "%FrameworkDir%\%Framework40Version%" set PATH=%FrameworkDir%\%Framework40Version%;%PATH% -@if exist "%FrameworkDir%\%FrameworkVersion%" set PATH=%FrameworkDir%\%FrameworkVersion%;%PATH% -@if exist "%VCINSTALLDIR%BIN\amd64" set PATH=%VCINSTALLDIR%BIN\amd64;%PATH% - -@rem Add path to MSBuild Binaries -@if exist "%ProgramFiles%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles%\MSBuild\14.0\bin\amd64;%PATH% -@if exist "%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64;%PATH% - -@if exist "%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow" @set PATH=%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow;%PATH% - -@rem INCLUDE -@rem ------- -@if exist "%VCINSTALLDIR%ATLMFC\INCLUDE" set INCLUDE=%VCINSTALLDIR%ATLMFC\INCLUDE;%INCLUDE% -@if exist "%VCINSTALLDIR%INCLUDE" set INCLUDE=%VCINSTALLDIR%INCLUDE;%INCLUDE% - -@rem LIB -@rem --- -@if "%1" == "store" goto setstorelib -@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIB=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIB% -@if exist "%VCINSTALLDIR%LIB\amd64" set LIB=%VCINSTALLDIR%LIB\amd64;%LIB% -@goto setlibpath -:setstorelib -@if 
exist "%VCINSTALLDIR%LIB\store\amd64" set LIB=%VCINSTALLDIR%LIB\store\amd64;%LIB% - -:setlibpath -@rem LIBPATH -@rem ------- -@if "%1" == "store" goto setstorelibpath -@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIBPATH=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIBPATH% -@if exist "%VCINSTALLDIR%LIB\amd64" set LIBPATH=%VCINSTALLDIR%LIB\amd64;%LIBPATH% -@goto appendlibpath -:setstorelibpath -@if exist "%VCINSTALLDIR%LIB\store\amd64" set LIBPATH=%VCINSTALLDIR%LIB\store\amd64;%VCINSTALLDIR%LIB\store\references;%LIBPATH% -:appendlibpath -@if exist "%FrameworkDir%\%Framework40Version%" set LIBPATH=%FrameworkDir%\%Framework40Version%;%LIBPATH% -@if exist "%FrameworkDir%\%FrameworkVersion%" set LIBPATH=%FrameworkDir%\%FrameworkVersion%;%LIBPATH% - -@set Platform=X64 -@set CommandPromptType=Native - -@goto end - -@REM ----------------------------------------------------------------------- -:GetVSCommonToolsDir -@set VS140COMNTOOLS= -@call :GetVSCommonToolsDirHelper32 HKLM > nul 2>&1 -@if errorlevel 1 call :GetVSCommonToolsDirHelper32 HKCU > nul 2>&1 -@if errorlevel 1 call :GetVSCommonToolsDirHelper64 HKLM > nul 2>&1 -@if errorlevel 1 call :GetVSCommonToolsDirHelper64 HKCU > nul 2>&1 -@exit /B 0 - -:GetVSCommonToolsDirHelper32 -@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO ( - @if "%%i"=="14.0" ( - @SET VS140COMNTOOLS=%%k - ) -) -@if "%VS140COMNTOOLS%"=="" exit /B 1 -@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\ -@exit /B 0 - -:GetVSCommonToolsDirHelper64 -@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Wow6432Node\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO ( - @if "%%i"=="14.0" ( - @SET VS140COMNTOOLS=%%k - ) -) -@if "%VS140COMNTOOLS%"=="" exit /B 1 -@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\ -@exit /B 0 - -@REM ----------------------------------------------------------------------- -:error_no_VS140COMNTOOLSDIR -@echo ERROR: Cannot determine the location of the VS Common Tools folder. 
-@goto end - -:error_no_VSINSTALLDIR -@echo ERROR: Cannot determine the location of the VS installation. -@goto end - -:error_no_VCINSTALLDIR -@echo ERROR: Cannot determine the location of the VC installation. -@goto end - -:error_no_FrameworkDIR64 -@echo ERROR: Cannot determine the location of the .NET Framework 64bit installation. -@goto end - -:error_no_FrameworkVer64 -@echo ERROR: Cannot determine the version of the .NET Framework 64bit installation. -@goto end - -:error_no_Framework40Version -@echo ERROR: Cannot determine the .NET Framework 4.0 version. -@goto end - -:usage -echo Error in script usage. The correct usage is: -echo %0 -echo or -echo %0 store -echo or -echo %0 10.0.10240.0 -echo or -echo %0 store 10.0.10240.0 - -:end +@rem setupt windows var +call vcvars64.bat nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine 64 -G -g -ptx -o clguetzli\clguetzli.cu.ptx64 clguetzli\clguetzli.cu \ No newline at end of file From 601e367ab6c4411f7cd57eadc5158327a005380e Mon Sep 17 00:00:00 2001 From: strongtu Date: Sat, 3 Jun 2017 14:40:48 +0800 Subject: [PATCH 129/189] =?UTF-8?q?CUDA=E7=BC=96=E8=AF=91=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E5=AE=8F=E5=BC=80=E5=85=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 5 ++++- clguetzli/cuguetzli.cpp | 3 +++ clguetzli/cuguetzli.h | 4 ++++ clguetzli/ocu.cpp | 4 ++++ clguetzli/ocu.h | 6 +++++- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 53cd89fb..178e70e9 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -177,6 +177,7 @@ namespace butteraugli ); return; } +#ifdef __USE_CUDA__ else if (g_useCuda && xsize > 100 && ysize > 100) { mask->resize(3); @@ -194,7 +195,7 @@ namespace butteraugli ); return; } - +#endif _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); if (g_checkOpenCL && xsize > 8 && ysize 
> 8) @@ -305,6 +306,7 @@ namespace butteraugli clOpsinDynamicsImage(r, g, b, xsize, ysize); } +#ifdef __USE_CUDA__ else if (g_useCuda && xsize > 100 && ysize > 100) { float * r = rgb[0].data(); @@ -313,6 +315,7 @@ namespace butteraugli cuOpsinDynamicsImage(r, g, b, xsize, ysize); } +#endif else { std::vector< std::vector> orig_rgb; diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index a445d930..ec158691 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -2,6 +2,8 @@ #include #include "ocu.h" +#ifdef __USE_CUDA__ + void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { size_t channel_size = xsize * ysize * sizeof(float); @@ -799,3 +801,4 @@ void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, con cuMemFree(blurred); } +#endif \ No newline at end of file diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h index 0783796a..e9dddde6 100644 --- a/clguetzli/cuguetzli.h +++ b/clguetzli/cuguetzli.h @@ -2,6 +2,8 @@ #include "guetzli/processor.h" #include "clguetzli.cl.h" +#ifdef __USE_CUDA__ + void cuOpsinDynamicsImage( float *r, float *g, float *b, const size_t xsize, const size_t ysize); @@ -113,3 +115,5 @@ void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, void cuAddBorderEx(CUdeviceptr out, const size_t xsize, const size_t ysize, const int step, const CUdeviceptr in); void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); + +#endif \ No newline at end of file diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index c99d6ea9..6fbf58ee 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -2,6 +2,8 @@ #include #include "ocu.h" +#ifdef __USE_CUDA__ + ocu_args_d_t& getOcu(void) { static bool bInit = false; @@ -148,3 +150,5 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb) rgb.ch[i] = NULL; } } + +#endif \ No newline at end of file diff --git a/clguetzli/ocu.h 
b/clguetzli/ocu.h index 63a4bb47..4c34edaf 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -1,5 +1,7 @@ #pragma once +#ifdef __USE_CUDA__ + #include #include "ocl.h" @@ -21,4 +23,6 @@ struct ocu_args_d_t CUmodule mod; CUcontext ctxt; CUdevice dev; -}; \ No newline at end of file +}; + +#endif \ No newline at end of file From c0bab473191af8bc97762a9bc622c69ddad608d0 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 4 Jun 2017 14:01:12 +0800 Subject: [PATCH 130/189] =?UTF-8?q?=E4=BC=98=E5=8C=96clSetKernelArg?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/cl.hpp | 318 ++++++++++++++++++++++++++++++++++++++++ clguetzli/clguetzli.cpp | 263 ++++++++------------------------- clguetzli/clguetzli.h | 6 +- clguetzli/cuguetzli.cpp | 39 +++-- 4 files changed, 408 insertions(+), 218 deletions(-) create mode 100644 clguetzli/cl.hpp diff --git a/clguetzli/cl.hpp b/clguetzli/cl.hpp new file mode 100644 index 00000000..8be6313e --- /dev/null +++ b/clguetzli/cl.hpp @@ -0,0 +1,318 @@ +#pragma once + +template +inline void clSetKernelArgK(cl_kernel k, int idx, T* t) +{ + clSetKernelArg(k, idx, sizeof(T), t); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, int* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, const int* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, size_t* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, const size_t* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0) +{ + clSetKernelArgK(k, 0, t0); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1) +{ + clSetKernelArgK(k, 1, t1); + 
clSetKernelArgEx(k, t0); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2) +{ + clSetKernelArgK(k, 2, t2); + clSetKernelArgEx(k, t0, t1); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3) +{ + clSetKernelArgK(k, 3, t3); + clSetKernelArgEx(k, t0, t1, t2); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) +{ + clSetKernelArgK(k, 4, t4); + clSetKernelArgEx(k, t0, t1, t2, t3); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) +{ + clSetKernelArgK(k, 5, t5); + clSetKernelArgEx(k, t0, t1, t2, t3, t4); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) +{ + clSetKernelArgK(k, 6, t6); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) +{ + clSetKernelArgK(k, 7, t7); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) +{ + clSetKernelArgK(k, 8, t8); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) +{ + clSetKernelArgK(k, 9, t9); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10) +{ + clSetKernelArgK(k, 10, t10); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10, T11* t11) +{ + clSetKernelArgK(k, 11, t11); + clSetKernelArgEx(k, t0, t1, t2, t3, 
t4, t5, t6, t7, t8, t9, t10); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12) +{ + clSetKernelArgK(k, 12, t12); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13) +{ + clSetKernelArgK(k, 13, t13); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, + T14* t14) +{ + clSetKernelArgK(k, 14, t14); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15) +{ + clSetKernelArgK(k, 15, t15); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16) +{ + clSetKernelArgK(k, 16, t16); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, 
t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17) +{ + clSetKernelArgK(k, 17, t17); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18) +{ + clSetKernelArgK(k, 18, t18); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19) +{ + clSetKernelArgK(k, 19, t19); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18); +} + +template< + typename T0, 
typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20) +{ + clSetKernelArgK(k, 20, t20); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21) +{ + clSetKernelArgK(k, 21, t21); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* 
t22) +{ + clSetKernelArgK(k, 22, t22); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22, T23* t23) +{ + clSetKernelArgK(k, 23, t23); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23, typename T24> +inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22, T23* t23, T24* t24) +{ + clSetKernelArgK(k, 24, t24); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23); +} diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index c6c5eb4f..53be7348 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -2,6 +2,7 @@ #include #include #include +#include "cl.hpp" extern bool 
g_useOpenCL = false; extern bool g_useCuda = false; @@ -172,33 +173,18 @@ void clComputeBlockZeroingOrder( int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); - cl_float clBlockErrorLimit = BlockErrorLimit; - cl_int clWidth = image_width; - cl_int clHeight = image_height; - cl_int clFactor = factor; - cl_int clMask = comp_mask; cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_orig_image); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_mask_scale); - clSetKernelArg(kernel, 5, sizeof(cl_int), &clWidth); - clSetKernelArg(kernel, 6, sizeof(cl_int), &clHeight); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mem_mayout_coeff[0]); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mem_mayout_coeff[1]); - clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&mem_mayout_coeff[2]); - clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&mem_mayout_pixel[0]); - clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&mem_mayout_pixel[1]); - clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&mem_mayout_pixel[2]); - clSetKernelArg(kernel, 13, sizeof(channel_info), &mayout_channel[0]); - clSetKernelArg(kernel, 14, sizeof(channel_info), &mayout_channel[1]); - clSetKernelArg(kernel, 15, sizeof(channel_info), &mayout_channel[2]); - clSetKernelArg(kernel, 16, sizeof(cl_int), &clFactor); - clSetKernelArg(kernel, 17, sizeof(cl_int), &clMask); - clSetKernelArg(kernel, 18, sizeof(cl_float), &clBlockErrorLimit); - clSetKernelArg(kernel, 19, sizeof(cl_mem), &mem_output_order_batch); + clSetKernelArgEx(kernel, &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], 
+ &mem_orig_image, &mem_mask_scale, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + &mem_output_order_batch); size_t globalWorkSize[2] = { blockf_width, blockf_height }; err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -224,7 +210,7 @@ void clComputeBlockZeroingOrder( void clMask( float* mask_r, float* mask_g, float* mask_b, float* maskdc_r, float* maskdc_g, float* maskdc_b, - size_t xsize, size_t ysize, + const size_t xsize, const size_t ysize, const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2) { @@ -257,27 +243,14 @@ void clConvolutionEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio) + int xstep, int offset, float border_ratio) { ocl_args_d_t &ocl = getOcl(); size_t oxsize = (xsize + xstep - 1) / xstep; - cl_int clxsize = xsize; - cl_int clxstep = xstep; - cl_int cllen = len; - cl_int cloffset = offset; - cl_float clborder_ratio = border_ratio; - cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&multipliers); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen); - clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio); + clSetKernelArgEx(kernel, &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { oxsize, ysize 
}; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -290,23 +263,12 @@ void clConvolutionXEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio) + int xstep, int offset, float border_ratio) { ocl_args_d_t &ocl = getOcl(); - cl_int clxstep = xstep; - cl_int cllen = len; - cl_int cloffset = offset; - cl_float clborder_ratio = border_ratio; - cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep); - clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); + clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -319,23 +281,12 @@ void clConvolutionYEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio) + int xstep, int offset, float border_ratio) { ocl_args_d_t &ocl = getOcl(); - cl_int clxstep = xstep; - cl_int cllen = len; - cl_int cloffset = offset; - cl_float clborder_ratio = border_ratio; - cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep); - 
clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset); - clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio); + clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -351,13 +302,8 @@ void clSquareSampleEx( { ocl_args_d_t &ocl = getOcl(); - cl_int clxstep = xstep; - cl_int clystep = ystep; cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&image); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep); + clSetKernelArgEx(kernel, &result, &image, &xstep, &ystep); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -418,12 +364,7 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&rgb.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&rgb_blurred.r); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&rgb_blurred.g); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b); + clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); size_t globalWorkSize[1] = { xsize * ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -455,18 +396,10 @@ void clMaskHighIntensityChangeEx( clFinish(ocl.commandQueue); cl_kernel kernel = 
ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c0.r); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c0.g); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c0.b); - clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&c1.r); - clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&c1.g); - clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&c1.b); + clSetKernelArgEx(kernel, &xyb0.r, &xyb0.g, &xyb0.b, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -497,21 +430,11 @@ void clEdgeDetectorMapEx( clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); } - cl_int clxsize = xsize; - cl_int clysize = ysize; - cl_int clstep = step; - cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), &result); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r); - clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b); - clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize); - clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize); - clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep); + clSetKernelArgEx(kernel, &result, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, 
&rgb2_blured.b, + &xsize, &ysize, &step); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -534,22 +457,11 @@ void clBlockDiffMapEx( { ocl_args_d_t &ocl = getOcl(); - cl_int clxsize = xsize; - cl_int clysize = ysize; - cl_int clstep = step; - cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_dc); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &block_diff_ac); - clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.r); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb.g); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb.b); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.r); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2.g); - clSetKernelArg(kernel, 7, sizeof(cl_mem), &rgb2.b); - clSetKernelArg(kernel, 8, sizeof(cl_int), &clxsize); - clSetKernelArg(kernel, 9, sizeof(cl_int), &clysize); - clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep); + clSetKernelArgEx(kernel, &block_diff_dc, &block_diff_ac, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -579,21 +491,11 @@ void clEdgeDetectorLowFreqEx( clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); } - cl_int clxsize = xsize; - cl_int clysize = ysize; - cl_int clstep = step; - cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_ac); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r); - clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b); - clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize); - clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize); - 
clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep); + clSetKernelArgEx(kernel, &block_diff_ac, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -616,15 +518,9 @@ void clDiffPrecomputeEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.x); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.y); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb0.x); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb0.y); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb0.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb1.x); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb1.y); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb1.b); + clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -637,11 +533,8 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) { ocl_args_d_t &ocl = getOcl(); - cl_double clscale = w; - cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); - clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&clscale); + clSetKernelArgEx(kernel, &img, &w); size_t globalWorkSize[1] = { size }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -665,8 +558,7 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL); cl_kernel kernel = 
ocl.kernel[KERNEL_AVERAGE5X5]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img_org); + clSetKernelArgEx(kernel, &img, &img_org); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -684,15 +576,10 @@ void clMinSquareValEx( { ocl_args_d_t &ocl = getOcl(); - cl_int cloffset = offset; - cl_int clsquare_size = square_size; cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize); cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset); + clSetKernelArgEx(kernel, &result, &img, &square_size, &offset); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -805,18 +692,10 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); cl_kernel kernel = ocl.kernel[KERNEL_DOMASK]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb.x); - clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb.y); - clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb.b); - clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&xyb_dc.x); - clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&xyb_dc.y); - 
clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&xyb_dc.b); + clSetKernelArgEx(kernel, &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xyb.x, &xyb.y, &xyb.b, + &xyb_dc.x, &xyb_dc.y, &xyb_dc.b); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -873,26 +752,15 @@ void clCombineChannelsEx( const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; - cl_int clres_size = res_xsize; - cl_int clxsize = xsize; - cl_int clysize = ysize; - cl_int clstep = step; - cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.r); - clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.g); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask.b); - clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.r); - clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.g); - clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask_dc.b); - clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clxsize); - clSetKernelArg(kernel, 8, sizeof(cl_int), (void*)&clysize); - clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&block_diff_dc); - clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&block_diff_ac); - clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&edge_detector_map); - clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_size); - clSetKernelArg(kernel, 13, sizeof(cl_int), (void*)&clstep); + clSetKernelArgEx(kernel, &result, + &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xsize, &ysize, + &block_diff_dc, &block_diff_ac, + &edge_detector_map, + &res_xsize, + &step); size_t globalWorkSize[2] = { work_xsize, work_ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -905,18 +773,10 
@@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi { ocl_args_d_t &ocl = getOcl(); - cl_int clxsize = xsize; - cl_int clysize = ysize; - cl_int clstep = step; - cl_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap_out); - clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&diffmap); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&xsize); - clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&ysize); - clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&step); + clSetKernelArgEx(kernel, &diffmap_out, &diffmap, &xsize, &ysize, &step); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; @@ -938,13 +798,9 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz cl_int cls = 8 - step; cl_int cls2 = (8 - step) / 2; - cl_int clxsize = xsize; + cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), &out); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &in); - clSetKernelArg(kernel, 2, sizeof(cl_int), &clxsize); - clSetKernelArg(kernel, 3, sizeof(cl_int), &cls); - clSetKernelArg(kernel, 4, sizeof(cl_int), &cls2); + clSetKernelArgEx(kernel, &out, &in, &xsize, &cls, &cls2); size_t globalWorkSize[2] = { xsize - cls, ysize - cls}; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -960,10 +816,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) cl_int cls = 8 - step; cl_int cls2 = (8 - step) / 2; cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER]; - clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&out); - clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&cls); - clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&cls2); - clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in); + clSetKernelArgEx(kernel, &out, &cls, 
&cls2, &in); size_t globalWorkSize[2] = { xsize, ysize}; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 31b10e36..ccdf24a8 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -45,19 +45,19 @@ void clConvolutionEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio); + int xstep, int offset, float border_ratio); void clConvolutionXEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio); + int xstep, int offset, float border_ratio); void clConvolutionYEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, const cl_mem multipliers, size_t len, - int xstep, int offset, double border_ratio); + int xstep, int offset, float border_ratio); void clSquareSampleEx( cl_mem result/*out*/, diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index ec158691..ced35da7 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -115,7 +115,7 @@ void cuComputeBlockZeroingOrder( CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], - &mem_orig_image, &mem_orig_image, &mem_mask_scale, + &mem_orig_image, &mem_mask_scale, &image_width, &image_height, &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], @@ -179,11 +179,33 @@ void cuMask( ocl.releaseMemChannels(mask_dc); } +void cuConvolutionEx( + CUdeviceptr result/*out*/, + const CUdeviceptr inp, size_t xsize, size_t ysize, + const CUdeviceptr multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocu_args_d_t &ocl = getOcu(); + + size_t oxsize = 
(xsize + xstep - 1) / xstep; + + const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONX], + oxsize, ysize, 1, + 1, 1, 1, + 0, + ocl.stream, (void**)args, NULL); + + err = cuStreamSynchronize(ocl.stream); +} + + void cuConvolutionXEx( CUdeviceptr result/*out*/, const CUdeviceptr inp, size_t xsize, size_t ysize, const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio) + int xstep, int offset, float border_ratio) { ocu_args_d_t &ocl = getOcu(); @@ -204,12 +226,11 @@ void cuConvolutionYEx( const CUdeviceptr multipliers, size_t len, int xstep, int offset, double border_ratio) { - CUresult err = CUDA_SUCCESS; ocu_args_d_t &ocl = getOcu(); const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY], + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY], xsize, ysize, 1, 1, 1, 1, 0, @@ -223,12 +244,11 @@ void cuSquareSampleEx( const CUdeviceptr image, size_t xsize, size_t ysize, size_t xstep, size_t ystep) { - CUresult err = CUDA_SUCCESS; ocu_args_d_t &ocu = getOcu(); const void *args[] = { &result, &image, &xstep, &ystep }; - err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], + CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], xsize, ysize, 1, 1, 1, 1, 0, @@ -253,7 +273,6 @@ void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ys const int xstep = std::max(1, int(sigma / 3)); - CUresult err = CUDA_SUCCESS; ocu_args_d_t &ocu = getOcu(); CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); @@ -282,7 +301,6 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t size_t channel_size = xsize * ysize * sizeof(float); - CUresult err = CUDA_SUCCESS; ocu_args_d_t &ocl = getOcu(); ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size); @@ -292,7 
+310,7 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; - CUresult r = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE], + CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE], xsize * ysize, 1, 1, 1, 1, 1, 0, @@ -694,7 +712,8 @@ void cuCombineChannelsEx( &mask.r, &mask.g, &mask.b, &mask_dc.r, &mask_dc.g, &mask_dc.b, &xsize, &ysize, - &block_diff_dc, &block_diff_ac, &edge_detector_map, + &block_diff_dc, &block_diff_ac, + &edge_detector_map, &res_xsize, &step }; From 39bcbd149e88fcc12e21dd20542b012d6530c878 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 4 Jun 2017 14:38:19 +0800 Subject: [PATCH 131/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl.h | 30 ++--- clguetzli/clguetzli.cpp | 76 +++---------- clguetzli/clguetzli_test.h | 2 - clguetzli/cuguetzli.cpp | 220 ++++++++++++++++++++----------------- clguetzli/cuguetzli.h | 54 ++++----- clguetzli/ocl.cpp | 81 +++++++++----- clguetzli/ocl.h | 64 +++-------- clguetzli/ocu.cpp | 10 +- clguetzli/ocu.h | 6 +- 9 files changed, 254 insertions(+), 289 deletions(-) diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 4e461399..6dec83c8 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -19,6 +19,7 @@ typedef unsigned char uchar; typedef unsigned short ushort; + typedef CUdeviceptr cu_mem; int get_global_id(int dim); int get_global_size(int dim); @@ -78,19 +79,19 @@ { struct { - CUdeviceptr r; - CUdeviceptr g; - CUdeviceptr b; + cu_mem r; + cu_mem g; + cu_mem b; }; struct { - CUdeviceptr x; - CUdeviceptr y; - CUdeviceptr b_; + cu_mem x; + cu_mem y; + cu_mem b_; }; union { - CUdeviceptr ch[3]; + cu_mem ch[3]; }; }ocu_channels; #endif @@ -100,22 +101,7 @@ #ifdef __OPENCL_VERSION__ #define __constant_ex __constant #define 
__device__ -/* - typedef union ocl_channels_t - { - struct - { - float * r; - float * g; - float * b; - }; - union - { - float *ch[3]; - }; - }ocl_channels; -*/ #endif /*__OPENCL_VERSION__*/ #ifdef __CUDACC__ diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 53be7348..15feb7d1 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -8,61 +8,6 @@ extern bool g_useOpenCL = false; extern bool g_useCuda = false; extern bool g_checkOpenCL = false; -ocl_args_d_t& getOcl(void) -{ - static bool bInit = false; - static ocl_args_d_t ocl; - - if (bInit == true) return ocl; - - bInit = true; - cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); - LOG_CL_RESULT(err); - - char* source = nullptr; - size_t src_size = 0; - ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); - - ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); - - delete[] source; - - err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL); - LOG_CL_RESULT(err); - if (CL_BUILD_PROGRAM_FAILURE == err) - { - size_t log_size = 0; - clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - - std::vector build_log(log_size); - clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); - - LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); - } - - ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err); - ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err); - ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err); - ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err); - ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err); - ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, 
"clMaskHighIntensityChangeEx", &err); - ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err); - ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err); - ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err); - ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err); - ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err); - ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err); - ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err); - ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err); - ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err); - ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err); - ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err); - ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err); - ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err); - - return ocl; -} - void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { size_t channel_size = xsize * ysize * sizeof(float); @@ -250,7 +195,8 @@ void clConvolutionEx( size_t oxsize = (xsize + xstep - 1) / xstep; cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; - clSetKernelArgEx(kernel, &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, + &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { oxsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -268,7 +214,8 @@ void clConvolutionXEx( ocl_args_d_t &ocl = 
getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; - clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, + &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -286,7 +233,8 @@ void clConvolutionYEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; - clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, + &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -364,7 +312,8 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; - clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); + clSetKernelArgEx(kernel, + &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); size_t globalWorkSize[1] = { xsize * ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -396,10 +345,11 @@ void clMaskHighIntensityChangeEx( clFinish(ocl.commandQueue); cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; - clSetKernelArgEx(kernel, &xyb0.r, &xyb0.g, &xyb0.b, - &xyb1.r, &xyb1.g, &xyb1.b, - &c0.r, &c0.g, &c0.b, - &c1.r, &c1.g, &c1.b); + clSetKernelArgEx(kernel, + &xyb0.r, &xyb0.g, &xyb0.b, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 
0, NULL, NULL); diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index b27c7942..94c0a2c6 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -1,8 +1,6 @@ #pragma once #include "ocl.h" -ocl_args_d_t& getOcl(void); - void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index ced35da7..9ffae8b4 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -4,6 +4,8 @@ #ifdef __USE_CUDA__ +#define cuFinish cuStreamSynchronize + void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { size_t channel_size = xsize * ysize * sizeof(float); @@ -37,11 +39,11 @@ void cuDiffmapOpsinDynamicsImage( ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); - CUdeviceptr mem_result = ocl.allocMem(channel_size, result); + cu_mem mem_result = ocl.allocMem(channel_size, result); - CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size); - CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size); - CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size); + cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); + cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); + cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); @@ -95,9 +97,9 @@ void cuComputeBlockZeroingOrder( cl_int err = 0; ocu_args_d_t &ocl = getOcu(); - CUdeviceptr mem_orig_coeff[3]; - CUdeviceptr mem_mayout_coeff[3]; - CUdeviceptr mem_mayout_pixel[3]; + cu_mem mem_orig_coeff[3]; + cu_mem mem_mayout_coeff[3]; + cu_mem mem_mayout_pixel[3]; for (int c = 0; c < 3; c++) { int block_count = orig_channel[c].block_width * orig_channel[c].block_height; @@ -108,11 +110,11 @@ void 
cuComputeBlockZeroingOrder( mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); } - CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); - CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + cu_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + cu_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; - CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); + cu_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], &mem_orig_image, &mem_mask_scale, @@ -130,8 +132,10 @@ void cuComputeBlockZeroingOrder( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); + LOG_CU_RESULT(err); - err = cuStreamSynchronize(ocl.stream); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); @@ -180,9 +184,9 @@ void cuMask( } void cuConvolutionEx( - CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, int xstep, int offset, float border_ratio) { ocu_args_d_t &ocl = getOcu(); @@ -196,15 +200,16 @@ void cuConvolutionEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } void cuConvolutionXEx( - CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const 
CUdeviceptr multipliers, size_t len, + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, int xstep, int offset, float border_ratio) { ocu_args_d_t &ocl = getOcu(); @@ -216,15 +221,16 @@ void cuConvolutionXEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } void cuConvolutionYEx( - CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio) + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) { ocu_args_d_t &ocl = getOcu(); @@ -235,13 +241,14 @@ void cuConvolutionYEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } void cuSquareSampleEx( - CUdeviceptr result/*out*/, - const CUdeviceptr image, size_t xsize, size_t ysize, + cu_mem result/*out*/, + const cu_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep) { ocu_args_d_t &ocu = getOcu(); @@ -253,13 +260,14 @@ void cuSquareSampleEx( 1, 1, 1, 0, ocu.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocu.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocu.stream); + LOG_CU_RESULT(err); } -void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize, +void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, - CUdeviceptr result/*out, opt*/) + cu_mem result/*out, opt*/) { double m = 2.25; // Accuracy increases when m is increased. 
const double scaler = -1.0 / (2 * sigma * sigma); @@ -273,23 +281,23 @@ void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ys const int xstep = std::max(1, int(sigma / 3)); - ocu_args_d_t &ocu = getOcu(); - CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); + ocu_args_d_t &ocl = getOcu(); + cu_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); if (xstep > 1) { - CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); - cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); - cuMemFree(srcA); + cuMemFree(m); } else { - CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize); - cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuMemFree(srcA); + cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? 
result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuMemFree(m); } cuMemFree(mem_expn); @@ -315,8 +323,9 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t 1, 1, 1, 0, ocl.stream, args, NULL); - - r = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + r = cuFinish(ocl.stream); + LOG_CU_RESULT(err); ocl.releaseMemChannels(rgb_blurred); } @@ -333,14 +342,16 @@ void cuMaskHighIntensityChangeEx( ocu_channels c0 = ocl.allocMemChannels(channel_size); ocu_channels c1 = ocl.allocMemChannels(channel_size); - cuMemcpyDtoD(c0.r, xyb0.r, channel_size); - cuMemcpyDtoD(c0.g, xyb0.g, channel_size); - cuMemcpyDtoD(c0.b, xyb0.b, channel_size); - cuMemcpyDtoD(c1.r, xyb1.r, channel_size); - cuMemcpyDtoD(c1.g, xyb1.g, channel_size); - cuMemcpyDtoD(c1.b, xyb1.b, channel_size); + cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.stream); + cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.stream); + cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.stream); + cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.stream); + cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.stream); + cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.stream); + cuFinish(ocl.stream); - const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, + const void *args[] = { + &xyb0.r, &xyb0.g, &xyb0.b, &xyb1.r, &xyb1.g, &xyb1.b, &c0.r, &c0.g, &c0.b, &c1.r, &c1.g, &c1.b }; @@ -350,15 +361,16 @@ void cuMaskHighIntensityChangeEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); ocl.releaseMemChannels(c0); ocl.releaseMemChannels(c1); } void cuEdgeDetectorMapEx( - CUdeviceptr result/*out*/, + cu_mem result/*out*/, const ocu_channels &rgb, const ocu_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { @@ -390,16 +402,17 @@ void cuEdgeDetectorMapEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = 
cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); ocl.releaseMemChannels(rgb_blured); ocl.releaseMemChannels(rgb2_blured); } void cuBlockDiffMapEx( - CUdeviceptr block_diff_dc/*out*/, - CUdeviceptr block_diff_ac/*out*/, + cu_mem block_diff_dc/*out*/, + cu_mem block_diff_ac/*out*/, const ocu_channels &rgb, const ocu_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { @@ -418,12 +431,13 @@ void cuBlockDiffMapEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } void cuEdgeDetectorLowFreqEx( - CUdeviceptr block_diff_ac/*in,out*/, + cu_mem block_diff_ac/*in,out*/, const ocu_channels &rgb, const ocu_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { @@ -454,8 +468,9 @@ void cuEdgeDetectorLowFreqEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); ocl.releaseMemChannels(rgb_blured); ocl.releaseMemChannels(rgb2_blured); @@ -477,11 +492,12 @@ void cuDiffPrecomputeEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } -void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w) +void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) { ocu_args_d_t &ocl = getOcu(); @@ -492,11 +508,12 @@ void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w) 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } -void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize) +void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize) { if (xsize < 4 || ysize < 4) 
{ // TODO: Make this work for small dimensions as well. @@ -506,7 +523,7 @@ void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ocu_args_d_t &ocl = getOcu(); size_t len = xsize * ysize * sizeof(float); - CUdeviceptr img_org = ocl.allocMem(len); + cu_mem img_org = ocl.allocMem(len); cuMemcpyDtoD(img_org, img, len); @@ -517,20 +534,21 @@ void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); cuMemFree(img_org); } void cuMinSquareValEx( - CUdeviceptr img/*in,out*/, + cu_mem img/*in,out*/, const size_t xsize, const size_t ysize, const size_t square_size, const size_t offset) { ocu_args_d_t &ocl = getOcu(); - CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize); + cu_mem srcA = ocl.allocMem(sizeof(float) * xsize * ysize); const void *args[] = { &srcA, &img, &square_size, &offset }; @@ -539,9 +557,9 @@ void cuMinSquareValEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize); cuMemFree(srcA); } @@ -656,8 +674,9 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); ocl.releaseMemChannels(xyb); ocl.releaseMemChannels(xyb_dc); @@ -693,13 +712,13 @@ void cuMaskEx( } void cuCombineChannelsEx( - CUdeviceptr result/*out*/, + cu_mem result/*out*/, const ocu_channels &mask, const ocu_channels &mask_dc, const size_t xsize, const size_t ysize, - const CUdeviceptr block_diff_dc, - const CUdeviceptr block_diff_ac, - const CUdeviceptr edge_detector_map, + const cu_mem block_diff_dc, + const cu_mem 
block_diff_ac, + const cu_mem edge_detector_map, const size_t res_xsize, const size_t step) { @@ -722,15 +741,16 @@ void cuCombineChannelsEx( 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } -void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step) +void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step) { ocu_args_d_t &ocl = getOcu(); - CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); + cu_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); const void *args[] = { &diffmap_out, &diffmap, @@ -745,15 +765,15 @@ void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_ 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); - + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); cuMemFree(diffmap_out); } -void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step) +void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step) { ocu_args_d_t &ocl = getOcu(); @@ -771,11 +791,12 @@ void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } -void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in) +void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in) { ocu_args_d_t &ocl = getOcu(); @@ -792,11 +813,12 @@ void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdevi 1, 1, 1, 0, ocl.stream, (void**)args, NULL); - - err = 
cuStreamSynchronize(ocl.stream); + LOG_CU_RESULT(err); + err = cuFinish(ocl.stream); + LOG_CU_RESULT(err); } -void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) +void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) { cuUpsampleSquareRootEx(diffmap, xsize, ysize, step); @@ -808,7 +830,7 @@ void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, con int s2 = (8 - step) / 2; ocu_args_d_t &ocl = getOcu(); - CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + cu_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step); static const double border_ratio = 0.03027655136; diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h index e9dddde6..81ec377b 100644 --- a/clguetzli/cuguetzli.h +++ b/clguetzli/cuguetzli.h @@ -35,25 +35,25 @@ void cuMask( const float* r2, const float* g2, const float* b2); void cuConvolutionXEx( - CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio); + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); void cuConvolutionYEx( - CUdeviceptr result/*out*/, - const CUdeviceptr inp, size_t xsize, size_t ysize, - const CUdeviceptr multipliers, size_t len, - int xstep, int offset, double border_ratio); + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); void cuSquareSampleEx( - CUdeviceptr result/*out*/, - const CUdeviceptr image, size_t xsize, size_t ysize, + cu_mem result/*out*/, + const cu_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep); -void cuBlurEx(CUdeviceptr image/*out, 
opt*/, const size_t xsize, const size_t ysize, +void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, - CUdeviceptr result = NULL/*out, opt*/); + cu_mem result = NULL/*out, opt*/); void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); @@ -63,18 +63,18 @@ void cuMaskHighIntensityChangeEx( const size_t xsize, const size_t ysize); void cuEdgeDetectorMapEx( - CUdeviceptr result/*out*/, + cu_mem result/*out*/, const ocu_channels &rgb, const ocu_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step); void cuBlockDiffMapEx( - CUdeviceptr block_diff_dc/*out*/, - CUdeviceptr block_diff_ac/*out*/, + cu_mem block_diff_dc/*out*/, + cu_mem block_diff_ac/*out*/, const ocu_channels &rgb, const ocu_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step); void cuEdgeDetectorLowFreqEx( - CUdeviceptr block_diff_ac/*in,out*/, + cu_mem block_diff_ac/*in,out*/, const ocu_channels &rgb, const ocu_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step); @@ -83,12 +83,12 @@ void cuDiffPrecomputeEx( const ocu_channels &xyb0, const ocu_channels &xyb1, const size_t xsize, const size_t ysize); -void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w); +void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w); -void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize); +void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize); void cuMinSquareValEx( - CUdeviceptr img/*in,out*/, + cu_mem img/*in,out*/, const size_t xsize, const size_t ysize, const size_t square_size, const size_t offset); @@ -98,22 +98,22 @@ void cuMaskEx( const size_t xsize, const size_t ysize); void cuCombineChannelsEx( - CUdeviceptr result/*out*/, + cu_mem result/*out*/, const ocu_channels &mask, const ocu_channels &mask_dc, const size_t xsize, const size_t ysize, - const CUdeviceptr 
block_diff_dc, - const CUdeviceptr block_diff_ac, - const CUdeviceptr edge_detector_map, + const cu_mem block_diff_dc, + const cu_mem block_diff_ac, + const cu_mem edge_detector_map, const size_t res_xsize, const size_t step); -void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step); +void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step); -void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step); +void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step); -void cuAddBorderEx(CUdeviceptr out, const size_t xsize, const size_t ysize, const int step, const CUdeviceptr in); +void cuAddBorderEx(cu_mem out, const size_t xsize, const size_t ysize, const int step, const cu_mem in); -void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); +void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); #endif \ No newline at end of file diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 5218ce9b..639ad68e 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -1,12 +1,61 @@ #include "ocl.h" #include -#ifdef __linux__ -#include -#define _aligned_malloc memalign -#define _aligned_free free -#endif #include +ocl_args_d_t& getOcl(void) +{ + static bool bInit = false; + static ocl_args_d_t ocl; + + if (bInit == true) return ocl; + + bInit = true; + cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); + LOG_CL_RESULT(err); + + char* source = nullptr; + size_t src_size = 0; + ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); + + ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); + + delete[] source; + + err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL); + LOG_CL_RESULT(err); + if 
(CL_BUILD_PROGRAM_FAILURE == err) + { + size_t log_size = 0; + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + + std::vector build_log(log_size); + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); + + LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); + } + + ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err); + ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err); + ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err); + ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChangeEx", &err); + ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err); + ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err); + ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err); + ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err); + ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err); + ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err); + ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err); + ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err); + ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err); + ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err); + ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err); + 
ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err); + + return ocl; +} ocl_args_d_t::ocl_args_d_t() : context(NULL), @@ -23,17 +72,6 @@ ocl_args_d_t::ocl_args_d_t() : } } -/* -* destructor - called only once -* Release all OpenCL objects -* This is a regular sequence of calls to deallocate all created OpenCL resources in bootstrapOpenCL. -* -* You may want to call these deallocation procedures in the middle of your application execution -* (not at the end) if you don't further need OpenCL runtime. -* You may want to do that in order to free some memory, for example, -* or recreate OpenCL objects with different parameters. -* -*/ ocl_args_d_t::~ocl_args_d_t() { cl_int err = CL_SUCCESS; @@ -45,16 +83,7 @@ ocl_args_d_t::~ocl_args_d_t() LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); } } -/* - if (kernel) - { - err = clReleaseKernel(kernel); - if (CL_SUCCESS != err) - { - LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); - } - } -*/ + if (program) { err = clReleaseProgram(program); diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index 13eb232b..a9573fa6 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -8,48 +8,12 @@ #define OPENCL_VERSION_1_2 1.2f #define OPENCL_VERSION_2_0 2.0f -struct ocl_args_d_t; - -/* This function helps to create informative messages in -* case when OpenCL errors occur. It returns a string -* representation for an OpenCL error code. -* (E.g. "CL_DEVICE_NOT_FOUND" instead of just -1.) -*/ -const char* TranslateOpenCLError(cl_int errorCode); - -/* -* This function picks/creates necessary OpenCL objects which are needed. -* The objects are: -* OpenCL platform, device, context, and command queue. -* -* All these steps are needed to be performed once in a regular OpenCL application. 
-* This happens before actual compute kernels calls are performed. -* -* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t, -* so this function populates fields of this structure, which is passed as parameter ocl. -* Please, consider reviewing the fields before going further. -* The structure definition is right in the beginning of this file. -*/ -int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); - - -/* Convenient container for all OpenCL specific objects used in the sample -* -* It consists of two parts: -* - regular OpenCL objects which are used in almost each normal OpenCL applications -* - several OpenCL objects that are specific for this particular sample -* -* You collect all these objects in one structure for utility purposes -* only, there is no OpenCL specific here: just to avoid global variables -* and make passing all these arguments in functions easier. -*/ - enum KernelName { - KERNEL_CONVOLUTION = 0, - KERNEL_CONVOLUTIONX, - KERNEL_CONVOLUTIONY, - KERNEL_SQUARESAMPLE, - KERNEL_OPSINDYNAMICSIMAGE, + KERNEL_CONVOLUTION = 0, + KERNEL_CONVOLUTIONX, + KERNEL_CONVOLUTIONY, + KERNEL_SQUARESAMPLE, + KERNEL_OPSINDYNAMICSIMAGE, KERNEL_MASKHIGHINTENSITYCHANGE, KERNEL_EDGEDETECTOR, KERNEL_BLOCKDIFFMAP, @@ -59,16 +23,24 @@ enum KernelName { KERNEL_AVERAGE5X5, KERNEL_MINSQUAREVAL, KERNEL_DOMASK, - KERNEL_COMBINECHANNELS, - KERNEL_UPSAMPLESQUAREROOT, + KERNEL_COMBINECHANNELS, + KERNEL_UPSAMPLESQUAREROOT, KERNEL_REMOVEBORDER, - KERNEL_ADDBORDER, - KERNEL_COMPUTEBLOCKZEROINGORDER, - KERNEL_COUNT, + KERNEL_ADDBORDER, + KERNEL_COMPUTEBLOCKZEROINGORDER, + KERNEL_COUNT, }; #define LOG_CL_RESULT(e) if (CL_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateOpenCLError((e)));} +struct ocl_args_d_t; + +const char* TranslateOpenCLError(cl_int errorCode); + +int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); + +ocl_args_d_t& getOcl(void); + struct ocl_args_d_t 
{ ocl_args_d_t(); diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 6fbf58ee..d15733c9 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -1,6 +1,6 @@ +#include "ocu.h" #include #include -#include "ocu.h" #ifdef __USE_CUDA__ @@ -113,9 +113,9 @@ ocu_args_d_t::~ocu_args_d_t() // cuStreamDestroy(stream); } -CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init) +cu_mem ocu_args_d_t::allocMem(size_t s, const void *init) { - CUdeviceptr mem; + cu_mem mem; cuMemAlloc(&mem, s); if (init) { @@ -151,4 +151,8 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb) } } +const char* TranslateCUDAError(CUresult errorCode) +{ + return "Unknwon"; +} #endif \ No newline at end of file diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h index 4c34edaf..426019cc 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -5,8 +5,12 @@ #include #include "ocl.h" +#define LOG_CU_RESULT(e) if (CUDA_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateCUDAError((e)));} + struct ocu_args_d_t; +const char* TranslateCUDAError(CUresult errorCode); + ocu_args_d_t& getOcu(void); struct ocu_args_d_t @@ -14,7 +18,7 @@ struct ocu_args_d_t ocu_args_d_t(); ~ocu_args_d_t(); - CUdeviceptr allocMem(size_t s, const void *init = NULL); + cu_mem allocMem(size_t s, const void *init = NULL); ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); void releaseMemChannels(ocu_channels &rgb); From 1cb6e526d4629896bd4e513d0307a42aa20b5de1 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 4 Jun 2017 22:30:07 +0800 Subject: [PATCH 132/189] =?UTF-8?q?cu=E7=BC=96=E8=AF=91=E6=94=B9=E5=9B=9En?= =?UTF-8?q?vcc=E6=8F=90=E5=89=8D=E7=BC=96=E8=AF=91=20=E7=BB=A7=E7=BB=AD?= =?UTF-8?q?=E7=AE=80=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 22 ++--- clguetzli/cuguetzli.cpp | 189 +++++++++++++++++++++------------------- 
clguetzli/ocu.cpp | 138 ++++++++++++++++++++++------- clguetzli/ocu.h | 2 +- compile.bat | 5 +- guetzli.vcxproj | 20 +++-- 6 files changed, 231 insertions(+), 145 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 15feb7d1..4495e935 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -97,7 +97,6 @@ void clComputeBlockZeroingOrder( using namespace guetzli; - cl_int err = 0; ocl_args_d_t &ocl = getOcl(); cl_mem mem_orig_coeff[3]; @@ -132,7 +131,7 @@ void clComputeBlockZeroingOrder( &mem_output_order_batch); size_t globalWorkSize[2] = { blockf_width, blockf_height }; - err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); LOG_CL_RESULT(err); @@ -195,8 +194,7 @@ void clConvolutionEx( size_t oxsize = (xsize + xstep - 1) / xstep; cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; - clSetKernelArgEx(kernel, - &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { oxsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -214,8 +212,7 @@ void clConvolutionXEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; - clSetKernelArgEx(kernel, - &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -233,8 +230,7 @@ void clConvolutionYEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = 
ocl.kernel[KERNEL_CONVOLUTIONY]; - clSetKernelArgEx(kernel, - &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -312,8 +308,7 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; - clSetKernelArgEx(kernel, - &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); + clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); size_t globalWorkSize[1] = { xsize * ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -367,6 +362,7 @@ void clEdgeDetectorMapEx( const size_t xsize, const size_t ysize, const size_t step) { size_t channel_size = xsize * ysize * sizeof(float); + ocl_args_d_t &ocl = getOcl(); ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); @@ -470,7 +466,7 @@ void clDiffPrecomputeEx( cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, &xyb0.x, &xyb0.y, &xyb0.b, - &xyb1.x, &xyb1.y, &xyb1.b); + &xyb1.x, &xyb1.y, &xyb1.b); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -484,7 +480,7 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; - clSetKernelArgEx(kernel, &img, &w); + clSetKernelArgEx(kernel, &img, &w); size_t globalWorkSize[1] = { size }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -704,7 
+700,7 @@ void clCombineChannelsEx( cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; clSetKernelArgEx(kernel, &result, - &mask.r, &mask.g, &mask.b, + &mask.r, &mask.g, &mask.b, &mask_dc.r, &mask_dc.g, &mask_dc.b, &xsize, &ysize, &block_diff_dc, &block_diff_ac, diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 9ffae8b4..0a464a77 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -74,7 +74,6 @@ void cuDiffmapOpsinDynamicsImage( cuMemFree(mem_result); } - void cuComputeBlockZeroingOrder( guetzli::CoeffData *output_order_batch, const channel_info orig_channel[3], @@ -94,7 +93,6 @@ void cuComputeBlockZeroingOrder( using namespace guetzli; - cl_int err = 0; ocu_args_d_t &ocl = getOcu(); cu_mem mem_orig_coeff[3]; @@ -116,6 +114,7 @@ void cuComputeBlockZeroingOrder( int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; cu_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); + CUfunction kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], &mem_orig_image, &mem_mask_scale, &image_width, &image_height, @@ -127,14 +126,14 @@ void cuComputeBlockZeroingOrder( &BlockErrorLimit, &mem_output_order_batch }; - err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], + CUresult err = cuLaunchKernel(kernel, blockf_width, blockf_height, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); @@ -170,12 +169,13 @@ void cuMask( cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); - cuMemcpyDtoH(mask_r, mask.r, channel_size); - cuMemcpyDtoH(mask_g, mask.g, channel_size); - cuMemcpyDtoH(mask_b, mask.b, channel_size); - cuMemcpyDtoH(maskdc_r, mask_dc.r, 
channel_size); - cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size); - cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size); + cuMemcpyDtoHAsync(mask_r, mask.r, channel_size, ocl.commandQueue); + cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocl.commandQueue); + cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocl.commandQueue); + cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocl.commandQueue); + cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocl.commandQueue); + cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocl.commandQueue); + cuFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); ocl.releaseMemChannels(rgb2); @@ -193,15 +193,16 @@ void cuConvolutionEx( size_t oxsize = (xsize + xstep - 1) / xstep; + CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTION]; const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONX], + CUresult err = cuLaunchKernel(kernel, oxsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -214,15 +215,16 @@ void cuConvolutionXEx( { ocu_args_d_t &ocl = getOcu(); + CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONX], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -234,15 +236,16 @@ void cuConvolutionYEx( { ocu_args_d_t &ocl = getOcu(); + CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; - CUresult err = 
cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -251,17 +254,18 @@ void cuSquareSampleEx( const cu_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep) { - ocu_args_d_t &ocu = getOcu(); + ocu_args_d_t &ocl = getOcu(); + CUfunction kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; const void *args[] = { &result, &image, &xstep, &ystep }; - CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocu.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocu.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -316,15 +320,16 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + CUfunction kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE], + CUresult err = cuLaunchKernel(kernel, xsize * ysize, 1, 1, 1, 1, 1, 0, - ocl.stream, args, NULL); + ocl.commandQueue, args, NULL); LOG_CU_RESULT(err); - r = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); ocl.releaseMemChannels(rgb_blurred); @@ -342,27 +347,28 @@ void cuMaskHighIntensityChangeEx( ocu_channels c0 = ocl.allocMemChannels(channel_size); ocu_channels c1 = ocl.allocMemChannels(channel_size); - cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.stream); - cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.stream); - cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.stream); - 
cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.stream); - cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.stream); - cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.stream); - cuFinish(ocl.stream); + cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.commandQueue); + cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.commandQueue); + cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.commandQueue); + cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.commandQueue); + cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.commandQueue); + cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.commandQueue); + cuFinish(ocl.commandQueue); + CUfunction kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, &xyb1.r, &xyb1.g, &xyb1.b, &c0.r, &c0.g, &c0.b, &c1.r, &c1.g, &c1.b }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); ocl.releaseMemChannels(c0); @@ -389,6 +395,7 @@ void cuEdgeDetectorMapEx( cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); } + CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; const void *args[] = { &result, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, @@ -397,13 +404,13 @@ void cuEdgeDetectorMapEx( const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR], + CUresult err = cuLaunchKernel(kernel, res_xsize, res_ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); ocl.releaseMemChannels(rgb_blured); @@ -418,6 
+425,7 @@ void cuBlockDiffMapEx( { ocu_args_d_t &ocl = getOcu(); + CUfunction kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; const void *args[] = { &block_diff_dc, &block_diff_ac, &rgb.r, &rgb.g, &rgb.b, &rgb2.r, &rgb2.g, &rgb2.b, @@ -426,13 +434,13 @@ void cuBlockDiffMapEx( const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP], + CUresult err = cuLaunchKernel(kernel, res_xsize, res_ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -455,6 +463,7 @@ void cuEdgeDetectorLowFreqEx( cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); } + CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; const void *args[] = { &block_diff_ac, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, @@ -463,13 +472,13 @@ void cuEdgeDetectorLowFreqEx( const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ], + CUresult err = cuLaunchKernel(kernel, res_xsize, res_ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); ocl.releaseMemChannels(rgb_blured); @@ -483,17 +492,18 @@ void cuDiffPrecomputeEx( { ocu_args_d_t &ocl = getOcu(); + CUfunction kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; const void *args[] = { &mask.x, &mask.y, &mask.b, &xyb0.x, &xyb0.y, &xyb0.b, &xyb1.x, &xyb1.y, &xyb1.b }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); 
LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -501,15 +511,16 @@ void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) { ocu_args_d_t &ocl = getOcu(); + CUfunction kernel = ocl.kernel[KERNEL_SCALEIMAGE]; const void *args[] = { &img, &w }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE], + CUresult err = cuLaunchKernel(kernel, size, 1, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -527,15 +538,16 @@ void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize cuMemcpyDtoD(img_org, img, len); + CUfunction kernel = ocl.kernel[KERNEL_AVERAGE5X5]; const void *args[] = { &img, &img_org }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); cuMemFree(img_org); @@ -548,20 +560,21 @@ void cuMinSquareValEx( { ocu_args_d_t &ocl = getOcu(); - cu_mem srcA = ocl.allocMem(sizeof(float) * xsize * ysize); + cu_mem result = ocl.allocMem(sizeof(float) * xsize * ysize); - const void *args[] = { &srcA, &img, &square_size, &offset }; + CUfunction kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + const void *args[] = { &result, &img, &square_size, &offset }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); - cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize); - cuMemFree(srcA); + cuMemcpyDtoD(img, result, 
sizeof(float) * xsize * ysize); + cuMemFree(result); } static void MakeMask(double extmul, double extoff, @@ -664,18 +677,19 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); + CUfunction kernel = ocl.kernel[KERNEL_DOMASK]; const void *args[] = { &mask.r, &mask.g, &mask.b, &mask_dc.r, &mask_dc.g, &mask_dc.b, &xyb.x, &xyb.y, &xyb.b, &xyb_dc.x, &xyb_dc.y, &xyb_dc.b }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); ocl.releaseMemChannels(xyb); @@ -727,6 +741,7 @@ void cuCombineChannelsEx( const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + CUfunction kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; const void *args[] = { &result, &mask.r, &mask.g, &mask.b, &mask_dc.r, &mask_dc.g, &mask_dc.b, @@ -736,13 +751,13 @@ void cuCombineChannelsEx( &res_xsize, &step }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS], + CUresult err = cuLaunchKernel(kernel, work_xsize, work_ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -752,21 +767,19 @@ void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysi cu_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); - const void *args[] = { &diffmap_out, - &diffmap, - &xsize, &ysize, - &step }; + CUfunction kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; + const void *args[] = { &diffmap_out, &diffmap, 
&xsize, &ysize, &step }; const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT], + CUresult err = cuLaunchKernel(kernel, res_xsize, res_ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); @@ -780,19 +793,16 @@ void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const siz int cls = 8 - step; int cls2 = (8 - step) / 2; - const void *args[] = { &out, - &in, - &xsize, - &cls, - &cls2 }; + CUfunction kernel = ocl.kernel[KERNEL_REMOVEBORDER]; + const void *args[] = { &out, &in, &xsize, &cls, &cls2 }; - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER], + CUresult err = cuLaunchKernel(kernel, xsize - cls, ysize - cls, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } @@ -802,19 +812,16 @@ void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in) int cls = 8 - step; int cls2 = (8 - step) / 2; + CUfunction kernel = ocl.kernel[KERNEL_ADDBORDER]; + const void *args[] = { &out, &cls, &cls2, &in }; - const void *args[] = { &out, - &cls, - &cls2, - &in }; - - CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER], + CUresult err = cuLaunchKernel(kernel, xsize, ysize, 1, 1, 1, 1, 0, - ocl.stream, (void**)args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.stream); + err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); } diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index d15733c9..8923bcd4 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -13,12 +13,14 @@ ocu_args_d_t& getOcu(void) bInit = 
true; - CUresult r = cuInit(0); + CUresult err = cuInit(0); + LOG_CU_RESULT(err); CUdevice dev = 0; CUcontext ctxt; CUstream stream; - r = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev); + err = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev); + LOG_CU_RESULT(err); char name[1024]; int proc_count = 0; @@ -30,7 +32,7 @@ ocu_args_d_t& getOcu(void) cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); LogError("CUDA Adapter:%s Ver%d.%d MP %d Core %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); - +/* char* source = nullptr; size_t src_size = 0; ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); @@ -38,7 +40,7 @@ ocu_args_d_t& getOcu(void) nvrtcProgram prog; const char *opts[] = { "-arch=compute_30", "-default-device", "-G", "-I\"./\"", "--fmad=false" }; nvrtcCreateProgram(&prog, source, "clguetzli.cl", 0, NULL, NULL); - nvrtcResult compile_result = nvrtcCompileProgram(prog, 3, opts); + nvrtcResult compile_result;// = nvrtcCompileProgram(prog, 3, opts); if (NVRTC_SUCCESS != compile_result) { // Obtain compilation log from the program. @@ -52,41 +54,51 @@ ocu_args_d_t& getOcu(void) delete[] log; } + delete[] source; // Obtain PTX from the program. 
size_t ptxSize = 0; nvrtcGetPTXSize(prog, &ptxSize); char *ptx = new char[ptxSize]; nvrtcGetPTX(prog, ptx); +*/ + + char* ptx = nullptr; + size_t src_size = 0; +#ifdef _WIN64 + ReadSourceFromFile("clguetzli/clguetzli.cu.ptx64", &ptx, &src_size); +#else + ReadSourceFromFile("clguetzli/clguetzli.cu.ptx32", &ptx, &src_size); +#endif CUmodule mod; CUjit_option jit_options[2]; void *jit_optvals[2]; jit_options[0] = CU_JIT_CACHE_MODE; jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA; - r = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals); + err = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals); + LOG_CU_RESULT(err); - delete[] source; delete[] ptx; - r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, 
"clCombineChannelsEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx"); - r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, "clComputeBlockZeroingOrderEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex"); + cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, "clCombineChannelsEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, 
"clComputeBlockZeroingOrderEx"); cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE); @@ -94,7 +106,7 @@ ocu_args_d_t& getOcu(void) cuStreamCreate(&stream, 0); ocu.dev = dev; - ocu.stream = stream; + ocu.commandQueue = stream; ocu.mod = mod; ocu.ctxt = ctxt; @@ -102,6 +114,10 @@ ocu_args_d_t& getOcu(void) } ocu_args_d_t::ocu_args_d_t() + : dev(0) + , commandQueue(NULL) + , mod(NULL) + , ctxt(NULL) { } @@ -110,7 +126,7 @@ ocu_args_d_t::~ocu_args_d_t() { cuModuleUnload(mod); cuCtxDestroy(ctxt); -// cuStreamDestroy(stream); +// cuStreamDestroy(commandQueue); } cu_mem ocu_args_d_t::allocMem(size_t s, const void *init) @@ -119,11 +135,11 @@ cu_mem ocu_args_d_t::allocMem(size_t s, const void *init) cuMemAlloc(&mem, s); if (init) { - cuMemcpyHtoD(mem, init, s); + cuMemcpyHtoDAsync(mem, init, s, commandQueue); } else { - cuMemsetD8(mem, 0, s); + cuMemsetD8Async(mem, 0, s, commandQueue); } return mem; @@ -153,6 +169,68 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb) const char* TranslateCUDAError(CUresult errorCode) { - return "Unknwon"; + switch (errorCode) + { + case CUDA_SUCCESS: return "CUDA_SUCCESS"; + case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE"; + case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY"; + case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED"; + case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED"; + case CUDA_ERROR_PROFILER_DISABLED: return "CUDA_ERROR_PROFILER_DISABLED"; + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; + case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; + case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE"; + case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE"; + case CUDA_ERROR_INVALID_IMAGE: return 
"CUDA_ERROR_INVALID_IMAGE"; + case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT"; + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; + case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED"; + case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED"; + case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED"; + case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED"; + case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU"; + case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED"; + case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED"; + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; + case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE"; + case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT"; + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; + case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; + case CUDA_ERROR_INVALID_PTX: return "CUDA_ERROR_INVALID_PTX"; + case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; + case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; + case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE"; + case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND"; + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; + case CUDA_ERROR_OPERATING_SYSTEM: return "CUDA_ERROR_OPERATING_SYSTEM"; + case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE"; + case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND"; + case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY"; + 
case CUDA_ERROR_ILLEGAL_ADDRESS: return "CUDA_ERROR_ILLEGAL_ADDRESS"; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT"; + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + case CUDA_ERROR_ASSERT: return "CUDA_ERROR_ASSERT"; + case CUDA_ERROR_TOO_MANY_PEERS: return "CUDA_ERROR_TOO_MANY_PEERS"; + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + case CUDA_ERROR_HARDWARE_STACK_ERROR: return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + case CUDA_ERROR_ILLEGAL_INSTRUCTION: return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + case CUDA_ERROR_MISALIGNED_ADDRESS: return "CUDA_ERROR_MISALIGNED_ADDRESS"; + case CUDA_ERROR_INVALID_ADDRESS_SPACE: return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + case CUDA_ERROR_INVALID_PC: return "CUDA_ERROR_INVALID_PC"; + case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED"; + case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED"; + case CUDA_ERROR_NOT_SUPPORTED: return "CUDA_ERROR_NOT_SUPPORTED"; + case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN"; + default: return "CUDA_ERROR_UNKNOWN"; + } } #endif \ No newline at end of file diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h index 426019cc..dbc42916 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -23,7 +23,7 @@ struct ocu_args_d_t void releaseMemChannels(ocu_channels &rgb); CUfunction kernel[KERNEL_COUNT]; - CUstream stream; + CUstream 
commandQueue; CUmodule mod; CUcontext ctxt; CUdevice dev; diff --git a/compile.bat b/compile.bat index 05a3a361..156ee639 100644 --- a/compile.bat +++ b/compile.bat @@ -1,4 +1,7 @@ @rem setupt windows var call vcvars64.bat -nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine 64 -G -g -ptx -o clguetzli\clguetzli.cu.ptx64 clguetzli\clguetzli.cu \ No newline at end of file +@echo %1 --machine 64 or 32 +@echo %2 -G + +nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1 clguetzli\clguetzli.cu \ No newline at end of file diff --git a/guetzli.vcxproj b/guetzli.vcxproj index fc36b9a0..c8936a47 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -108,13 +108,13 @@ true false true - ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions) + __USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions) Console true true - nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64 @@ -174,7 +174,7 @@ Console true - nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64 @@ -396,22 +396,24 @@ Document - true - true Document - true CUDA Code Builder - $(ProjectDir)compile.bat - compile.bat + $(ProjectDir)compile.bat 64 -G + $(ProjectDir)compile.bat 32 -G false cu.ptx - $(ProjectDir)compile.bat + $(ProjectDir)compile.bat 64 -G CUDA Code Builder cu.ptx false true + $(ProjectDir)compile.bat 32 -G + CUDA Code Builder + cu.ptx + CUDA Code Builder + cu.ptx From cd2e614be6c963e028ab5413986e7f6adc4e8520 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 4 Jun 2017 23:48:25 +0800 Subject: [PATCH 133/189] =?UTF-8?q?=E6=9B=B4=E6=8D=A2mode=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp 
| 158 ++++++++++++++---------------- clguetzli/clguetzli.cpp | 4 +- clguetzli/clguetzli.h | 13 ++- guetzli.vcxproj | 4 +- guetzli/guetzli.cc | 15 ++- guetzli/processor.cc | 14 +-- 6 files changed, 103 insertions(+), 105 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 178e70e9..6e20976a 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -14,18 +14,20 @@ namespace butteraugli std::vector> &xyb1, std::vector &result) { - if (g_useOpenCL && xsize_ > 100 && ysize_ > 100) + if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100) { result.resize(xsize_ * ysize_); clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); } - else if (g_useCuda && xsize_ > 100 && ysize_ > 100) +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode && xsize_ > 100 && ysize_ > 100) { result.resize(xsize_ * ysize_); - clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + cuDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); } +#endif else { ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result); @@ -39,7 +41,7 @@ namespace butteraugli { ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac); - if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), @@ -55,7 +57,7 @@ namespace butteraugli { ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map); - if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), 
xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), @@ -68,21 +70,19 @@ namespace butteraugli const std::vector > &xyb1, std::vector* block_diff_ac) { - std::vector orign_ac; - if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) - { - orign_ac = *block_diff_ac; - } - - ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); - - if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { + std::vector orign_ac = *block_diff_ac; + ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, orign_ac.data(), (*block_diff_ac).data()); } + else + { + ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); + } } void clButteraugliComparator::CombineChannels(const std::vector >& mask_xyb, @@ -92,55 +92,49 @@ namespace butteraugli const std::vector& edge_detector_map, std::vector* result) { - std::vector temp; - if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) - { - temp = *result; - } - - ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); - - if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8) + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { + std::vector temp = *result; temp.resize(res_xsize_ * res_ysize_); + ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), block_diff_dc.data(), block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]); } + else + { + ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); + } } void MinSquareVal(size_t 
square_size, size_t offset, size_t xsize, size_t ysize, float *values) { - std::vector img; - if (g_checkOpenCL && xsize > 8 && ysize > 8) + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { + std::vector img; img.resize(xsize * ysize); memcpy(img.data(), values, xsize * ysize * sizeof(float)); + _MinSquareVal(square_size, offset, xsize, ysize, values); + tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); } - - _MinSquareVal(square_size, offset, xsize, ysize, values); - - - if (g_checkOpenCL && xsize > 8 && ysize > 8) + else { - tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); + _MinSquareVal(square_size, offset, xsize, ysize, values); } } void Average5x5(int xsize, int ysize, std::vector* diffs) { - std::vector diffs_org; - if (g_checkOpenCL && xsize > 8 && ysize > 8) + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { - diffs_org = *diffs; + std::vector diffs_org = *diffs; + _Average5x5(xsize, ysize, diffs); + tclAverage5x5(xsize, ysize, diffs_org, *diffs); } - - _Average5x5(xsize, ysize, diffs); - - if (g_checkOpenCL && xsize > 8 && ysize > 8) + else { - tclAverage5x5(xsize, ysize, diffs_org, *diffs); + _Average5x5(xsize, ysize, diffs); } } @@ -148,7 +142,7 @@ namespace butteraugli { _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); - if (g_checkOpenCL && xsize > 8 && ysize > 8) + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); } @@ -160,7 +154,7 @@ namespace butteraugli std::vector > *mask, std::vector > *mask_dc) { - if (g_useOpenCL && xsize > 100 && ysize > 100) + if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) { mask->resize(3); mask_dc->resize(3); @@ -175,10 +169,9 @@ namespace butteraugli xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data() ); - return; } #ifdef __USE_CUDA__ - else if (g_useCuda && xsize > 100 && ysize > 100) + else if (MODE_CUDA == g_mathMode && xsize > 100 && 
ysize > 100) { mask->resize(3); mask_dc->resize(3); @@ -193,36 +186,36 @@ namespace butteraugli xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data() ); - return; } #endif - _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); - - if (g_checkOpenCL && xsize > 8 && ysize > 8) + else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize, ysize, (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); } + else + { + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); + } } void CalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, std::vector* diffmap) { - std::vector diffmap_org; - if (g_checkOpenCL && xsize > 8 && ysize > 8) + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { - diffmap_org = *diffmap; + std::vector diffmap_org = *diffmap; + _CalculateDiffmap(xsize, ysize, step, diffmap); + tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); } - - _CalculateDiffmap(xsize, ysize, step, diffmap); - - if (g_checkOpenCL && xsize > 8 && ysize > 8) + else { - tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); + _CalculateDiffmap(xsize, ysize, step, diffmap); } } @@ -235,7 +228,7 @@ namespace butteraugli { _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); - if (g_checkOpenCL && xsize > 8 && ysize > 8) + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), c1[0].data(), c1[1].data(), c1[2].data(), @@ -247,17 +240,15 @@ namespace butteraugli void ScaleImage(double scale, std::vector *result) { - std::vector result_org; - if (g_checkOpenCL && result->size() > 64) + if (MODE_CHECKCL == g_mathMode && 
result->size() > 64) { - result_org = *result; + std::vector result_org = *result; + _ScaleImage(scale, result); + tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); } - - _ScaleImage(scale, result); - - if (g_checkOpenCL && result->size() > 64) + else { - tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); + _ScaleImage(scale, result); } } @@ -271,7 +262,7 @@ namespace butteraugli { _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); - if (g_checkOpenCL && xsize > 8 && ysize > 8) + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); } @@ -280,25 +271,24 @@ namespace butteraugli void Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { - std::vector orignChannel; - if (g_checkOpenCL && xsize > 8 && ysize > 8) + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { + std::vector orignChannel; orignChannel.resize(xsize * ysize); memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); + _Blur(xsize, ysize, channel, sigma, border_ratio); + tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); } - - _Blur(xsize, ysize, channel, sigma, border_ratio); - - if (g_checkOpenCL && xsize > 8 && ysize > 8) + else { - tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); + _Blur(xsize, ysize, channel, sigma, border_ratio); } } void OpsinDynamicsImage(size_t xsize, size_t ysize, std::vector > &rgb) { - if (g_useOpenCL && xsize > 100 && ysize > 100) + if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) { float * r = rgb[0].data(); float * g = rgb[1].data(); @@ -307,7 +297,7 @@ namespace butteraugli clOpsinDynamicsImage(r, g, b, xsize, ysize); } #ifdef __USE_CUDA__ - else if (g_useCuda && xsize > 100 && ysize > 100) + else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100) { float * r = 
rgb[0].data(); float * g = rgb[1].data(); @@ -316,21 +306,17 @@ namespace butteraugli cuOpsinDynamicsImage(r, g, b, xsize, ysize); } #endif - else + else if (MODE_CHECKCL == g_mathMode && xsize > 8 & ysize > 8) { - std::vector< std::vector> orig_rgb; - if (g_checkOpenCL && xsize > 8 && ysize > 8) - { - orig_rgb = rgb; - } - + std::vector< std::vector> orig_rgb = rgb; _OpsinDynamicsImage(xsize, ysize, rgb); - - if (g_checkOpenCL && xsize > 8 && ysize > 8) - { - tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize, + tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), + xsize, ysize, rgb[0].data(), rgb[1].data(), rgb[2].data()); - } } + else + { + _OpsinDynamicsImage(xsize, ysize, rgb); + } } } \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 4495e935..1870a638 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -4,9 +4,7 @@ #include #include "cl.hpp" -extern bool g_useOpenCL = false; -extern bool g_useCuda = false; -extern bool g_checkOpenCL = false; +extern MATH_MODE g_mathMode = MODE_CPU; void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index ccdf24a8..cad4ef6e 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -7,9 +7,16 @@ #include "cuguetzli.h" -extern bool g_useOpenCL; -extern bool g_useCuda; -extern bool g_checkOpenCL; +enum MATH_MODE +{ + MODE_CPU = 0, + MODE_OPENCL, + MODE_CUDA, + MODE_CHECKCL, + MODE_CHECKCUDA +}; + +extern MATH_MODE g_mathMode; void clOpsinDynamicsImage( float *r, float *g, float *b, diff --git a/guetzli.vcxproj b/guetzli.vcxproj index c8936a47..6c41c349 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -108,7 +108,7 @@ true false true - __USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions) + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console @@ -192,7 +192,7 @@ Console true 
- nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup __tcmalloc $(CUDA_PATH)\lib\Win32 diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 587c06d4..276cb9d6 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -227,6 +227,8 @@ void Usage() { " the limit. Default limit is %d MB.\n" " --opencl - Use OpenCL\n" " --cuda - Use CUDA\n" + " --checkcl - Check OpenCL result\n" + " --checkcuda - Check CUDA result\n" " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } @@ -260,14 +262,17 @@ int main(int argc, char** argv) { memlimit_mb = -1; } else if (!strcmp(argv[opt_idx], "--opencl")) { - g_useOpenCL = true; + g_mathMode = MODE_OPENCL; } else if (!strcmp(argv[opt_idx], "--cuda")) { - g_useCuda = true; - } - else if (!strcmp(argv[opt_idx], "--checkcl")) { - g_checkOpenCL = true; + g_mathMode = MODE_CUDA; } + else if (!strcmp(argv[opt_idx], "--checkcl")) { + g_mathMode = MODE_CHECKCL; + } + else if (!strcmp(argv[opt_idx], "--checkcuda")) { + g_mathMode = MODE_CHECKCUDA; + } else if (!strcmp(argv[opt_idx], "--")) { opt_idx++; break; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 63ebb609..59b5dff6 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -567,7 +567,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co CoeffData * output_order = NULL; ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; - if (g_useOpenCL || g_checkOpenCL) + if (MODE_OPENCL == g_mathMode || MODE_CUDA == g_mathMode) { channel_info orig_channel[3]; channel_info mayout_channel[3]; @@ -588,7 +588,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co output_order_gpu.resize(num_blocks * kBlockSize); output_order = output_order_gpu.data(); - if (g_useCuda) + if (MODE_OPENCL == g_mathMode) { clComputeBlockZeroingOrder(output_order, orig_channel, 
@@ -601,9 +601,10 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co comp_mask, comp->BlockErrorLimit()); } +#ifdef __USE_CUDA__ else { - clComputeBlockZeroingOrder(output_order, + cuComputeBlockZeroingOrder(output_order, orig_channel, comp->imgOpsinDynamicsBlockList.data(), comp->imgMaskXyzScaleBlockList.data(), @@ -614,9 +615,10 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co comp_mask, comp->BlockErrorLimit()); } - +#endif } - if (!g_useOpenCL || g_checkOpenCL) + + if (MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode) { output_order_cpu.resize(num_blocks * kBlockSize); output_order = output_order_cpu.data(); @@ -651,7 +653,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co } } - if (g_checkOpenCL) + if (MODE_CHECKCL == g_mathMode) { int count = 0; int check_size = output_order_gpu.size(); From 598603b0217a5701584c0238f4a84d19666ac38a Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 5 Jun 2017 00:00:21 +0800 Subject: [PATCH 134/189] =?UTF-8?q?=E5=BC=82=E6=AD=A5=E6=8B=B7=E8=B4=9D?= =?UTF-8?q?=E5=86=85=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/cuguetzli.cpp | 8 ++++---- guetzli.vcxproj | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 0a464a77..c4ddc6c7 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -15,9 +15,10 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons cuOpsinDynamicsImageEx(rgb, xsize, ysize); - cuMemcpyDtoH(r, rgb.r, channel_size); - cuMemcpyDtoH(g, rgb.g, channel_size); - cuMemcpyDtoH(b, rgb.b, channel_size); + cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocl.commandQueue); + cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocl.commandQueue); + cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocl.commandQueue); + cuFinish(ocl.commandQueue); 
ocl.releaseMemChannels(rgb); } @@ -143,7 +144,6 @@ void cuComputeBlockZeroingOrder( cuMemFree(mem_orig_coeff[c]); cuMemFree(mem_mayout_coeff[c]); cuMemFree(mem_mayout_pixel[c]); - } cuMemFree(mem_orig_image); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 6c41c349..8bbeebc2 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -108,7 +108,7 @@ true false true - _UNICODE;UNICODE;%(PreprocessorDefinitions) + __USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions) Console From 8c29f1fba59e821b1dac29810ffba1b81159cd1e Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 5 Jun 2017 19:18:56 +0800 Subject: [PATCH 135/189] =?UTF-8?q?=E5=AE=8C=E6=88=90CUDA=E5=B9=B6?= =?UTF-8?q?=E8=A1=8C=E4=BC=98=E5=8C=96=EF=BC=8C=E8=AE=A1=E7=AE=97=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=E6=AD=A3=E5=B8=B8=20=E7=9B=AE=E5=89=8D=E9=80=9F?= =?UTF-8?q?=E5=BA=A6=E6=AF=94opencl=E7=95=A5=E5=B7=AE=EF=BC=8C=E5=BE=85?= =?UTF-8?q?=E5=88=86=E6=9E=90=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 90 ++++++++++++++++++++++++++-------------- clguetzli/clguetzli.cl.h | 12 +++--- clguetzli/clguetzli.cpp | 33 +++++++++------ clguetzli/cuguetzli.cpp | 77 +++++++++++++++++++--------------- compile.bat | 2 +- guetzli.vcxproj | 14 ++++--- 6 files changed, 141 insertions(+), 87 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 1e026fa9..8df441e9 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -106,6 +106,7 @@ __kernel void clConvolutionEx( __kernel void clConvolutionXEx( __global float* result, + const int xsize, const int ysize, __global const float* inp, __global const float* multipliers, const int len, const int step, const int offset, const float border_ratio) @@ -113,10 +114,12 @@ __kernel void clConvolutionXEx( const int x = get_global_id(0); const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + if (x % step != 0) return; - const int xsize = 
get_global_size(0); - const int ysize = get_global_size(1); +// const int xsize = get_global_size(0); +// const int ysize = get_global_size(1); float weight_no_border = 0; for (int j = 0; j <= 2 * offset; j++) @@ -147,6 +150,7 @@ __kernel void clConvolutionXEx( __kernel void clConvolutionYEx( __global float* result, + const int xsize, const int ysize, __global const float* inp, __global const float* multipliers, const int len, const int step, const int offset, const float border_ratio) @@ -154,11 +158,12 @@ __kernel void clConvolutionYEx( const int x = get_global_id(0); const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; if (x % step != 0) return; if (y % step != 0) return; - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); +// const int xsize = get_global_size(0); +// const int ysize = get_global_size(1); float weight_no_border = 0; for (int j = 0; j <= 2 * offset; j++) @@ -189,28 +194,33 @@ __kernel void clConvolutionYEx( __kernel void clSquareSampleEx( __global float* result, + const int xsize, const int ysize, __global const float* image, const int xstep, const int ystep) { const int x = get_global_id(0); const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; int x_sample = x - x % xstep; int y_sample = y - y % ystep; if (x_sample == x && y_sample == y) return; - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); +// const int xsize = get_global_size(0); +// const int ysize = get_global_size(1); result[y * xsize + x] = image[y_sample * xsize + x_sample]; } __kernel void clOpsinDynamicsImageEx( __global float *r, __global float *g, __global float *b, + const int size, __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred) { const int i = get_global_id(0); + if (i >= size) return; + double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; double pre_mixed[3]; OpsinAbsorbance(pre, pre_mixed); @@ -236,6 +246,7 @@ __kernel 
void clOpsinDynamicsImageEx( __kernel void clMaskHighIntensityChangeEx( __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + const int xsize, const int ysize, __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, __global const float *c0_x, __global const float *c0_y, __global const float *c0_b, __global const float *c1_x, __global const float *c1_y, __global const float *c1_b @@ -243,8 +254,9 @@ __kernel void clMaskHighIntensityChangeEx( { const int x = get_global_id(0); const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + if (x >= xsize || y >= ysize) return; +// const int xsize = get_global_size(0); + //const int ysize = get_global_size(1); size_t ix = y * xsize + x; const double ave[3] = { @@ -327,6 +339,7 @@ __kernel void clEdgeDetectorMapEx( __kernel void clBlockDiffMapEx( __global float* block_diff_dc, __global float* block_diff_ac, + const int res_xsize, const int res_ysize, __global const float* r, __global const float* g, __global const float* b, __global const float* r2, __global const float* g2, __global const float* b2, int xsize, int ysize, int step) @@ -334,8 +347,10 @@ __kernel void clBlockDiffMapEx( const int res_x = get_global_id(0); const int res_y = get_global_id(1); - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); +// const int res_xsize = get_global_size(0); +// const int res_ysize = get_global_size(1); + + if (res_x >= res_xsize || res_y >= res_ysize) return; int pos_x = res_x * step; int pos_y = res_y * step; @@ -450,13 +465,15 @@ __kernel void clEdgeDetectorLowFreqEx( __kernel void clDiffPrecomputeEx( __global float *mask_x, __global float *mask_y, __global float *mask_b, + const int xsize, const int ysize, __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b, __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b) { const int x = 
get_global_id(0); const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + if (x >= xsize || y >= ysize) return; +// const int xsize = get_global_size(0); + //const int ysize = get_global_size(1); double valsh0[3] = { 0.0 }; double valsv0[3] = { 0.0 }; @@ -514,20 +531,23 @@ __kernel void clDiffPrecomputeEx( mask_b[ix] = (float)(m); } -__kernel void clScaleImageEx(__global float *img, double scale) +__kernel void clScaleImageEx(__global float *img, const int size, float scale) { const int i = get_global_id(0); + if (i >= size) return; + img[i] *= scale; } #define Average5x5_w 0.679144890667f __constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w); -__kernel void clAverage5x5Ex(__global float *img, __global const float *img_org) +__kernel void clAverage5x5Ex(__global float *img, const int xsize, const int ysize, __global const float *img_org) { const int x = get_global_id(0); const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + if (x >= xsize || y >= ysize) return; +// const int xsize = get_global_size(0); +// const int ysize = get_global_size(1); const int row0 = y * xsize; if (x - 1 >= 0) { @@ -562,31 +582,33 @@ __kernel void clAverage5x5Ex(__global float *img, __global const float *img_org) img[row0 + x] *= Average5x5_scale; } -__kernel void clMinSquareValEx(__global float* result, __global const float* img, int square_size, int offset) +__kernel void clMinSquareValEx(__global float* result, const int xsize, const int ysize, __global const float* img, int square_size, int offset) { const int x = get_global_id(0); const int y = get_global_id(1); - const int width = get_global_size(0); - const int height = get_global_size(1); + + if (x >= xsize || y >= ysize) return; +// const int width = get_global_size(0); +// const int height = get_global_size(1); int minH = offset > y ? 
0 : y - offset; - int maxH = min(y + square_size - offset, height); + int maxH = min(y + square_size - offset, ysize); int minW = offset > x ? 0 : x - offset; - int maxW = min(x + square_size - offset, width); + int maxW = min(x + square_size - offset, xsize); - float minValue = img[minH * width + minW]; + float minValue = img[minH * xsize + minW]; for (int j = minH; j < maxH; j++) { for (int i = minW; i < maxW; i++) { - float tmp = img[j * width + i]; + float tmp = img[j * xsize + i]; if (tmp < minValue) minValue = tmp; } } - result[y * width + x] = minValue; + result[y * xsize + x] = minValue; } __kernel void clDoMaskEx( @@ -723,6 +745,8 @@ __kernel void clComputeBlockZeroingOrderEx( __global const coeff_t *orig_batch_2, // ԭʼͼÏñϵÊý __global const float *orig_image_batch, // ԭʼͼÏñpregamma __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý + const int block_xsize, + const int block_ysize, const int image_width, const int image_height, @@ -744,6 +768,8 @@ __kernel void clComputeBlockZeroingOrderEx( const int block_x = get_global_id(0); const int block_y = get_global_id(1); + if (block_x >= block_xsize || block_y >= block_ysize) return; + channel_info orig_channel[3]; orig_channel[0].coeff = orig_batch_0; orig_channel[1].coeff = orig_batch_1; @@ -3151,16 +3177,16 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], candidate_channel[c] = &candidate_block[c * 8 * 8]; } - uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image +// uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image for (int c = 0; c < 3; c++) { - if (mayout_channel[c].factor == 1) { - if (factor == 1) { +// if (mayout_channel[c].factor == 1) { + // if (factor == 1) { const coeff_t *coeff_block = candidate_channel[c]; CoeffToYUV8x8(coeff_block, &yuv8x8[c]); - } + /* } else { for (int iy = 0; iy < factor; ++iy) { for (int ix = 0; ix < factor; ++ix) { @@ -3182,7 +3208,8 @@ __device__ 
double CompareBlockFactor(const channel_info mayout_channel[3], } } } - } +*/ + /* } else { if (factor == 1) { int block_xx = block_x / mayout_channel[c].factor; @@ -3209,9 +3236,10 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], image_height); } } +*/ } - if (factor == 1) + // if (factor == 1) { float rgb0_c[3][kDCTBlockSize]; int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0); @@ -3224,6 +3252,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); } +/* else { int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16; @@ -3255,5 +3284,6 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], } return max_err; } +*/ } diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 6dec83c8..2a8ed044 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -117,9 +117,9 @@ { switch (dim) { - case 0: return blockIdx.x; - case 1: return blockIdx.y; - default: return blockIdx.z; + case 0: return blockIdx.x * blockDim.x + threadIdx.x; + case 1: return blockIdx.y * blockDim.y + threadIdx.y; + default: return blockIdx.z * blockDim.z + threadIdx.z; } } @@ -127,9 +127,9 @@ { switch(dim) { - case 0: return gridDim.x; - case 1: return gridDim.y; - default: return gridDim.z; + case 0: return gridDim.x * blockDim.x; + case 1: return gridDim.y * blockDim.y; + default: return gridDim.z * blockDim.z; } } diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 1870a638..11cb16ed 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -119,6 +119,7 @@ void clComputeBlockZeroingOrder( cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; clSetKernelArgEx(kernel, &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], &mem_orig_image, &mem_mask_scale, + &blockf_width, &blockf_height, 
&image_width, &image_height, &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], @@ -210,7 +211,7 @@ void clConvolutionXEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; - clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -228,7 +229,7 @@ void clConvolutionYEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; - clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -245,7 +246,7 @@ void clSquareSampleEx( ocl_args_d_t &ocl = getOcl(); cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; - clSetKernelArgEx(kernel, &result, &image, &xstep, &ystep); + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &image, &xstep, &ystep); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -301,12 +302,14 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ocl_args_d_t &ocl = getOcl(); ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size); - clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); - clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); - clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + const int size = xsize * ysize; + + clBlurEx(rgb.r, xsize, ysize, 
kSigma, 0.0, rgb_blurred.r); + clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; - clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); + clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); size_t globalWorkSize[1] = { xsize * ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -340,6 +343,7 @@ void clMaskHighIntensityChangeEx( cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; clSetKernelArgEx(kernel, &xyb0.r, &xyb0.g, &xyb0.b, + &xsize, &ysize, &xyb1.r, &xyb1.g, &xyb1.b, &c0.r, &c0.g, &c0.b, &c1.r, &c1.g, &c1.b); @@ -401,14 +405,17 @@ void clBlockDiffMapEx( { ocl_args_d_t &ocl = getOcl(); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; clSetKernelArgEx(kernel, &block_diff_dc, &block_diff_ac, + &res_xsize, &res_ysize, &rgb.r, &rgb.g, &rgb.b, &rgb2.r, &rgb2.g, &rgb2.b, &xsize, &ysize, &step); - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; size_t globalWorkSize[2] = { res_xsize, res_ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -463,6 +470,7 @@ void clDiffPrecomputeEx( cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, + &xsize, &ysize, &xyb0.x, &xyb0.y, &xyb0.b, &xyb1.x, &xyb1.y, &xyb1.b); @@ -476,9 +484,10 @@ void clDiffPrecomputeEx( void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) { ocl_args_d_t &ocl = getOcl(); + float fw = w; cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; - clSetKernelArgEx(kernel, &img, &w); + clSetKernelArgEx(kernel, 
&img, &size, &fw); size_t globalWorkSize[1] = { size }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -502,7 +511,7 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL); cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5]; - clSetKernelArgEx(kernel, &img, &img_org); + clSetKernelArgEx(kernel, &img, &xsize, &ysize, &img_org); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -523,7 +532,7 @@ void clMinSquareValEx( cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize); cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; - clSetKernelArgEx(kernel, &result, &img, &square_size, &offset); + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &img, &square_size, &offset); size_t globalWorkSize[2] = { xsize, ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index c4ddc6c7..80199dea 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -5,6 +5,10 @@ #ifdef __USE_CUDA__ #define cuFinish cuStreamSynchronize +#define BLOCK_SIZE_X 16 +#define BLOCK_SIZE_Y 12 +#define BLOCK_COUNT_X(size) ((size + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X) +#define BLOCK_COUNT_Y(size) ((size + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y) void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { @@ -118,6 +122,7 @@ void cuComputeBlockZeroingOrder( CUfunction kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], &mem_orig_image, &mem_mask_scale, + &blockf_width, &blockf_height, &image_width, &image_height, &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], 
&mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], @@ -128,8 +133,8 @@ void cuComputeBlockZeroingOrder( &mem_output_order_batch }; CUresult err = cuLaunchKernel(kernel, - blockf_width, blockf_height, 1, - 1, 1, 1, + BLOCK_COUNT_X(blockf_width), BLOCK_COUNT_Y(blockf_height), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -216,11 +221,11 @@ void cuConvolutionXEx( ocu_args_d_t &ocl = getOcu(); CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; - const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -237,11 +242,11 @@ void cuConvolutionYEx( ocu_args_d_t &ocl = getOcu(); CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; - const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -257,11 +262,11 @@ void cuSquareSampleEx( ocu_args_d_t &ocl = getOcu(); CUfunction kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; - const void *args[] = { &result, &image, &xstep, &ystep }; + const void *args[] = { &result, &xsize, &ysize, &image, &xstep, &ystep }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -316,18 +321,20 @@ void 
cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ocu_args_d_t &ocl = getOcu(); ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size); + const int size = xsize * ysize; + cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); CUfunction kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; - void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; + const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; CUresult err = cuLaunchKernel(kernel, - xsize * ysize, 1, 1, - 1, 1, 1, + (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, 0, - ocl.commandQueue, args, NULL); + ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); err = cuFinish(ocl.commandQueue); LOG_CU_RESULT(err); @@ -358,13 +365,14 @@ void cuMaskHighIntensityChangeEx( CUfunction kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, + &xsize, &ysize, &xyb1.r, &xyb1.g, &xyb1.b, &c0.r, &c0.g, &c0.b, &c1.r, &c1.g, &c1.b }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -425,18 +433,19 @@ void cuBlockDiffMapEx( { ocu_args_d_t &ocl = getOcu(); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + CUfunction kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; const void *args[] = { &block_diff_dc, &block_diff_ac, + &res_xsize, &res_ysize, &rgb.r, &rgb.g, &rgb.b, &rgb2.r, &rgb2.g, &rgb2.b, &xsize, &ysize, &step }; - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - CUresult err = cuLaunchKernel(kernel, - 
res_xsize, res_ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -494,12 +503,13 @@ void cuDiffPrecomputeEx( CUfunction kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; const void *args[] = { &mask.x, &mask.y, &mask.b, + &xsize, &ysize, &xyb0.x, &xyb0.y, &xyb0.b, &xyb1.x, &xyb1.y, &xyb1.b }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -510,13 +520,14 @@ void cuDiffPrecomputeEx( void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) { ocu_args_d_t &ocl = getOcu(); + float fw = w; CUfunction kernel = ocl.kernel[KERNEL_SCALEIMAGE]; - const void *args[] = { &img, &w }; + const void *args[] = { &img, &size, &fw }; CUresult err = cuLaunchKernel(kernel, - size, 1, 1, - 1, 1, 1, + (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -539,11 +550,11 @@ void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize cuMemcpyDtoD(img_org, img, len); CUfunction kernel = ocl.kernel[KERNEL_AVERAGE5X5]; - const void *args[] = { &img, &img_org }; + const void *args[] = { &img, &xsize, &ysize, &img_org }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -563,11 +574,11 @@ void cuMinSquareValEx( cu_mem result = ocl.allocMem(sizeof(float) * xsize * ysize); CUfunction kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; - const void *args[] = { &result, &img, &square_size, &offset }; + const void *args[] = { &result, &xsize, &ysize, &img, &square_size, &offset }; CUresult err = 
cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); diff --git a/compile.bat b/compile.bat index 156ee639..4d462695 100644 --- a/compile.bat +++ b/compile.bat @@ -4,4 +4,4 @@ call vcvars64.bat @echo %1 --machine 64 or 32 @echo %2 -G -nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1 clguetzli\clguetzli.cu \ No newline at end of file +nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1 clguetzli\clguetzli.cu \ No newline at end of file diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 8bbeebc2..f0e3fafe 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -156,7 +156,8 @@ $(CUDA_PATH)\lib\Win32 - compile.bat + + CUDA CU @@ -396,24 +397,27 @@ Document + true Document CUDA Code Builder - $(ProjectDir)compile.bat 64 -G - $(ProjectDir)compile.bat 32 -G + $(ProjectDir)compile.bat 64 + $(ProjectDir)compile.bat 32 false - cu.ptx + clguetzli\clguetzli.cu.ptx64 $(ProjectDir)compile.bat 64 -G CUDA Code Builder cu.ptx false - true + false $(ProjectDir)compile.bat 32 -G CUDA Code Builder cu.ptx CUDA Code Builder cu.ptx + false + false From d13a9bad949cf1a3e0c80fe7745eecf04c269a0f Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 5 Jun 2017 19:20:46 +0800 Subject: [PATCH 136/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E5=91=BD=E4=BB=A4?= =?UTF-8?q?=E8=A1=8C=E6=8F=90=E7=A4=BA=EF=BC=8CMax=20Thread=20Per=20MP?= =?UTF-8?q?=E5=92=8CSP=E6=98=AF=E4=B8=8D=E4=B8=80=E6=A0=B7=E7=9A=84?= =?UTF-8?q?=E6=A6=82=E5=BF=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/ocu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 8923bcd4..137c28bd 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -31,7 +31,7 @@ 
ocu_args_d_t& getOcu(void) cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); - LogError("CUDA Adapter:%s Ver%d.%d MP %d Core %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); + LogError("CUDA Adapter:%s Ver%d.%d MP %d MaxThread Per MP %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); /* char* source = nullptr; size_t src_size = 0; From f9ba50ebfa9223172539fcdf3046e44b5dc6a564 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 5 Jun 2017 20:17:03 +0800 Subject: [PATCH 137/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?= =?UTF-8?q?=E8=AF=95=E8=AF=95=E6=80=A7=E8=83=BD=E6=83=85=E5=86=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 5 +++-- clguetzli/clguetzli.cpp | 1 + clguetzli/cuguetzli.cpp | 19 ++++++++++++------- guetzli.vcxproj | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 8df441e9..6ddd429c 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -613,6 +613,7 @@ __kernel void clMinSquareValEx(__global float* result, const int xsize, const in __kernel void clDoMaskEx( __global float *mask_x, __global float *mask_y, __global float *mask_b, + const int xsize, const int ysize, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, __global const double *lut_x, __global const double *lut_y, __global const double *lut_b, __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b) @@ -620,8 +621,8 @@ __kernel void clDoMaskEx( const int x = get_global_id(0); const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); +// const int xsize = 
get_global_size(0); +// const int ysize = get_global_size(1); const double w00 = 232.206464018; const double w11 = 22.9455222245; diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 11cb16ed..a81b1189 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -646,6 +646,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz cl_kernel kernel = ocl.kernel[KERNEL_DOMASK]; clSetKernelArgEx(kernel, &mask.r, &mask.g, &mask.b, + &xsize, &ysize, &mask_dc.r, &mask_dc.g, &mask_dc.b, &xyb.x, &xyb.y, &xyb.b, &xyb_dc.x, &xyb_dc.y, &xyb_dc.b); diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 80199dea..8df19d01 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -6,7 +6,7 @@ #define cuFinish cuStreamSynchronize #define BLOCK_SIZE_X 16 -#define BLOCK_SIZE_Y 12 +#define BLOCK_SIZE_Y 16 #define BLOCK_COUNT_X(size) ((size + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X) #define BLOCK_COUNT_Y(size) ((size + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y) @@ -331,8 +331,10 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; CUresult err = cuLaunchKernel(kernel, - (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, - BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, +// (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, +// BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + (size + 511) / 512, 1, 1, + 512, 1, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -526,8 +528,10 @@ void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) const void *args[] = { &img, &size, &fw }; CUresult err = cuLaunchKernel(kernel, - (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, - BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, +// (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + (size + 511) / 512, 1, 1, +// 
BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + 512, 1, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -690,13 +694,14 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz CUfunction kernel = ocl.kernel[KERNEL_DOMASK]; const void *args[] = { &mask.r, &mask.g, &mask.b, + &xsize, &ysize, &mask_dc.r, &mask_dc.g, &mask_dc.b, &xyb.x, &xyb.y, &xyb.b, &xyb_dc.x, &xyb_dc.y, &xyb_dc.b }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); diff --git a/guetzli.vcxproj b/guetzli.vcxproj index f0e3fafe..e6070b25 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -397,7 +397,7 @@ Document - true + false Document From cce5bc3fd303a416b5fdbf7d5851d7895f5233e7 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 09:23:31 +0800 Subject: [PATCH 138/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A364=E3=80=8132?= =?UTF-8?q?=E4=BD=8D=E5=88=A4=E6=96=AD=E7=9A=84=E5=AE=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/ocu.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 137c28bd..3a1b695f 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -64,11 +64,10 @@ ocu_args_d_t& getOcu(void) char* ptx = nullptr; size_t src_size = 0; -#ifdef _WIN64 +if (sizeof(void*) == 8) ReadSourceFromFile("clguetzli/clguetzli.cu.ptx64", &ptx, &src_size); -#else +else ReadSourceFromFile("clguetzli/clguetzli.cu.ptx32", &ptx, &src_size); -#endif CUmodule mod; CUjit_option jit_options[2]; From 3237a5006e89943a17c57579fdfb04d690aa925b Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 10:10:01 +0800 Subject: [PATCH 139/189] =?UTF-8?q?=E4=BC=98=E5=8C=96=20cuEdgeDetectorMapE?= =?UTF-8?q?x=20cuEdgeDetectorLowFreqEx=20cuRemoveBorderEx=20cuAddBorderEx?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 44 +++++++++-------------------------------- clguetzli/clguetzli.cpp | 23 ++++++++++++--------- clguetzli/cuguetzli.cpp | 38 ++++++++++++++++++++--------------- 3 files changed, 45 insertions(+), 60 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 6ddd429c..c6648638 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -118,9 +118,6 @@ __kernel void clConvolutionXEx( if (x % step != 0) return; -// const int xsize = get_global_size(0); -// const int ysize = get_global_size(1); - float weight_no_border = 0; for (int j = 0; j <= 2 * offset; j++) { @@ -162,9 +159,6 @@ __kernel void clConvolutionYEx( if (x % step != 0) return; if (y % step != 0) return; -// const int xsize = get_global_size(0); -// const int ysize = get_global_size(1); - float weight_no_border = 0; for (int j = 0; j <= 2 * offset; j++) { @@ -207,9 +201,6 @@ __kernel void clSquareSampleEx( if (x_sample == x && y_sample == y) return; -// const int xsize = get_global_size(0); -// const int ysize = get_global_size(1); - result[y * xsize + x] = image[y_sample * xsize + x_sample]; } @@ -255,8 +246,6 @@ __kernel void clMaskHighIntensityChangeEx( const int x = get_global_id(0); const int y = get_global_id(1); if (x >= xsize || y >= ysize) return; -// const int xsize = get_global_size(0); - //const int ysize = get_global_size(1); size_t ix = y * xsize + x; const double ave[3] = { @@ -305,6 +294,7 @@ __kernel void clMaskHighIntensityChangeEx( __kernel void clEdgeDetectorMapEx( __global float *result, + const int res_xsize, const int res_ysize, __global const float *r, __global const float *g, __global const float* b, __global const float *r2, __global const float* g2, __global const float *b2, int xsize, int ysize, int step) @@ -312,8 +302,7 @@ __kernel void clEdgeDetectorMapEx( const int res_x = get_global_id(0); const int res_y = get_global_id(1); - const int res_xsize = 
get_global_size(0); - const int res_ysize = get_global_size(1); + if (res_x >= res_xsize || res_y >= res_ysize) return; int pos_x = res_x * step; int pos_y = res_y * step; @@ -347,9 +336,6 @@ __kernel void clBlockDiffMapEx( const int res_x = get_global_id(0); const int res_y = get_global_id(1); -// const int res_xsize = get_global_size(0); -// const int res_ysize = get_global_size(1); - if (res_x >= res_xsize || res_y >= res_ysize) return; int pos_x = res_x * step; @@ -400,6 +386,7 @@ __kernel void clBlockDiffMapEx( __kernel void clEdgeDetectorLowFreqEx( __global float *block_diff_ac, + const int res_xsize, const int res_ysize, __global const float *r, __global const float *g, __global const float* b, __global const float *r2, __global const float* g2, __global const float *b2, int xsize, int ysize, int step_) @@ -407,12 +394,11 @@ __kernel void clEdgeDetectorLowFreqEx( const int res_x = get_global_id(0); const int res_y = get_global_id(1); + if (res_x >= res_xsize || res_y >= res_ysize) return; + const int step = 8; if (res_x < step / step_) return; - const int res_xsize = get_global_size(0); - const int res_ysize = get_global_size(1); - int x = (res_x - (step / step_)) * step_; int y = res_y * step_; @@ -472,8 +458,6 @@ __kernel void clDiffPrecomputeEx( const int x = get_global_id(0); const int y = get_global_id(1); if (x >= xsize || y >= ysize) return; -// const int xsize = get_global_size(0); - //const int ysize = get_global_size(1); double valsh0[3] = { 0.0 }; double valsv0[3] = { 0.0 }; @@ -546,9 +530,7 @@ __kernel void clAverage5x5Ex(__global float *img, const int xsize, const int ysi const int x = get_global_id(0); const int y = get_global_id(1); if (x >= xsize || y >= ysize) return; -// const int xsize = get_global_size(0); -// const int ysize = get_global_size(1); - + const int row0 = y * xsize; if (x - 1 >= 0) { img[row0 + x] += img_org[row0 + x - 1]; @@ -588,8 +570,6 @@ __kernel void clMinSquareValEx(__global float* result, const int xsize, const in 
const int y = get_global_id(1); if (x >= xsize || y >= ysize) return; -// const int width = get_global_size(0); -// const int height = get_global_size(1); int minH = offset > y ? 0 : y - offset; int maxH = min(y + square_size - offset, ysize); @@ -621,9 +601,6 @@ __kernel void clDoMaskEx( const int x = get_global_id(0); const int y = get_global_id(1); -// const int xsize = get_global_size(0); -// const int ysize = get_global_size(1); - const double w00 = 232.206464018; const double w11 = 22.9455222245; const double w22 = 503.962310606; @@ -710,23 +687,20 @@ __kernel void clUpsampleSquareRootEx(__global float *diffmap_out, __global const } } -__kernel void clRemoveBorderEx(__global float *out, __global const float *in, int in_xsize, int s, int s2) +__kernel void clRemoveBorderEx(__global float *out, const int xsize, const int ysize, __global const float *in, int s, int s2) { const int x = get_global_id(0); const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); + if (x >= xsize || y >= ysize) return; out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; } -__kernel void clAddBorderEx(__global float *out, int s, int s2, __global const float *in) +__kernel void clAddBorderEx(__global float *out, const int xsize, const int ysize, int s, int s2, __global const float *in) { const int x = get_global_id(0); const int y = get_global_id(1); - const int xsize = get_global_size(0); - const int ysize = get_global_size(1); if (x >= xsize - s || y >= ysize - s) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index a81b1189..2bee09be 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -378,15 +378,16 @@ void clEdgeDetectorMapEx( clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); } + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; clSetKernelArgEx(kernel, 
&result, + &res_xsize, &res_ysize, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, &xsize, &ysize, &step); - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - size_t globalWorkSize[2] = { res_xsize, res_ysize}; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); LOG_CL_RESULT(err); @@ -442,15 +443,16 @@ void clEdgeDetectorLowFreqEx( clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); } + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; clSetKernelArgEx(kernel, &block_diff_ac, + &res_xsize, &res_ysize, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, &xsize, &ysize, &step); - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - size_t globalWorkSize[2] = { res_xsize, res_ysize }; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); LOG_CL_RESULT(err); @@ -753,10 +755,13 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz cl_int cls = 8 - step; cl_int cls2 = (8 - step) / 2; + int out_xsize = xsize - cls; + int out_ysize = ysize - cls; + cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER]; - clSetKernelArgEx(kernel, &out, &in, &xsize, &cls, &cls2); + clSetKernelArgEx(kernel, &out, &out_xsize, &out_ysize, &in, &cls, &cls2); - size_t globalWorkSize[2] = { xsize - cls, ysize - cls}; + size_t globalWorkSize[2] = { out_xsize, out_ysize}; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); LOG_CL_RESULT(err); err = clFinish(ocl.commandQueue); @@ -770,7 +775,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) cl_int 
cls = 8 - step; cl_int cls2 = (8 - step) / 2; cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER]; - clSetKernelArgEx(kernel, &out, &cls, &cls2, &in); + clSetKernelArgEx(kernel, &out, &xsize, &ysize, &cls, &cls2, &in); size_t globalWorkSize[2] = { xsize, ysize}; cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 8df19d01..b73967d9 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -405,18 +405,19 @@ void cuEdgeDetectorMapEx( cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); } + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; const void *args[] = { &result, + &res_xsize, &res_ysize, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, &xsize, &ysize, &step }; - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - CUresult err = cuLaunchKernel(kernel, - res_xsize, res_ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -474,18 +475,20 @@ void cuEdgeDetectorLowFreqEx( cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); } + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; const void *args[] = { &block_diff_ac, + &res_xsize, &res_ysize, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, &xsize, &ysize, &step }; - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - + CUresult err = cuLaunchKernel(kernel, - res_xsize, res_ysize, 1, - 1, 1, 1, + 
BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -809,12 +812,15 @@ void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const siz int cls = 8 - step; int cls2 = (8 - step) / 2; + int out_xsize = xsize - cls; + int out_ysize = ysize - cls; + CUfunction kernel = ocl.kernel[KERNEL_REMOVEBORDER]; - const void *args[] = { &out, &in, &xsize, &cls, &cls2 }; + const void *args[] = { &out, &out_xsize, &out_ysize, &in, &cls, &cls2 }; CUresult err = cuLaunchKernel(kernel, - xsize - cls, ysize - cls, 1, - 1, 1, 1, + BLOCK_COUNT_X(out_xsize), BLOCK_COUNT_Y(out_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); @@ -829,11 +835,11 @@ void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in) int cls = 8 - step; int cls2 = (8 - step) / 2; CUfunction kernel = ocl.kernel[KERNEL_ADDBORDER]; - const void *args[] = { &out, &cls, &cls2, &in }; + const void *args[] = { &out, &xsize, &ysize, &cls, &cls2, &in }; CUresult err = cuLaunchKernel(kernel, - xsize, ysize, 1, - 1, 1, 1, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, ocl.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); From 9f8597d749176e783a6a722914c471a8707bc1ea Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 10:28:16 +0800 Subject: [PATCH 140/189] =?UTF-8?q?=E6=81=A2=E5=A4=8Dfactor=3D2=E7=9A=84?= =?UTF-8?q?=E6=94=AF=E6=8C=81=EF=BC=8C=E6=80=A7=E8=83=BD=E5=B7=AE=E5=88=AB?= =?UTF-8?q?=E4=B8=8D=E5=A4=A7=EF=BC=8C=E4=BD=86=E6=98=AF=E7=BC=96=E8=AF=91?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E5=8F=98=E9=95=BF=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index c6648638..9236a52b 
100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -3152,16 +3152,16 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], candidate_channel[c] = &candidate_block[c * 8 * 8]; } -// uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image for (int c = 0; c < 3; c++) { -// if (mayout_channel[c].factor == 1) { - // if (factor == 1) { + if (mayout_channel[c].factor == 1) { + if (factor == 1) { const coeff_t *coeff_block = candidate_channel[c]; CoeffToYUV8x8(coeff_block, &yuv8x8[c]); - /* } + } else { for (int iy = 0; iy < factor; ++iy) { for (int ix = 0; ix < factor; ++ix) { @@ -3183,8 +3183,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], } } } -*/ - /* } + } else { if (factor == 1) { int block_xx = block_x / mayout_channel[c].factor; @@ -3211,10 +3210,9 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], image_height); } } -*/ } - // if (factor == 1) + if (factor == 1) { float rgb0_c[3][kDCTBlockSize]; int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0); @@ -3227,7 +3225,6 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); } -/* else { int inside_x = block_x * 16 + 16 > image_width ? 
image_width - block_x * 16 : 16; @@ -3259,6 +3256,5 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], } return max_err; } -*/ } From 61fde3c0b89f204cf7f0a7bcb0b3869edc414f76 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 13:11:16 +0800 Subject: [PATCH 141/189] =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=BC=96=E8=AF=91?= =?UTF-8?q?=E5=92=8CTest=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- compile.bat | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compile.bat b/compile.bat index 4d462695..c7cd2cc7 100644 --- a/compile.bat +++ b/compile.bat @@ -4,4 +4,9 @@ call vcvars64.bat @echo %1 --machine 64 or 32 @echo %2 -G -nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1 clguetzli\clguetzli.cu \ No newline at end of file +set machine_num=%1% +set debug_opt=%2% + +if "%machine_num%" == "" set machine_num=64 + +nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num% clguetzli\clguetzli.cu \ No newline at end of file From 8fe84545a633c06a2cebbf952021646aad1225f7 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 15:07:09 +0800 Subject: [PATCH 142/189] =?UTF-8?q?=E5=87=8F=E5=B0=91kernel=E4=B8=AD?= =?UTF-8?q?=E4=B8=80=E4=BA=9B=E5=86=97=E4=BD=99=E7=9A=84=E6=95=B0=E6=8D=AE?= =?UTF-8?q?copy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 181 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 166 insertions(+), 15 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 9236a52b..2509ffa2 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -47,15 +47,33 @@ __device__ void Butteraugli8x8CornerEdgeDetectorDiff( __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], 
IntFloatPairList *input_order); -__device__ double CompareBlockFactor(const channel_info mayout_channel[3], +__device__ double Factor2(const channel_info mayout_channel[3], const coeff_t* candidate_block, const int block_x, const int block_y, __global const float *orig_image_batch, __global const float *mask_scale, const int image_width, - const int image_height, - const int factor); + const int image_height); + +__device__ double CompareBlockFactor1(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height); + +__device__ double CompareBlockFactor(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height, + const int factor); __device__ void floatcopy(float *dst, const float *src, int size); __device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size); @@ -782,40 +800,37 @@ __kernel void clComputeBlockZeroingOrderEx( IntFloatPairList output_order = { 0, output_order_data }; int count = MakeInputOrderEx(mayout_block, orig_block, &input_order); - - coeff_t processed_block[kComputeBlockSize]; - coeffcopy(processed_block, mayout_block, kComputeBlockSize); - + while (input_order.size > 0) { float best_err = 1e17f; int best_i = 0; for (int i = 0; i < min(3, input_order.size); i++) { - coeff_t candidate_block[kComputeBlockSize]; - coeffcopy(candidate_block, processed_block, kComputeBlockSize); - const int idx = input_order.pData[i].idx; - candidate_block[idx] = 0; + coeff_t old_coeff = mayout_block[idx]; + mayout_block[idx] = 0; + float max_err = CompareBlockFactor(mayout_channel, - candidate_block, + mayout_block, block_x, block_y, orig_image_batch, mask_scale, - image_width, - image_height, + 
image_width, + image_height, factor); if (max_err < best_err) { best_err = max_err; best_i = i; } + mayout_block[idx] = old_coeff; } int idx = input_order.pData[best_i].idx; - processed_block[idx] = 0; + mayout_block[idx] = 0; list_erase(&input_order, best_i); list_push_back(&output_order, idx, best_err); @@ -3137,6 +3152,142 @@ __device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], return block_ix; } +__device__ double CompareBlockFactor1(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height) +{ + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + const coeff_t *coeff_block = candidate_channel[c]; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + } + else { + int block_xx = block_x / mayout_channel[c].factor; + int block_yy = block_y / mayout_channel[c].factor; + int ix = block_x % mayout_channel[c].factor;; + int iy = block_y % mayout_channel[c].factor; + + int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; + + CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_xx, block_yy, + image_width, + image_height); + + // copy YUV16x16 corner to YUV8x8 + Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); + } + } + + { + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, 1, 0, 0); + + int inside_x = block_x * 8 + 8 > image_width ? 
image_width - block_x * 8 : 8; + int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8; + float rgb1_c[3][kDCTBlockSize]; + + YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y); + + return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + } +} + +__device__ double Factor2(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height) +{ + const int factor = 2; + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + ///if (ix != off_x || iy != off_y) continue; + if (block_xx >= mayout_channel[c].block_width || + block_yy >= mayout_channel[c].block_height) + { + continue; + } + int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; + CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]); + + // copy YUV8x8 to YUV1616 corner + Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); + } + } + } + else { + const coeff_t * coeff_block = candidate_channel[c]; + CoeffToYUV16x16(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_x, block_y, + image_width, + image_height); + } + } + + int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16; + int inside_y = block_y * 16 + 16 > image_height ? 
image_height - block_y * 16 : 16; + + float rgb16x16[3][16 * 16]; + YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y); + + double max_err = 0; + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + if (block_xx * 8 >= image_width || + block_yy * 8 >= image_height) + { + continue; + } + + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy); + + float rgb1_c[3][kDCTBlockSize]; + Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy); + double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + max_err = max(max_err, err); + } + } + return max_err; +} + __device__ double CompareBlockFactor(const channel_info mayout_channel[3], const coeff_t* candidate_block, const int block_x, From c90b88a86791038ff9c15eef4274e3beeddb6e67 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 15:08:56 +0800 Subject: [PATCH 143/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli --- .gitignore | 1 + clguetzli/clguetzli.cl | 9 ++- clguetzli/clguetzli.cl.h | 2 +- clguetzli/clguetzli.cu | 2 +- clguetzli/ocu.cpp | 2 +- compile.sh | 12 +++ guetzli.make | 170 +++++++-------------------------------- guetzli_static.make | 157 ++++++------------------------------ premake5.lua | 3 +- 9 files changed, 75 insertions(+), 283 deletions(-) create mode 100644 compile.sh diff --git a/.gitignore b/.gitignore index 3d270281..0cc93f06 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ ipch/ *.VC.db *.VC.VC.opendb guetzli.vcxproj.user +clguetzli/clguetzli.cu.ptx* diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 2509ffa2..7bacec40 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1998,6 +1998,11 @@ __constant static float bias[192] = { 0.0 }; +__device__ 
coeff_t _abs(coeff_t val) +{ + return val >= 0 ? val : -val; +} + // chrisk todo // return the count of Non-zero item __device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size) @@ -2007,7 +2012,7 @@ __device__ int MakeInputOrder(__global const coeff_t *block, __global const coef for (int k = 1; k < block_size; ++k) { int idx = c * block_size + k; if (block[idx] != 0) { - float score = abs(orig_block[idx]) * csf[idx] + bias[idx]; + float score = _abs(orig_block[idx]) * csf[idx] + bias[idx]; size = list_push_back(input_order, idx, score); } } @@ -3118,7 +3123,7 @@ __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_b for (int k = 1; k < block_size; ++k) { int idx = c * block_size + k; if (block[idx] != 0) { - float score = abs(orig_block[idx]) * csf[idx] + bias[idx]; + float score = _abs(orig_block[idx]) * csf[idx] + bias[idx]; size = list_push_back(input_order, idx, score); } } diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 2a8ed044..288db67c 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -3,7 +3,7 @@ #ifdef __cplusplus #ifndef __CUDACC__ -#include "CL\cl.h" +#include "CL/cl.h" #include "cuda.h" #endif #endif diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu index dbca9906..351bed47 100644 --- a/clguetzli/clguetzli.cu +++ b/clguetzli/clguetzli.cu @@ -1,4 +1,4 @@ -#include "clguetzli\clguetzli.cl" +#include "clguetzli/clguetzli.cl" /* __device__ int get_global_id(int dim) { diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 3a1b695f..48f2768a 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -199,7 +199,7 @@ const char* TranslateCUDAError(CUresult errorCode) case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; case CUDA_ERROR_INVALID_PTX: return "CUDA_ERROR_INVALID_PTX"; case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return 
"CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; - case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; + // case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE"; case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND"; case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; diff --git a/compile.sh b/compile.sh new file mode 100644 index 00000000..9aa628bc --- /dev/null +++ b/compile.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +#Compile .cu file +echo $1 --machine 64 or 32 +echo $2 -G + +nvcc -I"./" -I"/usr/local/cuda/include" -arch=compute_30 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu + +#copy to ./bin/Release +cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1 +cp clguetzli/clguetzli.cl bin/Release/clguetzli/clguetzli.cl +cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h \ No newline at end of file diff --git a/guetzli.make b/guetzli.make index 442d678b..3675ba0d 100644 --- a/guetzli.make +++ b/guetzli.make @@ -15,14 +15,14 @@ ifeq ($(config),release) TARGETDIR = bin/Release TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli - DEFINES += + DEFINES += -D__USE_CUDA__ INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL + LIBS += -lOpenCL -lcuda LDDEPS += ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) @@ -32,7 +32,7 @@ ifeq ($(config),release) endef define POSTBUILDCMDS endef -all: prebuild prelink $(TARGET) +all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) @: endif @@ -42,14 +42,14 @@ ifeq ($(config),debug) TARGETDIR = bin/Debug TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Debug/guetzli - DEFINES += + DEFINES += -D__USE_CUDA__ INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL + LIBS += -lOpenCL -lcuda LDDEPS += ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) @@ -59,7 +59,7 @@ ifeq ($(config),debug) endef define POSTBUILDCMDS endef -all: prebuild prelink $(TARGET) +all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) @: endif @@ -69,7 +69,9 @@ OBJECTS := \ $(OBJDIR)/clguetzli.cl.o \ $(OBJDIR)/clguetzli.o \ $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/cuguetzli.o \ $(OBJDIR)/ocl.o \ + $(OBJDIR)/ocu.o \ $(OBJDIR)/utils.o \ 
$(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ @@ -107,13 +109,24 @@ endif $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES) @echo Linking guetzli + $(SILENT) $(LINKCMD) + $(POSTBUILDCMDS) + +$(TARGETDIR): + @echo Creating $(TARGETDIR) ifeq (posix,$(SHELLTYPE)) $(SILENT) mkdir -p $(TARGETDIR) else $(SILENT) mkdir $(subst /,\\,$(TARGETDIR)) endif - $(SILENT) $(LINKCMD) - $(POSTBUILDCMDS) + +$(OBJDIR): + @echo Creating $(OBJDIR) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif clean: @echo Cleaning guetzli @@ -140,219 +153,90 @@ endif $(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp + @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/ocl.o: clguetzli/ocl.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) 
-else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocu.o: clguetzli/ocu.cpp + @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/utils.o: clguetzli/utils.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/dct_double.o: guetzli/dct_double.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/debug_print.o: guetzli/debug_print.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/fdct.o: guetzli/fdct.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p 
$(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/guetzli.o: guetzli/guetzli.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/idct.o: guetzli/idct.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF 
"$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/output_image.o: guetzli/output_image.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/processor.o: guetzli/processor.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quality.o: guetzli/quality.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quantize.o: guetzli/quantize.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/score.o: guetzli/score.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc @echo $(notdir $<) -ifeq 
(posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" -include $(OBJECTS:%.o=%.d) diff --git a/guetzli_static.make b/guetzli_static.make index f271c46f..68808523 100644 --- a/guetzli_static.make +++ b/guetzli_static.make @@ -32,7 +32,7 @@ ifeq ($(config),release) endef define POSTBUILDCMDS endef -all: prebuild prelink $(TARGET) +all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) @: endif @@ -59,7 +59,7 @@ ifeq ($(config),debug) endef define POSTBUILDCMDS endef -all: prebuild prelink $(TARGET) +all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) @: endif @@ -69,7 +69,9 @@ OBJECTS := \ $(OBJDIR)/clguetzli.cl.o \ $(OBJDIR)/clguetzli.o \ $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/cuguetzli.o \ $(OBJDIR)/ocl.o \ + $(OBJDIR)/ocu.o \ $(OBJDIR)/utils.o \ $(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ @@ -106,13 +108,24 @@ endif $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES) @echo Linking guetzli_static + $(SILENT) $(LINKCMD) + $(POSTBUILDCMDS) + +$(TARGETDIR): + @echo Creating $(TARGETDIR) ifeq (posix,$(SHELLTYPE)) $(SILENT) mkdir -p $(TARGETDIR) else $(SILENT) mkdir $(subst /,\\,$(TARGETDIR)) endif - $(SILENT) $(LINKCMD) - $(POSTBUILDCMDS) + +$(OBJDIR): + @echo Creating $(OBJDIR) +ifeq (posix,$(SHELLTYPE)) + $(SILENT) mkdir -p $(OBJDIR) +else + $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) +endif clean: @echo Cleaning guetzli_static @@ -139,211 +152,87 @@ endif $(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst 
/,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp + @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/ocl.o: clguetzli/ocl.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocu.o: clguetzli/ocu.cpp + @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/utils.o: clguetzli/utils.cpp @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/dct_double.o: guetzli/dct_double.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c 
"$<" $(OBJDIR)/debug_print.o: guetzli/debug_print.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/fdct.o: guetzli/fdct.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/idct.o: guetzli/idct.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - 
$(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/output_image.o: guetzli/output_image.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/processor.o: guetzli/processor.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quality.o: guetzli/quality.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) 
-o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/quantize.o: guetzli/quantize.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/score.o: guetzli/score.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc @echo $(notdir $<) -ifeq (posix,$(SHELLTYPE)) - $(SILENT) mkdir -p $(OBJDIR) -else - $(SILENT) mkdir $(subst /,\\,$(OBJDIR)) -endif $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" -include $(OBJECTS:%.o=%.d) diff --git a/premake5.lua b/premake5.lua index 18f5ecee..f6723df8 100644 --- a/premake5.lua +++ b/premake5.lua @@ -42,9 +42,10 @@ workspace "guetzli" project "guetzli" kind "ConsoleApp" filter "action:gmake" + defines { "__USE_CUDA__" } linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } - links { "OpenCL" } + links { "OpenCL", "cuda" } filter "action:vs*" links { "shlwapi" } filter {} From 1e4b4f4325e1c4be44c9b7501cb1bcd05c6c8322 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 18:25:26 +0800 Subject: [PATCH 144/189] =?UTF-8?q?=E4=BC=98=E5=8C=96clDiffmapOpsinDynamic?= =?UTF-8?q?sImageEx?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 69 ++++++++++-------- clguetzli/clguetzli.h | 8 +++ clguetzli/cuguetzli.cpp | 70 +++++++++++-------- clguetzli/cuguetzli.h | 8 +++ guetzli/butteraugli_comparator.cc | 60 +++++++++++++++- guetzli/guetzli.cc | 12 ++-- .../butteraugli/butteraugli/butteraugli.h | 1 + 7 files changed, 166 insertions(+), 62 
deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 2bee09be..0d52eb33 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -30,11 +30,7 @@ void clDiffmapOpsinDynamicsImage( const size_t xsize, const size_t ysize, const size_t step) { - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - size_t channel_size = xsize * ysize * sizeof(float); - size_t channel_step_size = res_xsize * res_ysize * sizeof(float); ocl_args_d_t &ocl = getOcl(); ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); @@ -42,26 +38,7 @@ void clDiffmapOpsinDynamicsImage( cl_mem mem_result = ocl.allocMem(channel_size, result); - cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); - cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); - cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); - - clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - - clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); - clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); - clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); - { - ocl_channels mask = ocl.allocMemChannels(channel_size); - ocl_channels mask_dc = ocl.allocMemChannels(channel_size); - clMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); - clCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); - - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); - } - - clCalculateDiffmapEx(mem_result, xsize, ysize, step); + clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step); clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL); cl_int err = clFinish(ocl.commandQueue); @@ -69,10 +46,6 @@ void clDiffmapOpsinDynamicsImage( ocl.releaseMemChannels(xyb1); ocl.releaseMemChannels(xyb0); - 
clReleaseMemObject(edge_detector_map); - clReleaseMemObject(block_diff_dc); - clReleaseMemObject(block_diff_ac); - clReleaseMemObject(mem_result); } @@ -182,6 +155,46 @@ void clMask( ocl.releaseMemChannels(mask_dc); } +void clDiffmapOpsinDynamicsImageEx( + cl_mem result, + ocl_channels xyb0, + ocl_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step) +{ + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + + cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); + + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + clMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + clCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } + + clCalculateDiffmapEx(result, xsize, ysize, step); + + clReleaseMemObject(edge_detector_map); + clReleaseMemObject(block_diff_dc); + clReleaseMemObject(block_diff_ac); +} void clConvolutionEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index cad4ef6e..9418b38d 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -48,6 +48,14 @@ void clMask( const float* r, const float* g, const float* b, 
const float* r2, const float* g2, const float* b2); +void clDiffmapOpsinDynamicsImageEx( + cl_mem result, + ocl_channels xyb0, + ocl_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step); + + void clConvolutionEx( cl_mem result/*out*/, const cl_mem inp, size_t xsize, size_t ysize, diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index b73967d9..97fb2d10 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -34,11 +34,7 @@ void cuDiffmapOpsinDynamicsImage( const size_t xsize, const size_t ysize, const size_t step) { - const size_t res_xsize = (xsize + step - 1) / step; - const size_t res_ysize = (ysize + step - 1) / step; - size_t channel_size = xsize * ysize * sizeof(float); - size_t channel_step_size = res_xsize * res_ysize * sizeof(float); ocu_args_d_t &ocl = getOcu(); ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); @@ -46,36 +42,13 @@ void cuDiffmapOpsinDynamicsImage( cu_mem mem_result = ocl.allocMem(channel_size, result); - cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); - cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); - cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); - - cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); - - cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); - cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); - cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); - { - ocu_channels mask = ocl.allocMemChannels(channel_size); - ocu_channels mask_dc = ocl.allocMemChannels(channel_size); - cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); - cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); - - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); - } - - cuCalculateDiffmapEx(mem_result, xsize, ysize, step); + cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, 
step); cuMemcpyDtoH(result, mem_result, channel_size); ocl.releaseMemChannels(xyb1); ocl.releaseMemChannels(xyb0); - cuMemFree(edge_detector_map); - cuMemFree(block_diff_dc); - cuMemFree(block_diff_ac); - cuMemFree(mem_result); } @@ -188,6 +161,47 @@ void cuMask( ocl.releaseMemChannels(mask_dc); } +void cuDiffmapOpsinDynamicsImageEx( + cu_mem result, + ocu_channels xyb0, + ocu_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step) +{ + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + ocu_args_d_t &ocl = getOcu(); + + cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); + cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); + cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); + + cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocu_channels mask = ocl.allocMemChannels(channel_size); + ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + cuCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } + + cuCalculateDiffmapEx(result, xsize, ysize, step); + + cuMemFree(edge_detector_map); + cuMemFree(block_diff_dc); + cuMemFree(block_diff_ac); +} + void cuConvolutionEx( cu_mem result/*out*/, const cu_mem inp, size_t xsize, size_t ysize, diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h index 81ec377b..5082ea1c 100644 --- a/clguetzli/cuguetzli.h +++ b/clguetzli/cuguetzli.h @@ -1,6 +1,7 @@ #pragma 
once #include "guetzli/processor.h" #include "clguetzli.cl.h" +#include "ocu.h" #ifdef __USE_CUDA__ @@ -34,6 +35,13 @@ void cuMask( const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2); +void cuDiffmapOpsinDynamicsImageEx( + cu_mem result, + ocu_channels xyb0, + ocu_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step); + void cuConvolutionXEx( cu_mem result/*out*/, const cu_mem inp, size_t xsize, size_t ysize, diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index 02256e95..1c6342c8 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -22,6 +22,10 @@ #include "guetzli/gamma_correct.h" #include "guetzli/score.h" +#include "clguetzli\ocu.h" +#include "clguetzli\clguetzli.h" +#include "clguetzli\cuguetzli.h" + namespace guetzli { ButteraugliComparator::ButteraugliComparator(const int width, const int height, @@ -51,8 +55,60 @@ ButteraugliComparator::ButteraugliComparator(const int width, const int height, void ButteraugliComparator::Compare(const OutputImage& img) { std::vector > rgb(3, std::vector(width_ * height_)); img.ToLinearRGB(&rgb); - ::butteraugli::OpsinDynamicsImage(width_, height_, rgb); - comparator_.DiffmapOpsinDynamicsImage(rgb_linear_pregamma_, rgb, distmap_); + + if (MODE_OPENCL == g_mathMode) + { + const int xsize = width_; + const int ysize = height_; + distmap_.resize(xsize * ysize); + + size_t channel_size = xsize * ysize * sizeof(float); + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data()); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_linear_pregamma_[0].data(), rgb_linear_pregamma_[1].data(), rgb_linear_pregamma_[2].data()); + + cl_mem mem_result = ocl.allocMem(channel_size, distmap_.data()); + + clOpsinDynamicsImageEx(xyb1, xsize, ysize); + clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, 
comparator_.step_); + + clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL); + clFinish(ocl.commandQueue); + + clReleaseMemObject(mem_result); + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + } +#ifdef __HAVE_CUDA__ + else if (MODE_CUDA == g_mathMode) + { + const int xsize = width_; + const int ysize = height_; + + size_t channel_size = xsize * ysize * sizeof(float); + ocu_args_d_t &ocl = getOcu(); + ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data()); + ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_linear_pregamma_[0].data(), rgb_linear_pregamma_[1].data(), rgb_linear_pregamma_[2].data()); + + cu_mem mem_result = ocl.allocMem(channel_size, distmap_.data()); + + cuOpsinDynamicsImageEx(xyb1, xsize, ysize); + + cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step_); + + cuMemcpyDtoH(distmap_.data(), mem_result, channel_size); + + cuMemFree(mem_result); + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + } +#endif + else + { + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb); + comparator_.DiffmapOpsinDynamicsImage(rgb_linear_pregamma_, rgb, distmap_); + } + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); GUETZLI_LOG(stats_, " BA[100.00%%] D[%6.4f]", distance_); } diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 276cb9d6..9e1be556 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -226,9 +226,11 @@ void Usage() { " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. 
Default limit is %d MB.\n" " --opencl - Use OpenCL\n" - " --cuda - Use CUDA\n" " --checkcl - Check OpenCL result\n" +#ifdef __USE_CUDA__ + " --cuda - Use CUDA\n" " --checkcuda - Check CUDA result\n" +#endif " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } @@ -264,15 +266,17 @@ int main(int argc, char** argv) { else if (!strcmp(argv[opt_idx], "--opencl")) { g_mathMode = MODE_OPENCL; } - else if (!strcmp(argv[opt_idx], "--cuda")) { - g_mathMode = MODE_CUDA; - } else if (!strcmp(argv[opt_idx], "--checkcl")) { g_mathMode = MODE_CHECKCL; } +#ifdef __USE_CUDA__ + else if (!strcmp(argv[opt_idx], "--cuda")) { + g_mathMode = MODE_CUDA; + } else if (!strcmp(argv[opt_idx], "--checkcuda")) { g_mathMode = MODE_CHECKCUDA; } +#endif else if (!strcmp(argv[opt_idx], "--")) { opt_idx++; break; diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h index 637f50ff..16040e95 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.h +++ b/third_party/butteraugli/butteraugli/butteraugli.h @@ -72,6 +72,7 @@ class ButteraugliComparator { const std::vector& edge_detector_map, std::vector* result); +public: const size_t xsize_; const size_t ysize_; const size_t num_pixels_; From 39950066092cf7bfc940b7bb0fab0a168f843857 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 6 Jun 2017 22:02:13 +0800 Subject: [PATCH 145/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E8=B0=83=E8=AF=95=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 11 ++++++++--- compile.bat | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 7bacec40..bc0c4d48 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -3072,8 +3072,10 @@ __device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) __device__ 
double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) { -// CalcOpsinDynamicsImage(rgb0_c); - CalcOpsinDynamicsImage(rgb1_c); +// return 0; // 126ms +// CalcOpsinDynamicsImage(rgb0_c); -- calc in cpu one time + CalcOpsinDynamicsImage(rgb1_c); +// return 0; // 425ms float rgb0[3][kDCTBlockSize]; float rgb1[3][kDCTBlockSize]; @@ -3086,7 +3088,7 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], rgb0_c[0], rgb0_c[1], rgb0_c[2], rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8); - +// return 0; // 544ms // ÕâÀïΪɶҪ°Ñfloatת³Édouble²ÅÄܼÌÐø×ö¼ÆË㣿 double b0[3 * kDCTBlockSize]; // double b1[3 * kDCTBlockSize]; @@ -3102,6 +3104,7 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], double diff_xyz_edge_dc[3] = { 0.0 }; ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); +// return 0; // 735ms double diff = 0.0; double diff_edge = 0.0; @@ -3112,6 +3115,8 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], } const double kEdgeWeight = 0.05; return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); + +// 750ms } // return the count of Non-zero item diff --git a/compile.bat b/compile.bat index c7cd2cc7..3b44020d 100644 --- a/compile.bat +++ b/compile.bat @@ -9,4 +9,4 @@ set debug_opt=%2% if "%machine_num%" == "" set machine_num=64 -nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num% clguetzli\clguetzli.cu \ No newline at end of file +nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 -lineinfo -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num% clguetzli\clguetzli.cu \ No newline at end of file From 1aa86d55b97bb0a7ee5c67f4133d9891f2819420 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 7 Jun 2017 10:12:13 +0800 Subject: [PATCH 146/189] 
=?UTF-8?q?kernel=E8=BF=90=E7=AE=97=E7=94=A8float?= =?UTF-8?q?=E6=9B=BF=E4=BB=A3double=EF=BC=8C=E8=8A=82=E7=9C=81=E8=BF=90?= =?UTF-8?q?=E7=AE=97=E6=97=B6=E9=97=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 7 +++++++ clguetzli/clguetzli.cl.cpp | 6 ++++-- clguetzli/clguetzli.cl.h | 2 ++ clguetzli/clguetzli.cpp | 8 ++++++++ clguetzli/clguetzli.h | 8 ++++++++ clguetzli/cuguetzli.cpp | 8 ++++++++ clguetzli/cuguetzli.h | 8 ++++++++ 7 files changed, 45 insertions(+), 2 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index bc0c4d48..893b941f 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -2,6 +2,10 @@ #include "clguetzli/clguetzli.cl.h" +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + #define kBlockEdge 8 #define kBlockSize (kBlockEdge * kBlockEdge) #define kDCTBlockSize (kBlockEdge * kBlockEdge) @@ -3419,3 +3423,6 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], } } +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index a18cd110..6d1ae45b 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -49,9 +49,11 @@ namespace guetzli const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); const int num_blocks = block_width * block_height; - +#ifdef __USE_DOUBLE_AS_FLOAT__ + const float* lut = kSrgb8ToLinearTable; +#else const double* lut = kSrgb8ToLinearTable; - +#endif imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize); imgMaskXyzScaleBlockList.resize(num_blocks * 3); for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 288db67c..102f3ac9 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -8,6 +8,8 @@ #endif #endif 
+#define __USE_DOUBLE_AS_FLOAT__ + #ifdef __cplusplus #ifndef __CUDACC__ #define __kernel diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 0d52eb33..24c939b5 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -4,6 +4,10 @@ #include #include "cl.hpp" +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + extern MATH_MODE g_mathMode = MODE_CPU; void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) @@ -820,3 +824,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si clReleaseMemObject(blurred); } + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 9418b38d..7bebdd66 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -7,6 +7,10 @@ #include "cuguetzli.h" +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + enum MATH_MODE { MODE_CPU = 0, @@ -146,6 +150,10 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si class guetzli::OutputImage; +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + namespace guetzli { class ButteraugliComparatorEx : public ButteraugliComparator diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 97fb2d10..0918df75 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -4,6 +4,10 @@ #ifdef __USE_CUDA__ +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + #define cuFinish cuStreamSynchronize #define BLOCK_SIZE_X 16 #define BLOCK_SIZE_Y 16 @@ -885,4 +889,8 @@ void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const si cuMemFree(blurred); } +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + #endif \ No newline at end of file diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h index 5082ea1c..a75dcc46 100644 --- a/clguetzli/cuguetzli.h +++ b/clguetzli/cuguetzli.h @@ -5,6 +5,10 @@ #ifdef __USE_CUDA__ 
+#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + void cuOpsinDynamicsImage( float *r, float *g, float *b, const size_t xsize, const size_t ysize); @@ -124,4 +128,8 @@ void cuAddBorderEx(cu_mem out, const size_t xsize, const size_t ysize, const int void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + #endif \ No newline at end of file From 8ed0ce3e72ddef9b08eb3433c305fb559e2aca2e Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 7 Jun 2017 12:32:24 +0800 Subject: [PATCH 147/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E6=95=B0=E7=BB=84?= =?UTF-8?q?=E9=95=BF=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cpp | 2 +- clguetzli/cuguetzli.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 24c939b5..8f39fb46 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -659,7 +659,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); } - size_t channel_size = 512 * 3 * sizeof(double); + size_t channel_size = 512 * sizeof(double); ocl_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 0918df75..3b8c2835 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -709,7 +709,7 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); } - size_t channel_size = 512 * 3 * sizeof(double); + size_t channel_size = 512 * sizeof(double); ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); ocu_channels xyb_dc = 
ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); From 7aff1646bdb052206bb139ad01d40b62737ceba9 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 7 Jun 2017 17:08:29 +0800 Subject: [PATCH 148/189] =?UTF-8?q?=E6=88=91=E4=B9=9F=E4=B8=8D=E7=9F=A5?= =?UTF-8?q?=E9=81=93=E4=B8=BA=E4=BB=80=E4=B9=88=EF=BC=8C=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E6=8E=89=E8=BF=99=E4=B8=AA=E7=A9=BA=E8=A1=8C=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E5=B0=B1=E6=AD=A3=E7=A1=AE=E4=BA=86=20?= =?UTF-8?q?=E8=82=AF=E5=AE=9A=E6=98=AFopencl=E7=94=9F=E6=88=90=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E6=97=B6=E6=9C=89bug!?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 1 - 1 file changed, 1 deletion(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 893b941f..a947f770 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -641,7 +641,6 @@ __kernel void clDoMaskEx( mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0)); mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1)); mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2)); - } __kernel void clCombineChannelsEx( From f795ad1e99cd9f199f662a10325c6679379268a5 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 7 Jun 2017 18:26:39 +0800 Subject: [PATCH 149/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E7=BC=96=E8=AF=91?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index e6070b25..e6e2be45 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -144,7 +144,7 @@ false false true - PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) + __USE_CUDA__;PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) Console @@ -408,14 +408,14 @@ clguetzli\clguetzli.cu.ptx64 $(ProjectDir)compile.bat 64 -G CUDA Code 
Builder - cu.ptx + clguetzli\clguetzli.cu.ptx64 false false $(ProjectDir)compile.bat 32 -G CUDA Code Builder - cu.ptx + clguetzli\clguetzli.cu.ptx32 CUDA Code Builder - cu.ptx + clguetzli\clguetzli.cu.ptx32 false false From 13abc16a44abb1e45f029474e82df16df5c49d68 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 7 Jun 2017 18:49:55 +0800 Subject: [PATCH 150/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3warning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index a947f770..6ddc81c5 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -271,9 +271,9 @@ __kernel void clMaskHighIntensityChangeEx( size_t ix = y * xsize + x; const double ave[3] = { - (c0_x[ix] + c1_x[ix]) * 0.5, - (c0_y[ix] + c1_y[ix]) * 0.5, - (c0_b[ix] + c1_b[ix]) * 0.5, + (c0_x[ix] + c1_x[ix]) * 0.5f, + (c0_y[ix] + c1_y[ix]) * 0.5f, + (c0_b[ix] + c1_b[ix]) * 0.5f, }; double sqr_max_diff = -1; { @@ -2992,9 +2992,9 @@ __device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float { size_t ix = y * xsize + x; const double ave[3] = { - (c0_x[ix] + c1_x[ix]) * 0.5, - (c0_y[ix] + c1_y[ix]) * 0.5, - (c0_b[ix] + c1_b[ix]) * 0.5, + (c0_x[ix] + c1_x[ix]) * 0.5f, + (c0_y[ix] + c1_y[ix]) * 0.5f, + (c0_b[ix] + c1_b[ix]) * 0.5f, }; double sqr_max_diff = -1; { From 0c85b8fe63ce86e1935f936bcff06bf6f9b1febd Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 7 Jun 2017 18:50:10 +0800 Subject: [PATCH 151/189] =?UTF-8?q?=E6=8D=A2=E4=B8=80=E7=BB=84=E7=BC=96?= =?UTF-8?q?=E8=AF=91=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- compile.bat | 2 +- compile.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compile.bat b/compile.bat index 3b44020d..1b98c758 100644 --- a/compile.bat +++ b/compile.bat @@ -9,4 +9,4 @@ set 
debug_opt=%2% if "%machine_num%" == "" set machine_num=64 -nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 -lineinfo -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num% clguetzli\clguetzli.cu \ No newline at end of file +nvcc -Xcompiler "/wd 4819" -I"./" -use_fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num% clguetzli\clguetzli.cu \ No newline at end of file diff --git a/compile.sh b/compile.sh index 9aa628bc..40cc3db3 100644 --- a/compile.sh +++ b/compile.sh @@ -4,7 +4,7 @@ echo $1 --machine 64 or 32 echo $2 -G -nvcc -I"./" -I"/usr/local/cuda/include" -arch=compute_30 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu +nvcc -I"./" -I"/usr/local/cuda/include" -use-fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu #copy to ./bin/Release cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1 From 12cd1200b09f1911f2d0dfbbcfd9fcb9a1405220 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Wed, 7 Jun 2017 19:55:41 +0800 Subject: [PATCH 152/189] fix linux build --- compile.sh | 4 ++-- guetzli/butteraugli_comparator.cc | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/compile.sh b/compile.sh index 40cc3db3..0b13d464 100644 --- a/compile.sh +++ b/compile.sh @@ -4,9 +4,9 @@ echo $1 --machine 64 or 32 echo $2 -G -nvcc -I"./" -I"/usr/local/cuda/include" -use-fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu +nvcc -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu #copy to ./bin/Release cp clguetzli/clguetzli.cu.ptx$1 
bin/Release/clguetzli/clguetzli.cu.ptx$1 cp clguetzli/clguetzli.cl bin/Release/clguetzli/clguetzli.cl -cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h \ No newline at end of file +cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index b3353044..f0ce5eb4 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -22,9 +22,9 @@ #include "guetzli/gamma_correct.h" #include "guetzli/score.h" -#include "clguetzli\ocu.h" -#include "clguetzli\clguetzli.h" -#include "clguetzli\cuguetzli.h" +#include "clguetzli/ocu.h" +#include "clguetzli/clguetzli.h" +#include "clguetzli/cuguetzli.h" namespace guetzli { From 7c2e57d6f228a6ffeae7cff92e7be424fd487d45 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 8 Jun 2017 01:14:18 +0800 Subject: [PATCH 153/189] =?UTF-8?q?merge=20google=E7=9A=84=E6=94=B9?= =?UTF-8?q?=E5=8A=A8=E4=B9=8B=E5=90=8E=EF=BC=8C=E6=AF=8F=E6=AC=A1compuare?= =?UTF-8?q?=20StartBlockComparisons=E9=83=BD=E4=BC=9A=E9=87=8D=E6=96=B0?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E5=8E=9F=E5=A7=8B=E5=9B=BE=E7=89=87=E7=9A=84?= =?UTF-8?q?opsin=20=E4=BC=98=E5=8C=96=E6=8E=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl.cpp | 158 ++++++++++++------ clguetzli/clguetzli.h | 2 + .../butteraugli/butteraugli/butteraugli.h | 4 +- 3 files changed, 107 insertions(+), 57 deletions(-) diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index 12e0c6c7..7bd566df 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -26,6 +26,8 @@ void set_global_size(int dim, int size){ #define abs(exper) fabs((exper)) #include "clguetzli.h" #include "clguetzli.cl" +#include "cuguetzli.h" +#include "ocu.h" namespace guetzli { @@ -34,12 +36,111 @@ namespace guetzli const float target_distance, ProcessStats* stats) : ButteraugliComparator(width, height, rgb, 
target_distance, stats) { + if (MODE_CPU != g_mathMode) + { + rgb_orig_opsin.resize(3); + rgb_orig_opsin[0].resize(width * height); + rgb_orig_opsin[1].resize(width * height); + rgb_orig_opsin[2].resize(width * height); + +#ifdef __USE_DOUBLE_AS_FLOAT__ + const float* lut = kSrgb8ToLinearTable; +#else + const double* lut = kSrgb8ToLinearTable; +#endif + for (int c = 0; c < 3; ++c) { + for (int y = 0, ix = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x, ++ix) { + rgb_orig_opsin[c][ix] = lut[rgb_orig_[3 * ix + c]]; + } + } + } + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb_orig_opsin); + } + } + + void ButteraugliComparatorEx::Compare(const OutputImage& img) + { + + if (MODE_OPENCL == g_mathMode) + { + std::vector > rgb1(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb1); + + const int xsize = width_; + const int ysize = height_; + std::vector().swap(distmap_); + distmap_.resize(xsize * ysize); + + size_t channel_size = xsize * ysize * sizeof(float); + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); + + cl_mem mem_result = ocl.allocMem(channel_size); + + clOpsinDynamicsImageEx(xyb1, xsize, ysize); + clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step()); + + cl_int err = clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clReleaseMemObject(mem_result); + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode) + { + std::vector > rgb1(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb1); + + const int xsize = 
width_; + const int ysize = height_; + std::vector().swap(distmap_); + distmap_.resize(xsize * ysize); + + size_t channel_size = xsize * ysize * sizeof(float); + ocu_args_d_t &ocl = getOcu(); + ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); + ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); + + cu_mem mem_result = ocl.allocMem(channel_size); + + cuOpsinDynamicsImageEx(xyb1, xsize, ysize); + + cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step()); + + cuMemcpyDtoH(distmap_.data(), mem_result, channel_size); + + cuMemFree(mem_result); + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#endif + else + { + ButteraugliComparator::Compare(img); + } } void ButteraugliComparatorEx::StartBlockComparisons() { - ButteraugliComparator::StartBlockComparisons(); + if (MODE_CPU == g_mathMode) + { + ButteraugliComparator::StartBlockComparisons(); + return; + } + + std::vector > dummy(3); + ::butteraugli::Mask(rgb_orig_opsin, rgb_orig_opsin, width_, height_, &mask_xyz_, &dummy); const int width = width_; const int height = height_; @@ -129,57 +230,4 @@ namespace guetzli */ return err; } -} - -/* -if (MODE_OPENCL == g_mathMode) -{ -const int xsize = width_; -const int ysize = height_; -std::vector().swap(distmap_); -distmap_.resize(xsize * ysize); - -size_t channel_size = xsize * ysize * sizeof(float); -ocl_args_d_t &ocl = getOcl(); -ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data()); -ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb0[0].data(), rgb0[1].data(), rgb0[2].data()); - -cl_mem mem_result = ocl.allocMem(channel_size);// , distmap_.data()); - -clOpsinDynamicsImageEx(xyb1, xsize, ysize); -clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, 
comparator_.step_); - -clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL); -clFinish(ocl.commandQueue); - -clReleaseMemObject(mem_result); -ocl.releaseMemChannels(xyb0); -ocl.releaseMemChannels(xyb1); -} -#ifdef __USE_CUDA__ -else if (MODE_CUDA == g_mathMode) -{ -const int xsize = width_; -const int ysize = height_; -std::vector().swap(distmap_); -distmap_.resize(xsize * ysize); - -size_t channel_size = xsize * ysize * sizeof(float); -ocu_args_d_t &ocl = getOcu(); -ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data()); -ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb0[0].data(), rgb0[1].data(), rgb0[2].data()); - -cu_mem mem_result = ocl.allocMem(channel_size);// , distmap_.data()); - -cuOpsinDynamicsImageEx(xyb1, xsize, ysize); - -cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step_); - -cuMemcpyDtoH(distmap_.data(), mem_result, channel_size); - -cuMemFree(mem_result); -ocl.releaseMemChannels(xyb0); -ocl.releaseMemChannels(xyb1); -} -#endif -else*/ \ No newline at end of file +} \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index 7bebdd66..d5c04492 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -163,6 +163,7 @@ namespace guetzli { const std::vector* rgb, const float target_distance, ProcessStats* stats); + void Compare(const OutputImage& img) override; void StartBlockComparisons() override; void FinishBlockComparisons() override; @@ -170,5 +171,6 @@ namespace guetzli { public: std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount + std::vector> rgb_orig_opsin; }; } \ No newline at end of file diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h index 6f0451c8..547fdc58 100644 --- 
a/third_party/butteraugli/butteraugli/butteraugli.h +++ b/third_party/butteraugli/butteraugli/butteraugli.h @@ -49,7 +49,7 @@ class ButteraugliComparator { virtual void DiffmapOpsinDynamicsImage(std::vector> &xyb0, std::vector> &xyb1, std::vector &result); - + int step() { return step_;} protected: virtual void BlockDiffMap(const std::vector > &rgb0, const std::vector > &rgb1, @@ -72,7 +72,7 @@ class ButteraugliComparator { const std::vector& edge_detector_map, std::vector* result); -public: +protected: const size_t xsize_; const size_t ysize_; const size_t num_pixels_; From 79bce89559a928cdaef34f757cd35cf5621f945c Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 8 Jun 2017 01:43:40 +0800 Subject: [PATCH 154/189] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=A4=84=E7=90=86png?= =?UTF-8?q?=E6=97=B6=E7=9A=84crash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli/processor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 2603f3f8..86648d17 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -1056,7 +1056,7 @@ bool Process(const Params& params, ProcessStats* stats, std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { comparator.reset( - new ButteraugliComparator(jpg.width, jpg.height, &rgb, + new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); From 9e8bdb31f42e92f4bacf3e4ebb110314c5d481a6 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 8 Jun 2017 16:45:26 +0800 Subject: [PATCH 155/189] =?UTF-8?q?=E8=8A=82=E7=9C=81clComputeBlockZeroing?= =?UTF-8?q?OrderEx=E8=BF=87=E7=A8=8B=E4=B8=AD=E7=9A=84=E5=86=97=E4=BD=99?= =?UTF-8?q?=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clguetzli.cl | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 6ddc81c5..f0e16db0 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -832,6 +832,10 @@ __kernel void clComputeBlockZeroingOrderEx( mayout_block[idx] = old_coeff; } + if (best_err >= BlockErrorLimit) + { // err¶ÓÁÐÊÇÖð½¥Ôö´óµÄ£¬Èç¹ûÕâÀïÒѾ­³¬¹ýErrorLimit£¬ºóÐøµÄ¼ÆËã¾ÍÊÇÈßÓàµÄÁË + break; + } int idx = input_order.pData[best_i].idx; mayout_block[idx] = 0; list_erase(&input_order, best_i); From 43834f77af609f7f056af9b87d9e8e2520744b45 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 8 Jun 2017 16:50:08 +0800 Subject: [PATCH 156/189] =?UTF-8?q?=E9=9D=99=E6=80=81=E5=BA=93=E7=BC=96?= =?UTF-8?q?=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli_static.vcxproj | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj index 1d4d4e3f..05a75f9a 100644 --- a/guetzli_static.vcxproj +++ b/guetzli_static.vcxproj @@ -93,7 +93,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories) Full true true @@ -110,7 +110,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) Full true true @@ -127,7 +127,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + 
.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled @@ -140,7 +140,7 @@ NotUsing Level3 - .;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) EditAndContinue Disabled From e922dbf968fb1a5112ef675f3c5bd59fd126d13a Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 8 Jun 2017 21:44:23 +0800 Subject: [PATCH 157/189] =?UTF-8?q?=E7=BC=96=E8=AF=91=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index e6e2be45..89a05fe2 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -170,7 +170,7 @@ .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled - ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions) + __USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions) Console From 230924bfee5dae001c94e57429bae0cae7b48104 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 9 Jun 2017 10:03:40 +0800 Subject: [PATCH 158/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=EF=BC=8C=E6=94=AF=E6=8C=81=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=E6=89=B9=E9=87=8F=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 89a05fe2..86da4aa7 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ 
-108,7 +108,7 @@ true false true - __USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions) + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console From 0e0edb11d4ddf061863ac4837e5bee61d8d2eac1 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 12 Jun 2017 10:52:15 +0800 Subject: [PATCH 159/189] Merge branch 'master' of https://github.com/ianhuang-777/guetzli Conflicts: tests_tencent/testLinux/cp2test.sh --- compile.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 compile.sh diff --git a/compile.sh b/compile.sh old mode 100644 new mode 100755 From c31af45a96cc98d31c98ed1305e095b5c58eda60 Mon Sep 17 00:00:00 2001 From: strongtu Date: Mon, 12 Jun 2017 16:58:38 +0800 Subject: [PATCH 160/189] =?UTF-8?q?c=E4=BC=98=E5=8C=96=E9=80=89=E9=A1=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.make | 1 + guetzli/processor.cc | 6 ++++++ third_party/butteraugli/butteraugli/butteraugli.cc | 9 +++++++++ 3 files changed, 16 insertions(+) diff --git a/guetzli.make b/guetzli.make index 3675ba0d..10231ff0 100644 --- a/guetzli.make +++ b/guetzli.make @@ -16,6 +16,7 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli DEFINES += -D__USE_CUDA__ + DEFINES += -D__USE_C__ INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 86648d17..432c62f5 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -450,6 +450,12 @@ void Processor::ComputeBlockZeroingOrder( block_x, block_y, &processed_block[c * kDCTBlockSize]); } } +#ifdef __USE_C__ + if (best_err >= comparator_->BlockErrorLimit()) + { // err¶ÓÁÐÊÇÖð½¥Ôö´óµÄ£¬Èç¹ûÕâÀïÒѾ­³¬¹ýErrorLimit£¬ºóÐøµÄ¼ÆËã¾ÍÊÇÈßÓàµÄÁË + break; + } +#endif } // Make the block error values monotonic. 
float min_err = 1e10; diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 74abc526..c21eb6b1 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1319,9 +1319,18 @@ void _MinSquareVal(size_t square_size, size_t offset, const size_t minh = offset > y ? 0 : y - offset; const size_t maxh = std::min(ysize, y + square_size - offset); for (size_t x = 0; x < xsize; ++x) { +#ifdef __USE_C__ + float min = values[x + minh * xsize]; +#else double min = values[x + minh * xsize]; +#endif for (size_t j = minh + 1; j < maxh; ++j) { +#ifdef __USE_C__ + min = std::min(min, values[x + j * xsize]); +#else min = fmin(min, values[x + j * xsize]); +#endif + } tmp[x + y * xsize] = static_cast(min); } From 891def12293474cf3d12a65625617e8bb86a4459 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 13 Jun 2017 00:23:09 +0800 Subject: [PATCH 161/189] =?UTF-8?q?=E4=BC=98=E5=8C=96c=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/butteraugli/butteraugli/butteraugli.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index c21eb6b1..b62e1578 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -1326,7 +1326,8 @@ void _MinSquareVal(size_t square_size, size_t offset, #endif for (size_t j = minh + 1; j < maxh; ++j) { #ifdef __USE_C__ - min = std::min(min, values[x + j * xsize]); + float tmpf = values[x + j * xsize]; + if (tmpf < min) min = tmpf; #else min = fmin(min, values[x + j * xsize]); #endif @@ -1341,7 +1342,12 @@ void _MinSquareVal(size_t square_size, size_t offset, for (size_t y = 0; y < ysize; ++y) { double min = tmp[minw + y * xsize]; for (size_t j = minw + 1; j < maxw; ++j) { 
+#ifdef __USE_C__ + float tmpf = tmp[j + y * xsize]; + if (tmpf < min) min = tmpf; +#else min = fmin(min, tmp[j + y * xsize]); +#endif } values[x + y * xsize] = static_cast(min); } From 1f26bc048da138b2c490d09ec45ecf1c964c2713 Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 13 Jun 2017 23:34:25 +0800 Subject: [PATCH 162/189] =?UTF-8?q?=E4=B8=8D=E4=BC=98=E5=8C=96c=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.make | 1 - 1 file changed, 1 deletion(-) diff --git a/guetzli.make b/guetzli.make index 10231ff0..3675ba0d 100644 --- a/guetzli.make +++ b/guetzli.make @@ -16,7 +16,6 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli DEFINES += -D__USE_CUDA__ - DEFINES += -D__USE_C__ INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) From 742b284fdb637f4b38d0393b55355bb06e20fb14 Mon Sep 17 00:00:00 2001 From: strongtu Date: Thu, 15 Jun 2017 16:14:12 +0800 Subject: [PATCH 163/189] =?UTF-8?q?=E4=BC=98=E5=8C=96c=E7=89=88=E6=9C=AC?= =?UTF-8?q?=20-double=E8=BD=AC=E4=B8=BAfloat=20-blockerror=E6=88=AA?= =?UTF-8?q?=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clguetzli/clbutter_comparator.cpp | 1467 ++++++++++++++++++++++++++++- clguetzli/clbutter_comparator.h | 23 + clguetzli/clguetzli.cl.cpp | 17 +- clguetzli/clguetzli.h | 1 + guetzli/guetzli.cc | 5 + guetzli/processor.cc | 97 +- 6 files changed, 1552 insertions(+), 58 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 0331940b..58e76e54 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -2,6 +2,1262 @@ #include "clguetzli.h" #include "clguetzli_test.h" +#include +#include + +namespace butteraugli { + +static const float kInternalGoodQualityThreshold = 14.921561160295326; +static const 
float kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +inline float DotProductOpt(const float u[3], const float v[3]) { + return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +} + +// Computes a horizontal convolution and transposes the result. +void ConvolutionOpt(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result) { + PROFILER_FUNC; + float weight_no_border = 0; + for (size_t j = 0; j <= 2 * offset; ++j) { + weight_no_border += multipliers[j]; + } + for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { + int minx = x < offset ? 0 : x - offset; + int maxx = std::min(xsize, x + len - offset) - 1; + float weight = 0.0; + for (int j = minx; j <= maxx; ++j) { + weight += multipliers[j - x + offset]; + } + // Interpolate linearly between the no-border scaling and border scaling. + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + for (size_t y = 0; y < ysize; ++y) { + float sum = 0.0; + for (int j = minx; j <= maxx; ++j) { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + result[ox * ysize + y] = static_cast(sum * scale); + } + } +} + +void BlurOpt(size_t xsize, size_t ysize, float* channel, float sigma, + float border_ratio) { + PROFILER_FUNC; + float m = 2.25; // Accuracy increases when m is increased. 
+ const float scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + const int xstep = std::max(1, int(sigma / 3)); + const int ystep = xstep; + int dxsize = (xsize + xstep - 1) / xstep; + int dysize = (ysize + ystep - 1) / ystep; + std::vector tmp(dxsize * ysize); + ConvolutionOpt(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, + border_ratio, + tmp.data()); + float* output = channel; + std::vector downsampled_output; + if (xstep > 1) { + downsampled_output.resize(dxsize * dysize); + output = downsampled_output.data(); + } + ConvolutionOpt(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + border_ratio, output); + if (xstep > 1) { + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + // TODO: Use correct rounding. + channel[y * xsize + x] = + downsampled_output[(y / ystep) * dxsize + (x / xstep)]; + } + } + } +} + +// To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. +constexpr size_t kBlockEdge = 8; +constexpr size_t kBlockSize = kBlockEdge * kBlockEdge; +constexpr size_t kBlockEdgeHalf = kBlockEdge / 2; +constexpr size_t kBlockHalf = kBlockEdge * kBlockEdgeHalf; + +// Contrast sensitivity related weights. 
+static const float *GetContrastSensitivityMatrixOpt() { + static float csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, + }; + return &csf8x8[0]; +} + +std::array MakeHighFreqColorDiffDxOpt() { + std::array lut; + static const float off = 11.38708334481672; + static const float inc = 14.550189611520716; + lut[0] = 0.0; + lut[1] = off; + for (int i = 2; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetHighFreqColorDiffDxOpt() { + static const std::array kLut = MakeHighFreqColorDiffDxOpt(); + return kLut.data(); +} + +std::array MakeHighFreqColorDiffDyOpt() { + std::array lut; + static const float off = 1.4103373714040413; + static const float inc = 0.7084088867024; + lut[0] = 0.0; + lut[1] = off; + for (int i = 2; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetHighFreqColorDiffDyOpt() { + static const std::array kLut = MakeHighFreqColorDiffDyOpt(); + return kLut.data(); +} + +std::array MakeLowFreqColorDiffDyOpt() { + std::array lut; + static const float inc = 5.2511644570349185; + lut[0] = 0.0; + for (int i = 1; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetLowFreqColorDiffDyOpt() { + static const std::array kLut = MakeLowFreqColorDiffDyOpt(); + return kLut.data(); +} + +inline float InterpolateOpt(const float *array, int size, float sx) { + float ix = fabs(sx); + assert(ix < 10000); + int baseix = 
static_cast(ix); + float res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + float mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + if (sx < 0) res = -res; + return res; +} + +inline float InterpolateClampNegativeOpt(const float *array, + int size, float sx) { + if (sx < 0) { + sx = 0; + } + float ix = fabs(sx); + int baseix = static_cast(ix); + float res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + float mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + return res; +} + +void RgbToXybOpt(float r, float g, float b, + float *valx, float *valy, float *valz) { + static const float a0 = 1.01611726948; + static const float a1 = 0.982482243696; + static const float a2 = 1.43571362627; + static const float a3 = 0.896039849412; + *valx = a0 * r - a1 * g; + *valy = a2 * r + a3 * g; + *valz = b; +} + +static inline void XybToValsOpt(float x, float y, float z, + float *valx, float *valy, float *valz) { + static const float xmul = 0.758304045695; + static const float ymul = 2.28148649801; + static const float zmul = 1.87816926918; + *valx = InterpolateOpt(GetHighFreqColorDiffDxOpt(), 21, x * xmul); + *valy = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y * ymul); + *valz = zmul * z; +} + +// Rough psychovisual distance to gray for low frequency colors. 
+static void XybLowFreqToValsOpt(float x, float y, float z, + float *valx, float *valy, float *valz) { + static const float xmul = 6.64482198135; + static const float ymul = 0.837846224276; + static const float zmul = 7.34905756986; + static const float y_to_z_mul = 0.0812519812628; + z += y_to_z_mul * y; + *valz = z * zmul; + *valx = x * xmul; + *valy = InterpolateOpt(GetLowFreqColorDiffDyOpt(), 21, y * ymul); +} + +float RemoveRangeAroundZeroOpt(float v, float range) { + if (v >= -range && v < range) { + return 0; + } + if (v < 0) { + return v + range; + } + else { + return v - range; + } +} + +void XybDiffLowFreqSquaredAccumulateOpt(float r0, float g0, float b0, + float r1, float g1, float b1, + float factor, float res[3]) { + float valx0, valy0, valz0; + float valx1, valy1, valz1; + XybLowFreqToValsOpt(r0, g0, b0, &valx0, &valy0, &valz0); + if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { + PROFILER_ZONE("XybDiff r1=g1=b1=0"); + res[0] += factor * valx0 * valx0; + res[1] += factor * valy0 * valy0; + res[2] += factor * valz0 * valz0; + return; + } + XybLowFreqToValsOpt(r1, g1, b1, &valx1, &valy1, &valz1); + // Approximate the distance of the colors by their respective distances + // to gray. + float valx = valx0 - valx1; + float valy = valy0 - valy1; + float valz = valz0 - valz1; + res[0] += factor * valx * valx; + res[1] += factor * valy * valy; + res[2] += factor * valz * valz; +} + +struct ComplexOpt { +public: + float real; + float imag; +}; + +inline float abssq(const ComplexOpt& c) { + return c.real * c.real + c.imag * c.imag; +} + +static void TransposeBlock(ComplexOpt data[kBlockSize]) { + for (int i = 0; i < kBlockEdge; i++) { + for (int j = 0; j < i; j++) { + std::swap(data[kBlockEdge * i + j], data[kBlockEdge * j + i]); + } + } +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements. 
+inline void FFT4Opt(ComplexOpt* a) { + float t1, t2, t3, t4, t5, t6, t7, t8; + t5 = a[2].real; + t1 = a[0].real - t5; + t7 = a[3].real; + t5 += a[0].real; + t3 = a[1].real - t7; + t7 += a[1].real; + t8 = t5 + t7; + a[0].real = t8; + t5 -= t7; + a[1].real = t5; + t6 = a[2].imag; + t2 = a[0].imag - t6; + t6 += a[0].imag; + t5 = a[3].imag; + a[2].imag = t2 + t3; + t2 -= t3; + a[3].imag = t2; + t4 = a[1].imag - t5; + a[3].real = t1 + t4; + t1 -= t4; + a[2].real = t1; + t5 += a[1].imag; + a[0].imag = t6 + t5; + t6 -= t5; + a[1].imag = t6; +} + +static const float kSqrtHalf = 0.70710678118654752440084436210484903; + +// D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. +void FFT8OptOpt(ComplexOpt* a) { + float t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = a[4].imag; + t4 = a[0].imag - t7; + t7 += a[0].imag; + a[0].imag = t7; + + t8 = a[6].real; + t5 = a[2].real - t8; + t8 += a[2].real; + a[2].real = t8; + + t7 = a[6].imag; + a[6].imag = t4 - t5; + t4 += t5; + a[4].imag = t4; + + t6 = a[2].imag - t7; + t7 += a[2].imag; + a[2].imag = t7; + + t8 = a[4].real; + t3 = a[0].real - t8; + t8 += a[0].real; + a[0].real = t8; + + a[4].real = t3 - t6; + t3 += t6; + a[6].real = t3; + + t7 = a[5].real; + t3 = a[1].real - t7; + t7 += a[1].real; + a[1].real = t7; + + t8 = a[7].imag; + t6 = a[3].imag - t8; + t8 += a[3].imag; + a[3].imag = t8; + t1 = t3 - t6; + t3 += t6; + + t7 = a[5].imag; + t4 = a[1].imag - t7; + t7 += a[1].imag; + a[1].imag = t7; + + t8 = a[7].real; + t5 = a[3].real - t8; + t8 += a[3].real; + a[3].real = t8; + + t2 = t4 - t5; + t4 += t5; + + t6 = t1 - t4; + t8 = kSqrtHalf; + t6 *= t8; + a[5].real = a[4].real - t6; + t1 += t4; + t1 *= t8; + a[5].imag = a[4].imag - t1; + t6 += a[4].real; + a[4].real = t6; + t1 += a[4].imag; + a[4].imag = t1; + + t5 = t2 - t3; + t5 *= t8; + a[7].imag = a[6].imag - t5; + t2 += t3; + t2 *= t8; + a[7].real = a[6].real - t2; + t2 += a[6].real; + a[6].real = t2; + t5 += a[6].imag; + a[6].imag = t5; + + FFT4Opt(a); + + // Reorder to 
the correct output order. + // TODO: Modify the above computation so that this is not needed. + ComplexOpt tmp = a[2]; + a[2] = a[3]; + a[3] = a[5]; + a[5] = a[7]; + a[7] = a[4]; + a[4] = a[1]; + a[1] = a[6]; + a[6] = tmp; +} + +// Same as FFT8, but all inputs are real. +// TODO: Since this does not need to be in-place, maybe there is a +// faster FFT than this one, which is derived from DJB's in-place complex FFT. +void RealFFT8Opt(const float* in, ComplexOpt* out) { + float t1, t2, t3, t5, t6, t7, t8; + t8 = in[6]; + t5 = in[2] - t8; + t8 += in[2]; + out[2].real = t8; + out[6].imag = -t5; + out[4].imag = t5; + t8 = in[4]; + t3 = in[0] - t8; + t8 += in[0]; + out[0].real = t8; + out[4].real = t3; + out[6].real = t3; + t7 = in[5]; + t3 = in[1] - t7; + t7 += in[1]; + out[1].real = t7; + t8 = in[7]; + t5 = in[3] - t8; + t8 += in[3]; + out[3].real = t8; + t2 = -t5; + t6 = t3 - t5; + t8 = kSqrtHalf; + t6 *= t8; + out[5].real = out[4].real - t6; + t1 = t3 + t5; + t1 *= t8; + out[5].imag = out[4].imag - t1; + t6 += out[4].real; + out[4].real = t6; + t1 += out[4].imag; + out[4].imag = t1; + t5 = t2 - t3; + t5 *= t8; + out[7].imag = out[6].imag - t5; + t2 += t3; + t2 *= t8; + out[7].real = out[6].real - t2; + t2 += out[6].real; + out[6].real = t2; + t5 += out[6].imag; + out[6].imag = t5; + t5 = out[2].real; + t1 = out[0].real - t5; + t7 = out[3].real; + t5 += out[0].real; + t3 = out[1].real - t7; + t7 += out[1].real; + t8 = t5 + t7; + out[0].real = t8; + t5 -= t7; + out[1].real = t5; + out[2].imag = t3; + out[3].imag = -t3; + out[3].real = t1; + out[2].real = t1; + out[0].imag = 0; + out[1].imag = 0; + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. + ComplexOpt tmp = out[2]; + out[2] = out[3]; + out[3] = out[5]; + out[5] = out[7]; + out[7] = out[4]; + out[4] = out[1]; + out[1] = out[6]; + out[6] = tmp; +} + +// Fills in block[kBlockEdgeHalf..(kBlockHalf+kBlockEdgeHalf)], and leaves the +// rest unmodified. 
+void ButteraugliFFTSquaredOpt(float block[kBlockSize]) { + float global_mul = 0.000064; + ComplexOpt block_c[kBlockSize]; + assert(kBlockEdge == 8); + for (int y = 0; y < kBlockEdge; ++y) { + RealFFT8Opt(block + y * kBlockEdge, block_c + y * kBlockEdge); + } + TransposeBlock(block_c); + float r0[kBlockEdge]; + float r1[kBlockEdge]; + for (int x = 0; x < kBlockEdge; ++x) { + r0[x] = block_c[x].real; + r1[x] = block_c[kBlockHalf + x].real; + } + RealFFT8Opt(r0, block_c); + RealFFT8Opt(r1, block_c + kBlockHalf); + for (int y = 1; y < kBlockEdgeHalf; ++y) { + FFT8OptOpt(block_c + y * kBlockEdge); + } + for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + block[i] = abssq(block_c[i]); + block[i] *= global_mul; + } +} + +// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared +// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average +// diff on the edges to diff_xyb_edge_dc. +void ButteraugliBlockDiffOpt(float xyb0[3 * kBlockSize], + float xyb1[3 * kBlockSize], + float diff_xyb_dc[3], + float diff_xyb_ac[3], + float diff_xyb_edge_dc[3]) { + PROFILER_FUNC; + const float *csf8x8 = GetContrastSensitivityMatrixOpt(); + + float avgdiff_xyb[3] = { 0.0 }; + float avgdiff_edge[3][4] = { { 0.0 } }; + for (int i = 0; i < 3 * kBlockSize; ++i) { + const float diff_xyb = xyb0[i] - xyb1[i]; + const int c = i / kBlockSize; + avgdiff_xyb[c] += diff_xyb / kBlockSize; + const int k = i % kBlockSize; + const int kx = k % kBlockEdge; + const int ky = k / kBlockEdge; + const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1; + const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 
2 : -1; + if (h_edge_idx >= 0) { + avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge; + } + if (v_edge_idx >= 0) { + avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge; + } + } + XybDiffLowFreqSquaredAccumulateOpt(avgdiff_xyb[0], + avgdiff_xyb[1], + avgdiff_xyb[2], + 0, 0, 0, csf8x8[0], + diff_xyb_dc); + for (int i = 0; i < 4; ++i) { + XybDiffLowFreqSquaredAccumulateOpt(avgdiff_edge[0][i], + avgdiff_edge[1][i], + avgdiff_edge[2][i], + 0, 0, 0, csf8x8[0], + diff_xyb_edge_dc); + } + + float* xyb_avg = xyb0; + float* xyb_halfdiff = xyb1; + for (int i = 0; i < 3 * kBlockSize; ++i) { + float avg = (xyb0[i] + xyb1[i]) / 2; + float halfdiff = (xyb0[i] - xyb1[i]) / 2; + xyb_avg[i] = avg; + xyb_halfdiff[i] = halfdiff; + } + float *y_avg = &xyb_avg[kBlockSize]; + float *x_halfdiff_squared = &xyb_halfdiff[0]; + float *y_halfdiff = &xyb_halfdiff[kBlockSize]; + float *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize]; + ButteraugliFFTSquaredOpt(y_avg); + ButteraugliFFTSquaredOpt(x_halfdiff_squared); + ButteraugliFFTSquaredOpt(y_halfdiff); + ButteraugliFFTSquaredOpt(z_halfdiff_squared); + + static const float xmul = 64.8; + static const float ymul = 1.753123908348329; + static const float ymul2 = 1.51983458269; + static const float zmul = 2.4; + + for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + float d = csf8x8[i]; + diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i]; + diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i]; + + y_avg[i] = sqrt(y_avg[i]); + y_halfdiff[i] = sqrt(y_halfdiff[i]); + float y0 = y_avg[i] - y_halfdiff[i]; + float y1 = y_avg[i] + y_halfdiff[i]; + // Remove the impact of small absolute values. + // This improves the behavior with flat noise. 
+ static const float ylimit = 0.04; + y0 = RemoveRangeAroundZeroOpt(y0, ylimit); + y1 = RemoveRangeAroundZeroOpt(y1, ylimit); + if (y0 != y1) { + float valy0 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y0 * ymul2); + float valy1 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y1 * ymul2); + float valy = ymul * (valy0 - valy1); + diff_xyb_ac[1] += d * valy * valy; + } + } +} + +// Low frequency edge detectors. +// Two edge detectors are applied in each corner of the 8x8 square. +// The squared 3-dimensional error vector is added to diff_xyb. +void Butteraugli8x8CornerEdgeDetectorDiffOpt( + const size_t pos_x, + const size_t pos_y, + const size_t xsize, + const size_t ysize, + const std::vector > &blurred0, + const std::vector > &blurred1, + float diff_xyb[3]) { + PROFILER_FUNC; + int local_count = 0; + float local_xyb[3] = { 0 }; + static const float w = 0.711100840192; + for (int k = 0; k < 4; ++k) { + size_t step = 3; + size_t offset[4][2] = { { 0, 0 },{ 0, 7 },{ 7, 0 },{ 7, 7 } }; + size_t x = pos_x + offset[k][0]; + size_t y = pos_y + offset[k][1]; + if (x >= step && x + step < xsize) { + size_t ix = y * xsize + (x - step); + size_t ix2 = ix + 2 * step; + XybDiffLowFreqSquaredAccumulateOpt( + w * (blurred0[0][ix] - blurred0[0][ix2]), + w * (blurred0[1][ix] - blurred0[1][ix2]), + w * (blurred0[2][ix] - blurred0[2][ix2]), + w * (blurred1[0][ix] - blurred1[0][ix2]), + w * (blurred1[1][ix] - blurred1[1][ix2]), + w * (blurred1[2][ix] - blurred1[2][ix2]), + 1.0, local_xyb); + ++local_count; + } + if (y >= step && y + step < ysize) { + size_t ix = (y - step) * xsize + x; + size_t ix2 = ix + 2 * step * xsize; + XybDiffLowFreqSquaredAccumulateOpt( + w * (blurred0[0][ix] - blurred0[0][ix2]), + w * (blurred0[1][ix] - blurred0[1][ix2]), + w * (blurred0[2][ix] - blurred0[2][ix2]), + w * (blurred1[0][ix] - blurred1[0][ix2]), + w * (blurred1[1][ix] - blurred1[1][ix2]), + w * (blurred1[2][ix] - blurred1[2][ix2]), + 1.0, local_xyb); + ++local_count; + } + } + static 
const float weight = 0.01617112696; + const float mul = weight * 8.0 / local_count; + for (int i = 0; i < 3; ++i) { + diff_xyb[i] += mul * local_xyb[i]; + } +} + +// https://en.wikipedia.org/wiki/Photopsin absordance modeling. +const float *GetOpsinAbsorbanceOpt() { + static const float kMix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, + }; + return &kMix[0]; +} + +void OpsinAbsorbanceOpt(const float in[3], float out[3]) { + const float *mix = GetOpsinAbsorbanceOpt(); + out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3]; + out[1] = mix[4] * in[0] + mix[5] * in[1] + mix[6] * in[2] + mix[7]; + out[2] = mix[8] * in[0] + mix[9] * in[1] + mix[10] * in[2] + mix[11]; +} + +float GammaMinArgOpt() { + float in[3] = { 0.0, 0.0, 0.0 }; + float out[3]; + OpsinAbsorbanceOpt(in, out); + return std::min(out[0], std::min(out[1], out[2])); +} + +float GammaMaxArgOpt() { + float in[3] = { 255.0, 255.0, 255.0 }; + float out[3]; + OpsinAbsorbanceOpt(in, out); + return std::max(out[0], std::max(out[1], out[2])); +} + +void MaskHighIntensityChangeOpt( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1) { + PROFILER_FUNC; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t ix = y * xsize + x; + const float ave[3] = { + (c0[0][ix] + c1[0][ix]) * 0.5, + (c0[1][ix] + c1[1][ix]) * 0.5, + (c0[2][ix] + c1[2][ix]) * 0.5, + }; + float sqr_max_diff = -1; + { + int offset[4] = + { -1, 1, -static_cast(xsize), static_cast(xsize) }; + int border[4] = + { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + float diff = 0.5 * (c0[1][ix2] + c1[1][ix2]) - ave[1]; + diff *= diff; + if 
(sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + static const float kReductionX = 275.19165240059317; + static const float kReductionY = 18599.41286306991; + static const float kReductionZ = 410.8995306951065; + static const float kChromaBalance = 106.95800948271017; + float chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const float mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + for (int i = 0; i < 3; ++i) { + xyb0[i][ix] = static_cast(mix[i] * c0[i][ix] + (1 - mix[i]) * ave[i]); + xyb1[i][ix] = static_cast(mix[i] * c1[i][ix] + (1 - mix[i]) * ave[i]); + } + } + } +} + +float SimpleGammaOpt(float v) { + static const float kGamma = 0.387494322593; + static const float limit = 43.01745241042018; + float bright = v - limit; + if (bright >= 0) { + static const float mul = 0.0383723643799; + v -= bright * mul; + } + static const float limit2 = 94.68634353321337; + float bright2 = v - limit2; + if (bright2 >= 0) { + static const float mul = 0.22885405968; + v -= bright2 * mul; + } + static const float offset = 0.156775786057; + static const float scale = 8.898059160493739; + float retval = scale * (offset + pow(v, kGamma)); + return retval; +} + +// Polynomial evaluation via Clenshaw's scheme (similar to Horner's). +// Template enables compile-time unrolling of the recursion, but must reside +// outside of a class due to the specialization. 
+template +static inline void ClenshawRecursionOpt(const float x, const float *coefficients, + float *b1, float *b2) { + const float x_b1 = x * (*b1); + const float t = (x_b1 + x_b1) - (*b2) + coefficients[INDEX]; + *b2 = *b1; + *b1 = t; + + ClenshawRecursionOpt(x, coefficients, b1, b2); +} + +// Base case +template <> +inline void ClenshawRecursionOpt<0>(const float x, const float *coefficients, + float *b1, float *b2) { + const float x_b1 = x * (*b1); + // The final iteration differs - no 2 * x_b1 here. + *b1 = x_b1 - (*b2) + coefficients[0]; +} + +// Rational polynomial := dividing two polynomial evaluations. These are easier +// to find than minimax polynomials. +struct RationalPolynomialOpt { + template + static float EvaluatePolynomial(const float x, + const float(&coefficients)[N]) { + float b1 = 0.0; + float b2 = 0.0; + ClenshawRecursionOpt(x, coefficients, &b1, &b2); + return b1; + } + + // Evaluates the polynomial at x (in [min_value, max_value]). + inline float operator()(const float x) const { + // First normalize to [0, 1]. + const float x01 = (x - min_value) / (max_value - min_value); + // And then to [-1, 1] domain of Chebyshev polynomials. + const float xc = 2.0 * x01 - 1.0; + + const float yp = EvaluatePolynomial(xc, p); + const float yq = EvaluatePolynomial(xc, q); + if (yq == 0.0) return 0.0; + return static_cast(yp / yq); + } + + // Domain of the polynomials; they are undefined elsewhere. + float min_value; + float max_value; + + // Coefficients of T_n (Chebyshev polynomials of the first kind). + // Degree 5/5 is a compromise between accuracy (0.1%) and numerical stability. + float p[5 + 1]; + float q[5 + 1]; +}; + +static inline float GammaPolynomialOpt(float value) { + // Generated by gamma_polynomial.m from equispaced x/gamma(x) samples. 
+ static const RationalPolynomialOpt r = { + 0.770000000000000, 274.579999999999984, + { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, + }, + { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, + } }; + return static_cast(r(value)); +} + +static inline float GammaOpt(float v) { + // return SimpleGamma(v); + return GammaPolynomialOpt(static_cast(v)); +} + +void OpsinDynamicsImageOpt(size_t xsize, size_t ysize, + std::vector > &rgb) { + PROFILER_FUNC; + std::vector > blurred = rgb; + static const float kSigma = 1.1; + for (int i = 0; i < 3; ++i) { + BlurOpt(xsize, ysize, blurred[i].data(), kSigma, 0.0); + } + for (size_t i = 0; i < rgb[0].size(); ++i) { + float sensitivity[3]; + { + // Calculate sensitivity[3] based on the smoothed image gamma derivative. + float pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] }; + float pre_mixed[3]; + OpsinAbsorbanceOpt(pre_rgb, pre_mixed); + sensitivity[0] = GammaOpt(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = GammaOpt(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = GammaOpt(pre_mixed[2]) / pre_mixed[2]; + } + float cur_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] }; + float cur_mixed[3]; + OpsinAbsorbanceOpt(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + float x, y, z; + RgbToXybOpt(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + rgb[0][i] = static_cast(x); + rgb[1][i] = static_cast(y); + rgb[2][i] = static_cast(z); + } +} + +void ScaleImageOpt(float scale, std::vector *result) { + PROFILER_FUNC; + for (size_t i = 0; i < result->size(); ++i) { + (*result)[i] *= static_cast(scale); + } +} + +// Making a cluster of local errors to be more impactful than +// just a single error. 
+void CalculateDiffmapOpt(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap) { + PROFILER_FUNC; + // Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5 + // pixels distance over the original image. The border of 2 pixels on top and + // left side and 3 pixels on right and bottom side are zeroed, but these + // values have no meaning, they only exist to keep the result map the same + // size as the input images. + int s2 = (8 - step) / 2; + { + // Upsample and take square root. + std::vector diffmap_out(xsize * ysize); + const size_t res_xsize = (xsize + step - 1) / step; + for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) { + for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) { + size_t res_ix = (res_y * res_xsize + res_x) / step; + float orig_val = (*diffmap)[res_ix]; + constexpr float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + float val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? 
kInitialSlope * orig_val + : std::sqrt(orig_val); + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(res_y + off_y + s2) * xsize + + res_x + off_x + s2] = val; + } + } + } + } + *diffmap = diffmap_out; + } + { + static const float kSigma = 8.8510880283; + static const float mul1 = 24.8235314874; + static const float scale = 1.0 / (1.0 + mul1); + const int s = 8 - step; + std::vector blurred((xsize - s) * (ysize - s)); + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2]; + } + } + static const float border_ratio = 0.03027655136; + BlurOpt(xsize - s, ysize - s, blurred.data(), kSigma, border_ratio); + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + (*diffmap)[(y + s2) * xsize + x + s2] + += static_cast(mul1) * blurred[y * (xsize - s) + x]; + } + } + ScaleImageOpt(scale, diffmap); + } +} + +static std::array MakeMaskOpt( + float extmul, float extoff, + float mul, float offset, + float scaler) { + std::array lut; + for (size_t i = 0; i < lut.size(); ++i) { + const float c = mul / ((0.01 * scaler * i) + offset); + lut[i] = 1.0 + extmul * (c + extoff); + assert(lut[i] >= 0.0); + lut[i] *= lut[i]; + } + return lut; +} + +float MaskXOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.975741017749; + static const float extoff = -4.25328244168; + static const float offset = 0.454909521427; + static const float scaler = 0.0738288224836; + static const float mul = 20.8029176447; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskYOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.373995618954; + static const float extoff = 1.5307267433; + static const float offset = 0.911952641929; + static const float scaler = 1.1731667845; 
+ static const float mul = 16.2447033988; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskBOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.61582234137; + static const float extoff = -4.25376118646; + static const float offset = 1.05105070921; + static const float scaler = 0.47434643535; + static const float mul = 31.1444967089; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcXOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 1.79116943438; + static const float extoff = -3.86797479189; + static const float offset = 0.670960225853; + static const float scaler = 0.486575865525; + static const float mul = 20.4563479139; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcYOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.212223514236; + static const float extoff = -3.65647120524; + static const float offset = 1.73396799447; + static const float scaler = 0.170392660501; + static const float mul = 21.6566724788; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcBOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.349376011816; + static const float extoff = -0.894711072781; + static const float offset = 0.901647926679; + static const float scaler = 0.380086095024; + static const float mul = 18.0373825149; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +// Replaces values[x + y * xsize] with the minimum of the values in 
the +// square_size square with coordinates +// x - offset .. x + square_size - offset - 1, +// y - offset .. y + square_size - offset - 1. +void MinSquareValOpt(size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values) { + PROFILER_FUNC; + // offset is not negative and smaller than square_size. + assert(offset < square_size); + std::vector tmp(xsize * ysize); + for (size_t y = 0; y < ysize; ++y) { + const size_t minh = offset > y ? 0 : y - offset; + const size_t maxh = std::min(ysize, y + square_size - offset); + for (size_t x = 0; x < xsize; ++x) { + float min = values[x + minh * xsize]; + for (size_t j = minh + 1; j < maxh; ++j) { + float tmpf = values[x + j * xsize]; + if (tmpf < min) min = tmpf; + } + tmp[x + y * xsize] = static_cast(min); + } + } + for (size_t x = 0; x < xsize; ++x) { + const size_t minw = offset > x ? 0 : x - offset; + const size_t maxw = std::min(xsize, x + square_size - offset); + for (size_t y = 0; y < ysize; ++y) { + float min = tmp[minw + y * xsize]; + for (size_t j = minw + 1; j < maxw; ++j) { + float tmpf = tmp[j + y * xsize]; + if (tmpf < min) min = tmpf; + } + values[x + y * xsize] = static_cast(min); + } + } +} + +void Average5x5Opt(int xsize, int ysize, std::vector* diffs) { + PROFILER_FUNC; + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + static const float w = 0.679144890667f; + static const float scale = 1.0f / (5.0f + 4 * w); + std::vector result = *diffs; + std::vector tmp0 = *diffs; + std::vector tmp1 = *diffs; + ScaleImage(w, &tmp1); + for (int y = 0; y < ysize; y++) { + const int row0 = y * xsize; + result[row0 + 1] += tmp0[row0]; + result[row0 + 0] += tmp0[row0 + 1]; + result[row0 + 2] += tmp0[row0 + 1]; + for (int x = 2; x < xsize - 2; ++x) { + result[row0 + x - 1] += tmp0[row0 + x]; + result[row0 + x + 1] += tmp0[row0 + x]; + } + result[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; + if (y > 0) { + const int rowd1 = row0 - xsize; + result[rowd1 + 1] += tmp1[row0]; + result[rowd1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowd1 + x + 1] += tmp1[row0 + x]; + result[rowd1 + x + 0] += tmp0[row0 + x]; + result[rowd1 + x - 1] += tmp1[row0 + x]; + } + result[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; + result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + result[rowu1 + 1] += tmp1[row0]; + result[rowu1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowu1 + x + 1] += tmp1[row0 + x]; + result[rowu1 + x + 0] += tmp0[row0 + x]; + result[rowu1 + x - 1] += tmp1[row0 + x]; + } + result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; + result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + } + *diffs = result; + ScaleImageOpt(scale, diffs); +} + +void DiffPrecomputeOpt( + const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask) { + PROFILER_FUNC; + mask->resize(3, std::vector(xyb0[0].size())); + float valsh0[3] = { 0.0 }; + float valsv0[3] = { 0.0 }; + float valsh1[3] = { 0.0 }; + float valsv1[3] = { 0.0 }; + int ix2; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t ix = x + xsize * 
y; + if (x + 1 < xsize) { + ix2 = ix + 1; + } + else { + ix2 = ix - 1; + } + { + float x0 = (xyb0[0][ix] - xyb0[0][ix2]); + float y0 = (xyb0[1][ix] - xyb0[1][ix2]); + float z0 = (xyb0[2][ix] - xyb0[2][ix2]); + XybToValsOpt(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); + float x1 = (xyb1[0][ix] - xyb1[0][ix2]); + float y1 = (xyb1[1][ix] - xyb1[1][ix2]); + float z1 = (xyb1[2][ix] - xyb1[2][ix2]); + XybToValsOpt(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); + } + if (y + 1 < ysize) { + ix2 = ix + xsize; + } + else { + ix2 = ix - xsize; + } + { + float x0 = (xyb0[0][ix] - xyb0[0][ix2]); + float y0 = (xyb0[1][ix] - xyb0[1][ix2]); + float z0 = (xyb0[2][ix] - xyb0[2][ix2]); + XybToValsOpt(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); + float x1 = (xyb1[0][ix] - xyb1[0][ix2]); + float y1 = (xyb1[1][ix] - xyb1[1][ix2]); + float z1 = (xyb1[2][ix] - xyb1[2][ix2]); + XybToValsOpt(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); + } + for (int i = 0; i < 3; ++i) { + float sup0 = fabs(valsh0[i]) + fabs(valsv0[i]); + float sup1 = fabs(valsh1[i]) + fabs(valsv1[i]); + float m = std::min(sup0, sup1); + (*mask)[i][ix] = static_cast(m); + } + } + } +} + +void MaskOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc) { + PROFILER_FUNC; + mask->resize(3); + for (int i = 0; i < 3; ++i) { + (*mask)[i].resize(xsize * ysize); + } + DiffPrecomputeOpt(xyb0, xyb1, xsize, ysize, mask); + for (int i = 0; i < 3; ++i) { + _Average5x5(xsize, ysize, &(*mask)[i]); + MinSquareValOpt(4, 0, xsize, ysize, (*mask)[i].data()); + static const float sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + BlurOpt(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); + } + static const float w00 = 232.206464018; + static const float w11 = 22.9455222245; + static const float w22 = 503.962310606; + + mask_dc->resize(3); + for (int i = 0; i < 3; ++i) { + (*mask_dc)[i].resize(xsize * ysize); + } + for (size_t y 
= 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + const size_t idx = y * xsize + x; + const float s0 = (*mask)[0][idx]; + const float s1 = (*mask)[1][idx]; + const float s2 = (*mask)[2][idx]; + const float p0 = w00 * s0; + const float p1 = w11 * s1; + const float p2 = w22 * s2; + + (*mask)[0][idx] = static_cast(MaskXOpt(p0)); + (*mask)[1][idx] = static_cast(MaskYOpt(p1)); + (*mask)[2][idx] = static_cast(MaskBOpt(p2)); + (*mask_dc)[0][idx] = static_cast(MaskDcXOpt(p0)); + (*mask_dc)[1][idx] = static_cast(MaskDcYOpt(p1)); + (*mask_dc)[2][idx] = static_cast(MaskDcBOpt(p2)); + } + } + for (int i = 0; i < 3; ++i) { + ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask)[i]); + ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); + } +} + +} + namespace butteraugli { clButteraugliComparator::clButteraugliComparator(size_t xsize, size_t ysize, int step) @@ -29,12 +1285,17 @@ namespace butteraugli xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); } #endif + else if (MODE_CPU_OPT == g_mathMode) + { + DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result); + } else { ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result); } } + void clButteraugliComparator::BlockDiffMap(const std::vector > &xyb0, const std::vector > &xyb1, std::vector* block_diff_dc, @@ -50,8 +1311,7 @@ namespace butteraugli (*block_diff_dc).data(), (*block_diff_ac).data()); } } - - + void clButteraugliComparator::EdgeDetectorMap(const std::vector > &xyb0, const std::vector > &xyb1, std::vector* edge_detector_map) @@ -109,6 +1369,186 @@ namespace butteraugli } } + void clButteraugliComparator::DiffmapOpsinDynamicsImageOpt( + std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result) + { + if (xsize_ < 8 || ysize_ < 8) return; + { + auto xyb0_c = xyb0; + auto xyb1_c = xyb1; + MaskHighIntensityChangeOpt(xsize_, ysize_, xyb0_c, xyb1_c, xyb0, xyb1); + } + assert(8 <= xsize_); + for (int i = 0; i < 3; i++) { + assert(xyb0[i].size() == num_pixels_); + 
assert(xyb1[i].size() == num_pixels_); + } + std::vector edge_detector_map(3 * res_xsize_ * res_ysize_); + EdgeDetectorMapOpt(xyb0, xyb1, &edge_detector_map); + std::vector block_diff_dc(3 * res_xsize_ * res_ysize_); + std::vector block_diff_ac(3 * res_xsize_ * res_ysize_); + BlockDiffMapOpt(xyb0, xyb1, &block_diff_dc, &block_diff_ac); + EdgeDetectorLowFreqOpt(xyb0, xyb1, &block_diff_ac); + { + std::vector > mask_xyb(3); + std::vector > mask_xyb_dc(3); + MaskOpt(xyb0, xyb1, xsize_, ysize_, &mask_xyb, &mask_xyb_dc); + CombineChannelsOpt(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, + edge_detector_map, &result); + } + CalculateDiffmapOpt(xsize_, ysize_, step_, &result); + } + + void clButteraugliComparator::BlockDiffMapOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac) + { + for (size_t res_y = 0; res_y + (kBlockEdge - step_ - 1) < ysize_; + res_y += step_) { + for (size_t res_x = 0; res_x + (kBlockEdge - step_ - 1) < xsize_; + res_x += step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + size_t offset = (std::min(res_y, ysize_ - 8) * xsize_ + + std::min(res_x, xsize_ - 8)); + float block0[3 * kBlockEdge * kBlockEdge]; + float block1[3 * kBlockEdge * kBlockEdge]; + for (int i = 0; i < 3; ++i) { + float *m0 = &block0[i * kBlockEdge * kBlockEdge]; + float *m1 = &block1[i * kBlockEdge * kBlockEdge]; + for (size_t y = 0; y < kBlockEdge; y++) { + for (size_t x = 0; x < kBlockEdge; x++) { + m0[kBlockEdge * y + x] = xyb0[i][offset + y * xsize_ + x]; + m1[kBlockEdge * y + x] = xyb1[i][offset + y * xsize_ + x]; + } + } + } + float diff_xyb_dc[3] = { 0.0 }; + float diff_xyb_ac[3] = { 0.0 }; + float diff_xyb_edge_dc[3] = { 0.0 }; + ButteraugliBlockDiffOpt(block0, block1, + diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + for (int i = 0; i < 3; ++i) { + (*block_diff_dc)[3 * res_ix + i] = static_cast(diff_xyb_dc[i]); + (*block_diff_ac)[3 * res_ix + i] = static_cast(diff_xyb_ac[i]); + } + } + 
} + } + + void clButteraugliComparator::EdgeDetectorMapOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* edge_detector_map) + { + static const float kSigma[3] = { + 1.5, + 0.586, + 0.4, + }; + std::vector > blurred0(xyb0); + std::vector > blurred1(xyb1); + for (int i = 0; i < 3; i++) { + BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma[i], 0.0); + BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma[i], 0.0); + } + for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { + for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + float diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiffOpt(std::min(res_x, xsize_ - 8), + std::min(res_y, ysize_ - 8), + xsize_, ysize_, + blurred0, blurred1, + diff_xyb); + for (int i = 0; i < 3; ++i) { + (*edge_detector_map)[3 * res_ix + i] = static_cast(diff_xyb[i]); + } + } + } + } + + void clButteraugliComparator::EdgeDetectorLowFreqOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_ac) + { + static const float kSigma = 14; + static const float kMul = 10; + std::vector > blurred0(xyb0); + std::vector > blurred1(xyb1); + for (int i = 0; i < 3; i++) { + BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma, 0.0); + BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma, 0.0); + } + const int step = 8; + for (size_t y = 0; y + step < ysize_; y += step_) { + int resy = y / step_; + int resx = step / step_; + for (size_t x = 0; x + step < xsize_; x += step_, resx++) { + const int ix = y * xsize_ + x; + const int res_ix = resy * res_xsize_ + resx; + float diff[4][3]; + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize_; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize_ + 6; + diff[2][i] = + 
((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize_ - 6; + diff[3][i] = x < step ? 0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + float max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + float diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulateOpt(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = std::max(max_diff_xyb[i], diff_xyb[i]); + } + } + for (int i = 0; i < 3; ++i) { + (*block_diff_ac)[3 * res_ix + i] += static_cast(kMul * max_diff_xyb[i]); + } + } + } + } + + void clButteraugliComparator::CombineChannelsOpt(const std::vector >& mask_xyb, + const std::vector >& mask_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result) + { + result->resize(res_xsize_ * res_ysize_); + for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { + for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + float mask[3]; + float dc_mask[3]; + for (int i = 0; i < 3; ++i) { + mask[i] = mask_xyb[i][(res_y + 3) * xsize_ + (res_x + 3)]; + dc_mask[i] = mask_xyb_dc[i][(res_y + 3) * xsize_ + (res_x + 3)]; + } + (*result)[res_ix] = static_cast( + DotProductOpt(&block_diff_dc[3 * res_ix], dc_mask) + + DotProductOpt(&block_diff_ac[3 * res_ix], mask) + + DotProductOpt(&edge_detector_map[3 * res_ix], mask)); + } + } + } + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) @@ -119,7 +1559,7 @@ namespace butteraugli _MinSquareVal(square_size, offset, xsize, ysize, values); tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); } - else + else { _MinSquareVal(square_size, offset, xsize, ysize, values); } @@ -189,6 +1629,10 @@ 
namespace butteraugli ); } #endif + else if (MODE_CPU_OPT == g_mathMode) + { + MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc); + } else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); @@ -227,16 +1671,23 @@ namespace butteraugli std::vector > &xyb0, std::vector > &xyb1) { - _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); - if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { + _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), c1[0].data(), c1[1].data(), c1[2].data(), xsize, ysize, xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); } + else if (MODE_CPU_OPT == g_mathMode) + { + MaskHighIntensityChangeOpt(xsize, ysize, c0, c1, xyb0, xyb1); + } + else + { + _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); + } } void ScaleImage(double scale, std::vector *result) @@ -314,7 +1765,11 @@ namespace butteraugli tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize, rgb[0].data(), rgb[1].data(), rgb[2].data()); - } + } + else if (MODE_CPU_OPT == g_mathMode) + { + OpsinDynamicsImageOpt(xsize, ysize, rgb); + } else { _OpsinDynamicsImage(xsize, ysize, rgb); diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h index 23204047..c26de1de 100644 --- a/clguetzli/clbutter_comparator.h +++ b/clguetzli/clbutter_comparator.h @@ -15,26 +15,49 @@ namespace butteraugli { std::vector> &xyb1, std::vector &result); + virtual void DiffmapOpsinDynamicsImageOpt(std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result); + virtual void BlockDiffMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_dc, std::vector* block_diff_ac); + virtual void BlockDiffMapOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac); virtual 
void EdgeDetectorMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* edge_detector_map); + virtual void EdgeDetectorMapOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* edge_detector_map); + virtual void EdgeDetectorLowFreq(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_ac); + virtual void EdgeDetectorLowFreqOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_ac); + virtual void CombineChannels(const std::vector >& scale_xyb, const std::vector >& scale_xyb_dc, const std::vector& block_diff_dc, const std::vector& block_diff_ac, const std::vector& edge_detector_map, std::vector* result); + + virtual void CombineChannelsOpt(const std::vector >& scale_xyb, + const std::vector >& scale_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result); }; void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index 7bd566df..b04d6cc1 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -61,7 +61,6 @@ namespace guetzli void ButteraugliComparatorEx::Compare(const OutputImage& img) { - if (MODE_OPENCL == g_mathMode) { std::vector > rgb1(3, std::vector(width_ * height_)); @@ -124,11 +123,21 @@ namespace guetzli distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); } #endif - else + else if (MODE_CPU_OPT == g_mathMode) { - ButteraugliComparator::Compare(img); + std::vector > rgb0 = rgb_orig_opsin; + + std::vector > rgb(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb); + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb); + std::vector().swap(distmap_); + comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_); + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); } - + else + { + ButteraugliComparator::Compare(img); + 
} } void ButteraugliComparatorEx::StartBlockComparisons() diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index d5c04492..d25f8c80 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -14,6 +14,7 @@ enum MATH_MODE { MODE_CPU = 0, + MODE_CPU_OPT, MODE_OPENCL, MODE_CUDA, MODE_CHECKCL, diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index 63bc4ed1..e5a335c6 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -228,6 +228,7 @@ void Usage() { " the limit. Default limit is %d MB.\n" " --opencl - Use OpenCL\n" " --checkcl - Check OpenCL result\n" + " --c - Use c opt version\n" #ifdef __USE_CUDA__ " --cuda - Use CUDA\n" " --checkcuda - Check CUDA result\n" @@ -270,6 +271,10 @@ int main(int argc, char** argv) { else if (!strcmp(argv[opt_idx], "--checkcl")) { g_mathMode = MODE_CHECKCL; } + else if (!strcmp(argv[opt_idx], "--c")) + { + g_mathMode = MODE_CPU_OPT; + } #ifdef __USE_CUDA__ else if (!strcmp(argv[opt_idx], "--cuda")) { g_mathMode = MODE_CUDA; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 432c62f5..a16fcc36 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -409,53 +409,54 @@ void Processor::ComputeBlockZeroingOrder( memcpy(processed_block, block, sizeof(processed_block)); comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); while (!input_order.empty()) { - float best_err = 1e17f; - int best_i = 0; - for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, - input_order.size()); - ++i) { - coeff_t candidate_block[kBlockSize]; - memcpy(candidate_block, processed_block, sizeof(candidate_block)); - const int idx = input_order[i].first; - candidate_block[idx] = 0; - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &candidate_block[c * kDCTBlockSize]); - } - } - float max_err = 0; - for (int iy = 0; iy < factor_y; ++iy) { - for (int ix = 0; ix < factor_x; ++ix) { - int block_xx = block_x * factor_x + ix; - int block_yy = 
block_y * factor_y + iy; - if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { - float err = static_cast(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask)); - max_err = std::max(max_err, err); - } - } - } - if (max_err < best_err) { - best_err = max_err; - best_i = i; - } - } - int idx = input_order[best_i].first; - processed_block[idx] = 0; - input_order.erase(input_order.begin() + best_i); - output_order->push_back({idx, best_err}); - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &processed_block[c * kDCTBlockSize]); - } - } -#ifdef __USE_C__ - if (best_err >= comparator_->BlockErrorLimit()) - { // err¶ÓÁÐÊÇÖð½¥Ôö´óµÄ£¬Èç¹ûÕâÀïÒѾ­³¬¹ýErrorLimit£¬ºóÐøµÄ¼ÆËã¾ÍÊÇÈßÓàµÄÁË - break; - } -#endif + float best_err = 1e17f; + int best_i = 0; + for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, + input_order.size()); + ++i) { + coeff_t candidate_block[kBlockSize]; + memcpy(candidate_block, processed_block, sizeof(candidate_block)); + const int idx = input_order[i].first; + candidate_block[idx] = 0; + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + img->component(c).SetCoeffBlock( + block_x, block_y, &candidate_block[c * kDCTBlockSize]); + } + } + float max_err = 0; + for (int iy = 0; iy < factor_y; ++iy) { + for (int ix = 0; ix < factor_x; ++ix) { + int block_xx = block_x * factor_x + ix; + int block_yy = block_y * factor_y + iy; + if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { + float err = static_cast(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask)); + max_err = std::max(max_err, err); + } + } + } + if (max_err < best_err) { + best_err = max_err; + best_i = i; + } + } + int idx = input_order[best_i].first; + processed_block[idx] = 0; + input_order.erase(input_order.begin() + best_i); + output_order->push_back({ idx, best_err }); + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + 
img->component(c).SetCoeffBlock( + block_x, block_y, &processed_block[c * kDCTBlockSize]); + } + } + if (MODE_CPU_OPT == g_mathMode) + { + if (best_err >= comparator_->BlockErrorLimit()) + { // err¶ÓÁÐÊÇÖð½¥Ôö´óµÄ£¬Èç¹ûÕâÀïÒѾ­³¬¹ýErrorLimit£¬ºóÐøµÄ¼ÆËã¾ÍÊÇÈßÓàµÄÁË + break; + } + } } // Make the block error values monotonic. float min_err = 1e10; @@ -622,7 +623,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co #endif } - if (MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode) + if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode) { output_order_cpu.resize(num_blocks * kBlockSize); output_order = output_order_cpu.data(); From b67b00d19e7d3f7e7099752fa25a733880b5d629 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Mon, 19 Jun 2017 21:28:17 +0800 Subject: [PATCH 164/189] Modify the flag for creating CUDA context --- clguetzli/ocu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 48f2768a..fddf4e20 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -19,7 +19,7 @@ ocu_args_d_t& getOcu(void) CUcontext ctxt; CUstream stream; - err = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev); + err = cuCtxCreate(&ctxt, CU_CTX_SCHED_AUTO, dev); LOG_CU_RESULT(err); char name[1024]; From c1bc10cba3d87f1c6646cff732dec83a1fec9802 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 20 Jun 2017 19:41:07 +0800 Subject: [PATCH 165/189] Add macro for opencl version --- clguetzli/clbutter_comparator.cpp | 99 ++++++++++++++++++++----------- clguetzli/clguetzli.cl.cpp | 32 ++++++---- clguetzli/clguetzli.cpp | 8 ++- clguetzli/clguetzli.h | 24 ++++---- clguetzli/clguetzli_test.cpp | 6 +- clguetzli/ocl.cpp | 6 +- clguetzli/ocl.h | 3 + guetzli/guetzli.cc | 13 ++++ guetzli/processor.cc | 25 +++++++- 9 files changed, 151 insertions(+), 65 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 58e76e54..e39966b1 
100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -1271,12 +1271,18 @@ namespace butteraugli std::vector> &xyb1, std::vector &result) { - if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100) + if (MODE_CPU_OPT == g_mathMode) + { + DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100) { result.resize(xsize_ * ysize_); clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); } +#endif #ifdef __USE_CUDA__ else if (MODE_CUDA == g_mathMode && xsize_ > 100 && ysize_ > 100) { @@ -1285,10 +1291,6 @@ namespace butteraugli xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); } #endif - else if (MODE_CPU_OPT == g_mathMode) - { - DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result); - } else { ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result); @@ -1302,7 +1304,7 @@ namespace butteraugli std::vector* block_diff_ac) { ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac); - +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), @@ -1310,6 +1312,7 @@ namespace butteraugli xsize_, ysize_, step_, (*block_diff_dc).data(), (*block_diff_ac).data()); } +#endif } void clButteraugliComparator::EdgeDetectorMap(const std::vector > &xyb0, @@ -1317,7 +1320,7 @@ namespace butteraugli std::vector* edge_detector_map) { ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map); - +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), @@ -1325,12 +1328,14 @@ namespace butteraugli xsize_, ysize_, step_, (*edge_detector_map).data()); } +#endif } void 
clButteraugliComparator::EdgeDetectorLowFreq(const std::vector > &xyb0, const std::vector > &xyb1, std::vector* block_diff_ac) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { std::vector orign_ac = *block_diff_ac; @@ -1341,6 +1346,7 @@ namespace butteraugli orign_ac.data(), (*block_diff_ac).data()); } else +#endif { ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); } @@ -1353,6 +1359,7 @@ namespace butteraugli const std::vector& edge_detector_map, std::vector* result) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) { std::vector temp = *result; @@ -1364,6 +1371,7 @@ namespace butteraugli block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]); } else +#endif { ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); } @@ -1551,6 +1559,7 @@ namespace butteraugli void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { std::vector img; @@ -1560,6 +1569,7 @@ namespace butteraugli tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); } else +#endif { _MinSquareVal(square_size, offset, xsize, ysize, values); } @@ -1567,6 +1577,7 @@ namespace butteraugli void Average5x5(int xsize, int ysize, std::vector* diffs) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { std::vector diffs_org = *diffs; @@ -1574,6 +1585,7 @@ namespace butteraugli tclAverage5x5(xsize, ysize, diffs_org, *diffs); } else +#endif { _Average5x5(xsize, ysize, diffs); } @@ -1583,10 +1595,12 @@ namespace butteraugli { _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); } +#endif } void 
Mask(const std::vector > &xyb0, @@ -1595,7 +1609,12 @@ namespace butteraugli std::vector > *mask, std::vector > *mask_dc) { - if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) + if (MODE_CPU_OPT == g_mathMode) + { + MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) { mask->resize(3); mask_dc->resize(3); @@ -1611,6 +1630,16 @@ namespace butteraugli xyb1[0].data(), xyb1[1].data(), xyb1[2].data() ); } + else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); + tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + } +#endif #ifdef __USE_CUDA__ else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100) { @@ -1629,19 +1658,6 @@ namespace butteraugli ); } #endif - else if (MODE_CPU_OPT == g_mathMode) - { - MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc); - } - else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) - { - _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); - tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), - xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), - xsize, ysize, - (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), - (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); - } else { _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); @@ -1652,6 +1668,7 @@ namespace butteraugli const size_t step, std::vector* diffmap) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { std::vector diffmap_org = *diffmap; @@ -1659,6 +1676,7 @@ namespace butteraugli tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); } else +#endif { _CalculateDiffmap(xsize, ysize, step, diffmap); } @@ -1671,6 +1689,7 
@@ namespace butteraugli std::vector > &xyb0, std::vector > &xyb1) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); @@ -1680,7 +1699,9 @@ namespace butteraugli xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); } - else if (MODE_CPU_OPT == g_mathMode) + else +#endif + if (MODE_CPU_OPT == g_mathMode) { MaskHighIntensityChangeOpt(xsize, ysize, c0, c1, xyb0, xyb1); } @@ -1692,6 +1713,7 @@ namespace butteraugli void ScaleImage(double scale, std::vector *result) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && result->size() > 64) { std::vector result_org = *result; @@ -1699,6 +1721,7 @@ namespace butteraugli tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); } else +#endif { _ScaleImage(scale, result); } @@ -1714,15 +1737,18 @@ namespace butteraugli { _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); } +#endif } void Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { std::vector orignChannel; @@ -1732,6 +1758,7 @@ namespace butteraugli tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); } else +#endif { _Blur(xsize, ysize, channel, sigma, border_ratio); } @@ -1740,7 +1767,12 @@ namespace butteraugli void OpsinDynamicsImage(size_t xsize, size_t ysize, std::vector > &rgb) { - if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) + if (MODE_CPU_OPT == g_mathMode) + { + OpsinDynamicsImageOpt(xsize, ysize, rgb); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) { float * r = rgb[0].data(); float * g = 
rgb[1].data(); @@ -1748,6 +1780,15 @@ namespace butteraugli clOpsinDynamicsImage(r, g, b, xsize, ysize); } + else if (MODE_CHECKCL == g_mathMode && xsize > 8 & ysize > 8) + { + std::vector< std::vector> orig_rgb = rgb; + _OpsinDynamicsImage(xsize, ysize, rgb); + tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), + xsize, ysize, + rgb[0].data(), rgb[1].data(), rgb[2].data()); + } +#endif #ifdef __USE_CUDA__ else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100) { @@ -1758,18 +1799,6 @@ namespace butteraugli cuOpsinDynamicsImage(r, g, b, xsize, ysize); } #endif - else if (MODE_CHECKCL == g_mathMode && xsize > 8 & ysize > 8) - { - std::vector< std::vector> orig_rgb = rgb; - _OpsinDynamicsImage(xsize, ysize, rgb); - tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), - xsize, ysize, - rgb[0].data(), rgb[1].data(), rgb[2].data()); - } - else if (MODE_CPU_OPT == g_mathMode) - { - OpsinDynamicsImageOpt(xsize, ysize, rgb); - } else { _OpsinDynamicsImage(xsize, ysize, rgb); diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index b04d6cc1..5b382ae6 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -3,6 +3,8 @@ #include #include "utils.h" +#ifdef __USE_OPENCL__ + using namespace std; int g_idvec[10] = { 0 }; @@ -61,7 +63,19 @@ namespace guetzli void ButteraugliComparatorEx::Compare(const OutputImage& img) { - if (MODE_OPENCL == g_mathMode) + if (MODE_CPU_OPT == g_mathMode) + { + std::vector > rgb0 = rgb_orig_opsin; + + std::vector > rgb(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb); + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb); + std::vector().swap(distmap_); + comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_); + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode) { std::vector > rgb1(3, std::vector(width_ * height_)); img.ToLinearRGB(&rgb1); @@ -92,6 
+106,7 @@ namespace guetzli distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); } +#endif #ifdef __USE_CUDA__ else if (MODE_CUDA == g_mathMode) { @@ -123,17 +138,6 @@ namespace guetzli distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); } #endif - else if (MODE_CPU_OPT == g_mathMode) - { - std::vector > rgb0 = rgb_orig_opsin; - - std::vector > rgb(3, std::vector(width_ * height_)); - img.ToLinearRGB(&rgb); - ::butteraugli::OpsinDynamicsImage(width_, height_, rgb); - std::vector().swap(distmap_); - comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_); - distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); - } else { ButteraugliComparator::Compare(img); @@ -239,4 +243,6 @@ namespace guetzli */ return err; } -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index 8f39fb46..be8e8c10 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -4,12 +4,14 @@ #include #include "cl.hpp" +extern MATH_MODE g_mathMode = MODE_CPU; + +#ifdef __USE_OPENCL__ + #ifdef __USE_DOUBLE_AS_FLOAT__ #define double float #endif -extern MATH_MODE g_mathMode = MODE_CPU; - void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) { size_t channel_size = xsize * ysize * sizeof(float); @@ -827,4 +829,6 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si #ifdef __USE_DOUBLE_AS_FLOAT__ #undef double +#endif + #endif \ No newline at end of file diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index d25f8c80..c01da7a4 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -7,22 +7,24 @@ #include "cuguetzli.h" -#ifdef __USE_DOUBLE_AS_FLOAT__ -#define double float -#endif - enum MATH_MODE { - MODE_CPU = 0, + MODE_CPU = 0, MODE_CPU_OPT, - MODE_OPENCL, - MODE_CUDA, - MODE_CHECKCL, - MODE_CHECKCUDA + MODE_OPENCL, + MODE_CUDA, + MODE_CHECKCL, + MODE_CHECKCUDA }; extern 
MATH_MODE g_mathMode; +#ifdef __USE_OPENCL__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + void clOpsinDynamicsImage( float *r, float *g, float *b, const size_t xsize, const size_t ysize); @@ -174,4 +176,6 @@ namespace guetzli { std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount std::vector> rgb_orig_opsin; }; -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 9cb4007d..b5fa50c5 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -7,6 +7,8 @@ #include "ocl.h" #include "ocu.h" +#ifdef __USE_OPENCL__ + #define FLOAT_COMPARE(a, b, c) floatCompare((a), (b), (c), __FUNCTION__, __LINE__ ) int floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line) @@ -446,4 +448,6 @@ void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_ err = clFinish(ocl.commandQueue); ocl.releaseMemChannels(rgb); -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index 639ad68e..f4427fff 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -2,6 +2,8 @@ #include #include +#ifdef __USE_OPENCL__ + ocl_args_d_t& getOcl(void) { static bool bInit = false; @@ -543,4 +545,6 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) } return CL_SUCCESS; -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index a9573fa6..f182bb88 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -4,6 +4,8 @@ #include "utils.h" #include "clguetzli.cl.h" +#ifdef __USE_OPENCL__ + // Macros for OpenCL versions #define OPENCL_VERSION_1_2 1.2f #define OPENCL_VERSION_2_0 2.0f @@ -61,3 +63,4 @@ struct ocl_args_d_t float compilerVersion; // hold the device OpenCL C version (default. 
1.2) }; +#endif diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index e5a335c6..c972d391 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -29,6 +29,9 @@ #include "guetzli/quality.h" #include "guetzli/stats.h" #include "clguetzli/clguetzli.h" +#ifdef __USE_GPERFTOOLS__ +#include +#endif namespace { @@ -226,8 +229,10 @@ void Usage() { " Default value is %d.\n" " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. Default limit is %d MB.\n" +#ifdef __USE_OPENCL__ " --opencl - Use OpenCL\n" " --checkcl - Check OpenCL result\n" +#endif " --c - Use c opt version\n" #ifdef __USE_CUDA__ " --cuda - Use CUDA\n" @@ -240,6 +245,9 @@ void Usage() { } // namespace int main(int argc, char** argv) { +#ifdef __USE_GPERFTOOLS__ + ProfilerStart("guetzli.prof"); +#endif std::set_terminate(TerminateHandler); int verbose = 0; @@ -265,12 +273,14 @@ int main(int argc, char** argv) { } else if (!strcmp(argv[opt_idx], "--nomemlimit")) { memlimit_mb = -1; } +#ifdef __USE_OPENCL__ else if (!strcmp(argv[opt_idx], "--opencl")) { g_mathMode = MODE_OPENCL; } else if (!strcmp(argv[opt_idx], "--checkcl")) { g_mathMode = MODE_CHECKCL; } +#endif else if (!strcmp(argv[opt_idx], "--c")) { g_mathMode = MODE_CPU_OPT; @@ -351,5 +361,8 @@ int main(int argc, char** argv) { } WriteFileOrDie(argv[opt_idx + 1], out_data); +#ifdef __USE_GPERFTOOLS__ + ProfilerStop(); +#endif return 0; } diff --git a/guetzli/processor.cc b/guetzli/processor.cc index a16fcc36..3d39da02 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -569,11 +569,14 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co std::vector output_order_gpu; std::vector output_order_cpu; - CoeffData * output_order = NULL; - ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; + + CoeffData * output_order = NULL; if (MODE_OPENCL == g_mathMode || MODE_CUDA == g_mathMode) { +#ifdef __USE_OPENCL__ + ButteraugliComparatorEx * comp = 
(ButteraugliComparatorEx*)comparator_; + channel_info orig_channel[3]; channel_info mayout_channel[3]; @@ -606,6 +609,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co comp_mask, comp->BlockErrorLimit()); } +#endif #ifdef __USE_CUDA__ else { @@ -622,8 +626,11 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co } #endif } - +#ifdef __USE_OPENCL__ if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode) +#else + if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode) +#endif { output_order_cpu.resize(num_blocks * kBlockSize); output_order = output_order_cpu.data(); @@ -1038,9 +1045,15 @@ bool Process(const Params& params, ProcessStats* stats, } std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { +#ifdef __USE_OPENCL__ comparator.reset( new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); +#else + comparator.reset( + new ButteraugliComparator(jpg.width, jpg.height, &rgb, + params.butteraugli_target, stats)); +#endif } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); *jpg_out = out.jpeg_data; @@ -1062,9 +1075,15 @@ bool Process(const Params& params, ProcessStats* stats, } std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { +#ifdef __USE_OPENCL__ comparator.reset( new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); +#else + comparator.reset( + new ButteraugliComparator(jpg.width, jpg.height, &rgb, + params.butteraugli_target, stats)); +#endif } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); *jpg_out = out.jpeg_data; From 66a8d9f0644371d62f47720f5a7332b4fde2a1fa Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Wed, 21 Jun 2017 17:28:03 +0800 Subject: [PATCH 166/189] Add simple cuda memory pool --- clguetzli/clguetzli.cl.cpp | 14 +- clguetzli/cuguetzli.cpp | 342 
++++++++++++++++++------------------- clguetzli/ocu.cpp | 20 +-- clguetzli/ocu.h | 5 + guetzli.vcxproj | 9 +- guetzli.vcxproj.filters | 6 + 6 files changed, 204 insertions(+), 192 deletions(-) diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index 5b382ae6..f29a283c 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -119,11 +119,11 @@ namespace guetzli distmap_.resize(xsize * ysize); size_t channel_size = xsize * ysize * sizeof(float); - ocu_args_d_t &ocl = getOcu(); - ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); - ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); + ocu_args_d_t &ocu = getOcu(); + ocu_channels xyb0 = ocu.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); + ocu_channels xyb1 = ocu.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); - cu_mem mem_result = ocl.allocMem(channel_size); + cu_mem mem_result = ocu.allocMem(channel_size); cuOpsinDynamicsImageEx(xyb1, xsize, ysize); @@ -131,9 +131,9 @@ namespace guetzli cuMemcpyDtoH(distmap_.data(), mem_result, channel_size); - cuMemFree(mem_result); - ocl.releaseMemChannels(xyb0); - ocl.releaseMemChannels(xyb1); + ocu.releaseMem(mem_result); + ocu.releaseMemChannels(xyb0); + ocu.releaseMemChannels(xyb1); distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); } diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 3b8c2835..1903c6eb 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -18,17 +18,17 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons { size_t channel_size = xsize * ysize * sizeof(float); - ocu_args_d_t &ocl = getOcu(); - ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb = 
ocu.allocMemChannels(channel_size, r, g, b); cuOpsinDynamicsImageEx(rgb, xsize, ysize); - cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocl.commandQueue); - cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocl.commandQueue); - cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocl.commandQueue); - cuFinish(ocl.commandQueue); + cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); - ocl.releaseMemChannels(rgb); + ocu.releaseMemChannels(rgb); } void cuDiffmapOpsinDynamicsImage( @@ -40,20 +40,20 @@ void cuDiffmapOpsinDynamicsImage( { size_t channel_size = xsize * ysize * sizeof(float); - ocu_args_d_t &ocl = getOcu(); - ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); - ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + ocu_args_d_t &ocu = getOcu(); + ocu_channels xyb0 = ocu.allocMemChannels(channel_size, r, g, b); + ocu_channels xyb1 = ocu.allocMemChannels(channel_size, r2, g2, b2); - cu_mem mem_result = ocl.allocMem(channel_size, result); + cu_mem mem_result = ocu.allocMem(channel_size, result); cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step); cuMemcpyDtoH(result, mem_result, channel_size); - ocl.releaseMemChannels(xyb1); - ocl.releaseMemChannels(xyb0); + ocu.releaseMemChannels(xyb1); + ocu.releaseMemChannels(xyb0); - cuMemFree(mem_result); + ocu.releaseMem(mem_result); } void cuComputeBlockZeroingOrder( @@ -75,7 +75,7 @@ void cuComputeBlockZeroingOrder( using namespace guetzli; - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); cu_mem mem_orig_coeff[3]; cu_mem mem_mayout_coeff[3]; @@ -83,20 +83,20 @@ void cuComputeBlockZeroingOrder( for (int c = 0; c < 3; c++) { int block_count = orig_channel[c].block_width * orig_channel[c].block_height; - mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + 
mem_orig_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; - mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + mem_mayout_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); - mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + mem_mayout_pixel[c] = ocu.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); } - cu_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); - cu_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + cu_mem mem_orig_image = ocu.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + cu_mem mem_mask_scale = ocu.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; - cu_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); + cu_mem mem_output_order_batch = ocu.allocMem(output_order_batch_size, output_order_batch); - CUfunction kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; + CUfunction kernel = ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], &mem_orig_image, &mem_mask_scale, &blockf_width, &blockf_height, @@ -113,24 +113,24 @@ void cuComputeBlockZeroingOrder( BLOCK_COUNT_X(blockf_width), BLOCK_COUNT_Y(blockf_height), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); 
cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); for (int c = 0; c < 3; c++) { - cuMemFree(mem_orig_coeff[c]); - cuMemFree(mem_mayout_coeff[c]); - cuMemFree(mem_mayout_pixel[c]); + ocu.releaseMem(mem_orig_coeff[c]); + ocu.releaseMem(mem_mayout_coeff[c]); + ocu.releaseMem(mem_mayout_pixel[c]); } - cuMemFree(mem_orig_image); - cuMemFree(mem_mask_scale); - cuMemFree(mem_output_order_batch); + ocu.releaseMem(mem_orig_image); + ocu.releaseMem(mem_mask_scale); + ocu.releaseMem(mem_output_order_batch); } void cuMask( @@ -140,29 +140,29 @@ void cuMask( const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); size_t channel_size = xsize * ysize * sizeof(float); - ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); - ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); - ocu_channels mask = ocl.allocMemChannels(channel_size); - ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + ocu_channels rgb2 = ocu.allocMemChannels(channel_size, r2, g2, b2); + ocu_channels mask = ocu.allocMemChannels(channel_size); + ocu_channels mask_dc = ocu.allocMemChannels(channel_size); cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); - cuMemcpyDtoHAsync(mask_r, mask.r, channel_size, ocl.commandQueue); - cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocl.commandQueue); - cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocl.commandQueue); - cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocl.commandQueue); - cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocl.commandQueue); - cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocl.commandQueue); - cuFinish(ocl.commandQueue); - - ocl.releaseMemChannels(rgb); - ocl.releaseMemChannels(rgb2); - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); + cuMemcpyDtoHAsync(mask_r, mask.r, 
channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); + + ocu.releaseMemChannels(rgb); + ocu.releaseMemChannels(rgb2); + ocu.releaseMemChannels(mask); + ocu.releaseMemChannels(mask_dc); } void cuDiffmapOpsinDynamicsImageEx( @@ -178,11 +178,11 @@ void cuDiffmapOpsinDynamicsImageEx( size_t channel_size = xsize * ysize * sizeof(float); size_t channel_step_size = res_xsize * res_ysize * sizeof(float); - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); - cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); - cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); + cu_mem edge_detector_map = ocu.allocMem(3 * channel_step_size); + cu_mem block_diff_dc = ocu.allocMem(3 * channel_step_size); + cu_mem block_diff_ac = ocu.allocMem(3 * channel_step_size); cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); @@ -190,20 +190,20 @@ void cuDiffmapOpsinDynamicsImageEx( cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); { - ocu_channels mask = ocl.allocMemChannels(channel_size); - ocu_channels mask_dc = ocl.allocMemChannels(channel_size); + ocu_channels mask = ocu.allocMemChannels(channel_size); + ocu_channels mask_dc = ocu.allocMemChannels(channel_size); cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); cuCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); - ocl.releaseMemChannels(mask); - ocl.releaseMemChannels(mask_dc); + ocu.releaseMemChannels(mask); + 
ocu.releaseMemChannels(mask_dc); } cuCalculateDiffmapEx(result, xsize, ysize, step); - cuMemFree(edge_detector_map); - cuMemFree(block_diff_dc); - cuMemFree(block_diff_ac); + ocu.releaseMem(edge_detector_map); + ocu.releaseMem(block_diff_dc); + ocu.releaseMem(block_diff_ac); } void cuConvolutionEx( @@ -212,20 +212,20 @@ void cuConvolutionEx( const cu_mem multipliers, size_t len, int xstep, int offset, float border_ratio) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); size_t oxsize = (xsize + xstep - 1) / xstep; - CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTION]; + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTION]; const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio }; CUresult err = cuLaunchKernel(kernel, oxsize, ysize, 1, 1, 1, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } @@ -236,18 +236,18 @@ void cuConvolutionXEx( const cu_mem multipliers, size_t len, int xstep, int offset, float border_ratio) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTIONX]; const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; CUresult err = cuLaunchKernel(kernel, BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } @@ -257,18 +257,18 @@ void cuConvolutionYEx( const cu_mem multipliers, size_t len, int xstep, int offset, float border_ratio) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; + CUfunction kernel 
= ocu.kernel[KERNEL_CONVOLUTIONY]; const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; CUresult err = cuLaunchKernel(kernel, BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } @@ -277,18 +277,18 @@ void cuSquareSampleEx( const cu_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - CUfunction kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; + CUfunction kernel = ocu.kernel[KERNEL_SQUARESAMPLE]; const void *args[] = { &result, &xsize, &ysize, &image, &xstep, &ystep }; CUresult err = cuLaunchKernel(kernel, BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } @@ -308,26 +308,26 @@ void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize, const int xstep = std::max(1, int(sigma / 3)); - ocu_args_d_t &ocl = getOcu(); - cu_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); + ocu_args_d_t &ocu = getOcu(); + cu_mem mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); if (xstep > 1) { - cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize); cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); cuSquareSampleEx(result ? result : image, result ? 
result : image, xsize, ysize, xstep, xstep); - cuMemFree(m); + ocu.releaseMem(m); } else { - cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize); cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); - cuMemFree(m); + ocu.releaseMem(m); } - cuMemFree(mem_expn); + ocu.releaseMem(mem_expn); } void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) @@ -336,8 +336,8 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t size_t channel_size = xsize * ysize * sizeof(float); - ocu_args_d_t &ocl = getOcu(); - ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size); + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size); const int size = xsize * ysize; @@ -345,7 +345,7 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); - CUfunction kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; + CUfunction kernel = ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE]; const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; CUresult err = cuLaunchKernel(kernel, @@ -354,12 +354,12 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t (size + 511) / 512, 1, 1, 512, 1, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); - ocl.releaseMemChannels(rgb_blurred); + ocu.releaseMemChannels(rgb_blurred); } void cuMaskHighIntensityChangeEx( @@ -369,20 +369,20 @@ void cuMaskHighIntensityChangeEx( { size_t channel_size = xsize * ysize * 
sizeof(float); - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - ocu_channels c0 = ocl.allocMemChannels(channel_size); - ocu_channels c1 = ocl.allocMemChannels(channel_size); + ocu_channels c0 = ocu.allocMemChannels(channel_size); + ocu_channels c1 = ocu.allocMemChannels(channel_size); - cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.commandQueue); - cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.commandQueue); - cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.commandQueue); - cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.commandQueue); - cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.commandQueue); - cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.commandQueue); - cuFinish(ocl.commandQueue); + cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); - CUfunction kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; + CUfunction kernel = ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b, &xsize, &ysize, @@ -394,13 +394,13 @@ void cuMaskHighIntensityChangeEx( BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); - ocl.releaseMemChannels(c0); - ocl.releaseMemChannels(c1); + ocu.releaseMemChannels(c0); + ocu.releaseMemChannels(c1); } void cuEdgeDetectorMapEx( @@ -410,10 +410,10 @@ void cuEdgeDetectorMapEx( { size_t channel_size = xsize * ysize * sizeof(float); - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu 
= getOcu(); - ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); - ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + ocu_channels rgb_blured = ocu.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size); static const double kSigma[3] = { 1.5, 0.586, 0.4 }; @@ -426,7 +426,7 @@ void cuEdgeDetectorMapEx( const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; + CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTOR]; const void *args[] = { &result, &res_xsize, &res_ysize, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, @@ -437,13 +437,13 @@ void cuEdgeDetectorMapEx( BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); - ocl.releaseMemChannels(rgb_blured); - ocl.releaseMemChannels(rgb2_blured); + ocu.releaseMemChannels(rgb_blured); + ocu.releaseMemChannels(rgb2_blured); } void cuBlockDiffMapEx( @@ -452,12 +452,12 @@ void cuBlockDiffMapEx( const ocu_channels &rgb, const ocu_channels &rgb2, const size_t xsize, const size_t ysize, const size_t step) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - CUfunction kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; + CUfunction kernel = ocu.kernel[KERNEL_BLOCKDIFFMAP]; const void *args[] = { &block_diff_dc, &block_diff_ac, &res_xsize, &res_ysize, &rgb.r, &rgb.g, &rgb.b, @@ -468,9 +468,9 @@ void cuBlockDiffMapEx( BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = 
cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } @@ -483,9 +483,9 @@ void cuEdgeDetectorLowFreqEx( static const double kSigma = 14; - ocu_args_d_t &ocl = getOcu(); - ocu_channels rgb_blured = ocl.allocMemChannels(channel_size); - ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size); + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blured = ocu.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size); for (int i = 0; i < 3; i++) { @@ -496,7 +496,7 @@ void cuEdgeDetectorLowFreqEx( const size_t res_xsize = (xsize + step - 1) / step; const size_t res_ysize = (ysize + step - 1) / step; - CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; + CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ]; const void *args[] = { &block_diff_ac, &res_xsize, &res_ysize, &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, @@ -508,13 +508,13 @@ void cuEdgeDetectorLowFreqEx( BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); - ocl.releaseMemChannels(rgb_blured); - ocl.releaseMemChannels(rgb2_blured); + ocu.releaseMemChannels(rgb_blured); + ocu.releaseMemChannels(rgb2_blured); } void cuDiffPrecomputeEx( @@ -522,9 +522,9 @@ void cuDiffPrecomputeEx( const ocu_channels &xyb0, const ocu_channels &xyb1, const size_t xsize, const size_t ysize) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - CUfunction kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; + CUfunction kernel = ocu.kernel[KERNEL_DIFFPRECOMPUTE]; const void *args[] = { &mask.x, &mask.y, &mask.b, &xsize, &ysize, &xyb0.x, &xyb0.y, &xyb0.b, @@ -534,18 +534,18 @@ void cuDiffPrecomputeEx( BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, 
NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); float fw = w; - CUfunction kernel = ocl.kernel[KERNEL_SCALEIMAGE]; + CUfunction kernel = ocu.kernel[KERNEL_SCALEIMAGE]; const void *args[] = { &img, &size, &fw }; CUresult err = cuLaunchKernel(kernel, @@ -554,9 +554,9 @@ void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) // BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, 512, 1, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } @@ -567,26 +567,26 @@ void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize return; } - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); size_t len = xsize * ysize * sizeof(float); - cu_mem img_org = ocl.allocMem(len); + cu_mem img_org = ocu.allocMem(len); cuMemcpyDtoD(img_org, img, len); - CUfunction kernel = ocl.kernel[KERNEL_AVERAGE5X5]; + CUfunction kernel = ocu.kernel[KERNEL_AVERAGE5X5]; const void *args[] = { &img, &xsize, &ysize, &img_org }; CUresult err = cuLaunchKernel(kernel, BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); - cuMemFree(img_org); + ocu.releaseMem(img_org); } void cuMinSquareValEx( @@ -594,23 +594,23 @@ void cuMinSquareValEx( const size_t xsize, const size_t ysize, const size_t square_size, const size_t offset) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - cu_mem result = ocl.allocMem(sizeof(float) * xsize * ysize); + cu_mem result = 
ocu.allocMem(sizeof(float) * xsize * ysize); - CUfunction kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + CUfunction kernel = ocu.kernel[KERNEL_MINSQUAREVAL]; const void *args[] = { &result, &xsize, &ysize, &img, &square_size, &offset }; CUresult err = cuLaunchKernel(kernel, BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); cuMemcpyDtoD(img, result, sizeof(float) * xsize * ysize); - cuMemFree(result); + ocu.releaseMem(result); } static void MakeMask(double extmul, double extoff, @@ -629,7 +629,7 @@ static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); double extmul = 0.975741017749; double extoff = -4.25328244168; @@ -710,10 +710,10 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz } size_t channel_size = 512 * sizeof(double); - ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); - ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); + ocu_channels xyb = ocu.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocu_channels xyb_dc = ocu.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); - CUfunction kernel = ocl.kernel[KERNEL_DOMASK]; + CUfunction kernel = ocu.kernel[KERNEL_DOMASK]; const void *args[] = { &mask.r, &mask.g, &mask.b, &xsize, &ysize, &mask_dc.r, &mask_dc.g, &mask_dc.b, @@ -724,13 +724,13 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - 
err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); - ocl.releaseMemChannels(xyb); - ocl.releaseMemChannels(xyb_dc); + ocu.releaseMemChannels(xyb); + ocu.releaseMemChannels(xyb_dc); } void cuMaskEx( @@ -773,12 +773,12 @@ void cuCombineChannelsEx( const size_t res_xsize, const size_t step) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; - CUfunction kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; + CUfunction kernel = ocu.kernel[KERNEL_COMBINECHANNELS]; const void *args[] = { &result, &mask.r, &mask.g, &mask.b, &mask_dc.r, &mask_dc.g, &mask_dc.b, @@ -792,19 +792,19 @@ void cuCombineChannelsEx( work_xsize, work_ysize, 1, 1, 1, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); - cu_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); + cu_mem diffmap_out = ocu.allocMem(xsize * ysize * sizeof(float)); - CUfunction kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; + CUfunction kernel = ocu.kernel[KERNEL_UPSAMPLESQUAREROOT]; const void *args[] = { &diffmap_out, &diffmap, &xsize, &ysize, &step }; const size_t res_xsize = (xsize + step - 1) / step; @@ -814,18 +814,18 @@ void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysi res_xsize, res_ysize, 1, 1, 1, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); - cuMemFree(diffmap_out); + 
ocu.releaseMem(diffmap_out); } void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); int cls = 8 - step; int cls2 = (8 - step) / 2; @@ -833,35 +833,35 @@ void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const siz int out_xsize = xsize - cls; int out_ysize = ysize - cls; - CUfunction kernel = ocl.kernel[KERNEL_REMOVEBORDER]; + CUfunction kernel = ocu.kernel[KERNEL_REMOVEBORDER]; const void *args[] = { &out, &out_xsize, &out_ysize, &in, &cls, &cls2 }; CUresult err = cuLaunchKernel(kernel, BLOCK_COUNT_X(out_xsize), BLOCK_COUNT_Y(out_ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in) { - ocu_args_d_t &ocl = getOcu(); + ocu_args_d_t &ocu = getOcu(); int cls = 8 - step; int cls2 = (8 - step) / 2; - CUfunction kernel = ocl.kernel[KERNEL_ADDBORDER]; + CUfunction kernel = ocu.kernel[KERNEL_ADDBORDER]; const void *args[] = { &out, &xsize, &ysize, &cls, &cls2, &in }; CUresult err = cuLaunchKernel(kernel, BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, 0, - ocl.commandQueue, (void**)args, NULL); + ocu.commandQueue, (void**)args, NULL); LOG_CU_RESULT(err); - err = cuFinish(ocl.commandQueue); + err = cuFinish(ocu.commandQueue); LOG_CU_RESULT(err); } @@ -876,8 +876,8 @@ void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const si const int s = 8 - step; int s2 = (8 - step) / 2; - ocu_args_d_t &ocl = getOcu(); - cu_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + ocu_args_d_t &ocu = getOcu(); + cu_mem blurred = ocu.allocMem((xsize - s) * (ysize - s) * sizeof(float)); cuRemoveBorderEx(blurred, diffmap, 
xsize, ysize, step); static const double border_ratio = 0.03027655136; @@ -886,7 +886,7 @@ void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const si cuAddBorderEx(diffmap, xsize, ysize, step, blurred); cuScaleImageEx(diffmap, xsize * ysize, scale); - cuMemFree(blurred); + ocu.releaseMem(blurred); } #ifdef __USE_DOUBLE_AS_FLOAT__ diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index fddf4e20..7ebc0ac1 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -108,6 +108,7 @@ else ocu.commandQueue = stream; ocu.mod = mod; ocu.ctxt = ctxt; + ocu.mem_pool.commandQueue = ocu.commandQueue; return ocu; } @@ -125,23 +126,18 @@ ocu_args_d_t::~ocu_args_d_t() { cuModuleUnload(mod); cuCtxDestroy(ctxt); + mem_pool.drain(); // cuStreamDestroy(commandQueue); } cu_mem ocu_args_d_t::allocMem(size_t s, const void *init) { - cu_mem mem; - cuMemAlloc(&mem, s); - if (init) - { - cuMemcpyHtoDAsync(mem, init, s, commandQueue); - } - else - { - cuMemsetD8Async(mem, 0, s, commandQueue); - } + return mem_pool.allocMem(s, init); +} - return mem; +void ocu_args_d_t::releaseMem(cu_mem mem) +{ + mem_pool.releaseMem(mem); } ocu_channels ocu_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2) @@ -161,7 +157,7 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb) { for (int i = 0; i < 3; i++) { - cuMemFree(rgb.ch[i]); + releaseMem(rgb.ch[i]); rgb.ch[i] = NULL; } } diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h index dbc42916..e8697d2d 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -4,6 +4,7 @@ #include #include "ocl.h" +#include "cumem_pool.h" #define LOG_CU_RESULT(e) if (CUDA_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateCUDAError((e)));} @@ -19,6 +20,7 @@ struct ocu_args_d_t ~ocu_args_d_t(); cu_mem allocMem(size_t s, const void *init = NULL); + void releaseMem(cu_mem mem); ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); 
void releaseMemChannels(ocu_channels &rgb); @@ -27,6 +29,9 @@ struct ocu_args_d_t CUmodule mod; CUcontext ctxt; CUdevice dev; + ocu_mem_pool_t mem_pool; }; + + #endif \ No newline at end of file diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 86da4aa7..4fa6af4d 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -144,7 +144,7 @@ false false true - __USE_CUDA__;PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) + __USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) Console @@ -162,6 +162,9 @@ CUDA CU + + 3 + @@ -188,7 +191,7 @@ .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled - PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions) + PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) Console @@ -206,6 +209,7 @@ + @@ -305,6 +309,7 @@ + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 38921bde..1cbb6a30 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -318,6 +318,9 @@ clguetzli + + clguetzli + @@ -593,6 +596,9 @@ clguetzli + + clguetzli + From e11a712ec5d6ff9dcc0070cb35a282b8c35dbba9 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Wed, 21 Jun 2017 17:41:12 +0800 Subject: [PATCH 167/189] Add missing files --- clguetzli/cumem_pool.cpp | 120 +++++++++++++++++++++++++++++++++++++++ clguetzli/cumem_pool.h | 37 ++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 clguetzli/cumem_pool.cpp create mode 100644 clguetzli/cumem_pool.h diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp new file mode 100644 index 00000000..706ff4ba --- /dev/null +++ b/clguetzli/cumem_pool.cpp @@ -0,0 +1,120 @@ +#include "cumem_pool.h" + +#ifdef __USE_CUDA__ + +bool compare_size(const ocu_mem_block_t& first, const ocu_mem_block_t& second) +{ + return (first.size < second.size); +} + 
+ocu_mem_pool_t::ocu_mem_pool_t() + :alloc_count(0) +{ + +} + +ocu_mem_pool_t::~ocu_mem_pool_t() +{ + +} + +cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init) +{ + alloc_count++; + ocu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + { + ocu_mem_block_t *block = &(*iter); + if (block->status == 0 && block->size >= s) { + block_candidate = block; + break; + } + } + cu_mem mem = NULL; + if (block_candidate != NULL) { + block_candidate->status = 1; + block_candidate->used = s; + + mem = block_candidate->mem; + //LogError("mem_pool reuse mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used); + } + else { + cu_mem new_mem; + cuMemAlloc(&new_mem, s); + ocu_mem_block_t mem_block; + mem_block.size = s; + mem_block.used = s; + mem_block.mem = new_mem; + mem_block.status = 1; + mem_pool.push_back(mem_block); + mem_pool.sort(compare_size); + + mem = new_mem; + //LogError("mem_pool new mem:%lld, used:%lld.\r\n", mem_block.size, mem_block.used); + } + if (init) + { + cuMemcpyHtoDAsync(mem, init, s, commandQueue); + } + else + { + cuMemsetD8Async(mem, 0, s, commandQueue); + } + + return mem; + + //cu_mem mem; + //cuMemAlloc(&mem, s); + //if (init) + //{ + // cuMemcpyHtoDAsync(mem, init, s, commandQueue); + //} + //else + //{ + // cuMemsetD8Async(mem, 0, s, commandQueue); + //} + + //return mem; +} + +void ocu_mem_pool_t::releaseMem(cu_mem mem) +{ + ocu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + { + ocu_mem_block_t *block = &(*iter); + if (block->mem == mem) { + block_candidate = block; + break; + } + } + if (block_candidate != NULL) { + block_candidate->status = 0; + block_candidate->used = 0; + } + else { + cuMemFree(mem); + LogError("mem_pool release mem:%lld can not be found.\r\n", mem); + } + + //LogError("mem_pool release mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used); 
+} + +void ocu_mem_pool_t::drain() +{ + size_t total_mem = 0; + size_t total_block = mem_pool.size(); + ocu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + { + if (iter->status == 0) { + total_mem += iter->size; + cuMemFree(iter->mem); + iter = mem_pool.erase(iter); + } + } + + LogError("mem_pool has %u blocks, and total memory is:%f kb, total alloc count:%d.\r\n", total_block, (float)(total_mem) / 1024, alloc_count); +} + +#endif \ No newline at end of file diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h new file mode 100644 index 00000000..2abbb69d --- /dev/null +++ b/clguetzli/cumem_pool.h @@ -0,0 +1,37 @@ +#pragma once + +#ifdef __USE_CUDA__ + +#include +#include +#include "ocl.h" + +struct ocu_mem_block_t +{ + ocu_mem_block_t() + :status(0) + , used(0) + {} + ~ocu_mem_block_t() + {} + + int status; + size_t size; + size_t used; + cu_mem mem; +}; + +struct ocu_mem_pool_t +{ + ocu_mem_pool_t(); + ~ocu_mem_pool_t(); + cu_mem allocMem(size_t s, const void *init = NULL); + void releaseMem(cu_mem mem); + void drain(); + + std::list mem_pool; + CUstream commandQueue; + size_t alloc_count; +}; + +#endif \ No newline at end of file From 36a3ce62517aad0db5f92fa8c23bb7bea86bd14f Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Wed, 21 Jun 2017 19:26:45 +0800 Subject: [PATCH 168/189] Clean code --- clguetzli/cumem_pool.cpp | 53 ++++++++++++++-------------------------- clguetzli/cumem_pool.h | 15 ++++++------ clguetzli/ocu.h | 2 +- 3 files changed, 28 insertions(+), 42 deletions(-) diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp index 706ff4ba..50d96eb7 100644 --- a/clguetzli/cumem_pool.cpp +++ b/clguetzli/cumem_pool.cpp @@ -2,29 +2,31 @@ #ifdef __USE_CUDA__ -bool compare_size(const ocu_mem_block_t& first, const ocu_mem_block_t& second) +bool compare_size(const cu_mem_block_t& first, const cu_mem_block_t& second) { return (first.size < second.size); 
} -ocu_mem_pool_t::ocu_mem_pool_t() - :alloc_count(0) +cu_mem_pool_t::cu_mem_pool_t() + : alloc_count(0) + , total_mem_request(0) { } -ocu_mem_pool_t::~ocu_mem_pool_t() +cu_mem_pool_t::~cu_mem_pool_t() { } -cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init) +cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init) { alloc_count++; - ocu_mem_block_t *block_candidate = NULL; - for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + total_mem_request += s; + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) { - ocu_mem_block_t *block = &(*iter); + cu_mem_block_t *block = &(*iter); if (block->status == 0 && block->size >= s) { block_candidate = block; break; @@ -36,12 +38,11 @@ cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init) block_candidate->used = s; mem = block_candidate->mem; - //LogError("mem_pool reuse mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used); } else { cu_mem new_mem; cuMemAlloc(&new_mem, s); - ocu_mem_block_t mem_block; + cu_mem_block_t mem_block; mem_block.size = s; mem_block.used = s; mem_block.mem = new_mem; @@ -50,7 +51,6 @@ cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init) mem_pool.sort(compare_size); mem = new_mem; - //LogError("mem_pool new mem:%lld, used:%lld.\r\n", mem_block.size, mem_block.used); } if (init) { @@ -62,27 +62,14 @@ cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init) } return mem; - - //cu_mem mem; - //cuMemAlloc(&mem, s); - //if (init) - //{ - // cuMemcpyHtoDAsync(mem, init, s, commandQueue); - //} - //else - //{ - // cuMemsetD8Async(mem, 0, s, commandQueue); - //} - - //return mem; } -void ocu_mem_pool_t::releaseMem(cu_mem mem) +void cu_mem_pool_t::releaseMem(cu_mem mem) { - ocu_mem_block_t *block_candidate = NULL; - for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator 
iter = mem_pool.begin(); iter != mem_pool.end(); iter++) { - ocu_mem_block_t *block = &(*iter); + cu_mem_block_t *block = &(*iter); if (block->mem == mem) { block_candidate = block; break; @@ -96,16 +83,14 @@ void ocu_mem_pool_t::releaseMem(cu_mem mem) cuMemFree(mem); LogError("mem_pool release mem:%lld can not be found.\r\n", mem); } - - //LogError("mem_pool release mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used); } -void ocu_mem_pool_t::drain() +void cu_mem_pool_t::drain() { size_t total_mem = 0; size_t total_block = mem_pool.size(); - ocu_mem_block_t *block_candidate = NULL; - for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) { if (iter->status == 0) { total_mem += iter->size; @@ -114,7 +99,7 @@ void ocu_mem_pool_t::drain() } } - LogError("mem_pool has %u blocks, and total memory is:%f kb, total alloc count:%d.\r\n", total_block, (float)(total_mem) / 1024, alloc_count); + LogError("mem_pool has %u blocks, and total pool memory is:%f kb, total memory request:%f kb, total alloc count:%d.\r\n", total_block, (float)(total_mem) / 1024, (float)(total_mem_request) / 1024, alloc_count); } #endif \ No newline at end of file diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h index 2abbb69d..73355e82 100644 --- a/clguetzli/cumem_pool.h +++ b/clguetzli/cumem_pool.h @@ -6,13 +6,13 @@ #include #include "ocl.h" -struct ocu_mem_block_t +struct cu_mem_block_t { - ocu_mem_block_t() + cu_mem_block_t() :status(0) , used(0) {} - ~ocu_mem_block_t() + ~cu_mem_block_t() {} int status; @@ -21,17 +21,18 @@ struct ocu_mem_block_t cu_mem mem; }; -struct ocu_mem_pool_t +struct cu_mem_pool_t { - ocu_mem_pool_t(); - ~ocu_mem_pool_t(); + cu_mem_pool_t(); + ~cu_mem_pool_t(); cu_mem allocMem(size_t s, const void *init = NULL); void releaseMem(cu_mem mem); void drain(); - std::list mem_pool; + std::list 
mem_pool; CUstream commandQueue; size_t alloc_count; + size_t total_mem_request; }; #endif \ No newline at end of file diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h index e8697d2d..1c13e86e 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -29,7 +29,7 @@ struct ocu_args_d_t CUmodule mod; CUcontext ctxt; CUdevice dev; - ocu_mem_pool_t mem_pool; + cu_mem_pool_t mem_pool; }; From e42fdaba25efaf867a1b8bd5c7cc7620ca41b815 Mon Sep 17 00:00:00 2001 From: strongtu Date: Wed, 21 Jun 2017 17:43:28 +0800 Subject: [PATCH 169/189] Modify makefile --- guetzli.make | 12 ++++++++---- guetzli_static.make | 4 ++++ premake5.lua | 4 ++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/guetzli.make b/guetzli.make index 3675ba0d..b40f6f4b 100644 --- a/guetzli.make +++ b/guetzli.make @@ -15,14 +15,14 @@ ifeq ($(config),release) TARGETDIR = bin/Release TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli - DEFINES += -D__USE_CUDA__ + DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__ INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL -lcuda + LIBS += -lOpenCL -lcuda -lprofiler -lunwind LDDEPS += ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) @@ -42,14 +42,14 @@ ifeq ($(config),debug) TARGETDIR = bin/Debug TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Debug/guetzli - DEFINES += -D__USE_CUDA__ + DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__ INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL -lcuda + LIBS += -lOpenCL -lcuda -lprofiler -lunwind LDDEPS += ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) @@ -70,6 +70,7 @@ OBJECTS := \ $(OBJDIR)/clguetzli.o \ $(OBJDIR)/clguetzli_test.o \ $(OBJDIR)/cuguetzli.o \ + $(OBJDIR)/cumem_pool.o \ $(OBJDIR)/ocl.o \ $(OBJDIR)/ocu.o \ $(OBJDIR)/utils.o \ @@ -166,6 +167,9 @@ $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp $(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/ocl.o: clguetzli/ocl.cpp @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" diff --git a/guetzli_static.make b/guetzli_static.make index 68808523..2d648c04 100644 --- a/guetzli_static.make +++ b/guetzli_static.make @@ -70,6 +70,7 @@ OBJECTS := \ $(OBJDIR)/clguetzli.o \ $(OBJDIR)/clguetzli_test.o \ $(OBJDIR)/cuguetzli.o \ + $(OBJDIR)/cumem_pool.o \ $(OBJDIR)/ocl.o \ $(OBJDIR)/ocu.o \ $(OBJDIR)/utils.o \ @@ -165,6 +166,9 @@ $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp $(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp + @echo $(notdir $<) + 
$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/ocl.o: clguetzli/ocl.cpp @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" diff --git a/premake5.lua b/premake5.lua index f6723df8..099df66f 100644 --- a/premake5.lua +++ b/premake5.lua @@ -42,10 +42,10 @@ workspace "guetzli" project "guetzli" kind "ConsoleApp" filter "action:gmake" - defines { "__USE_CUDA__" } + defines { "__USE_OPENCL__", "__USE_CUDA__", "__USE_GPERFTOOLS__" } linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } - links { "OpenCL", "cuda" } + links { "OpenCL", "cuda", "profiler", "unwind" } filter "action:vs*" links { "shlwapi" } filter {} From 644f5637eda4c55a7177b0e5280d24be4bf588fa Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 23 Jun 2017 15:38:55 +0800 Subject: [PATCH 170/189] =?UTF-8?q?=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AFCUD?= =?UTF-8?q?A=20OPENCL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 4fa6af4d..52fda8ba 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -108,7 +108,7 @@ true false true - _UNICODE;UNICODE;%(PreprocessorDefinitions) + __USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions) Console @@ -173,7 +173,7 @@ .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled - __USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions) + __USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions) Console @@ -191,7 +191,7 @@ 
.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled - PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) + __USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) Console From 340d914549cda2cf06f1561b52c975f7ccb93ca1 Mon Sep 17 00:00:00 2001 From: strongtu Date: Fri, 23 Jun 2017 18:13:38 +0800 Subject: [PATCH 171/189] =?UTF-8?q?=E7=A7=BB=E9=99=A4tcmalloc=EF=BC=8C?= =?UTF-8?q?=E5=AF=B9=E6=80=A7=E8=83=BD=E6=B2=A1=E4=BB=80=E4=B9=88=E5=BD=B1?= =?UTF-8?q?=E5=93=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 81 ------------- guetzli.vcxproj.filters | 246 ---------------------------------------- 2 files changed, 327 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 52fda8ba..6d1153c0 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -239,55 +239,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -334,38 +285,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 1cbb6a30..768a8128 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -19,9 +19,6 @@ {cb89c1ac-8399-4814-88f2-4b69576bc9f9} - - {f2b475de-6219-478e-9e5e-08f07ef25dbc} - {64847a89-ca39-4556-ba0e-d6875c4d39ca} @@ -147,153 +144,6 @@ third_party\zlib - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - 
third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - clguetzli @@ -476,102 +326,6 @@ third_party\zlib - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - 
third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - - - third_party\tcmalloc_minimal - clguetzli From 6f2726b12008a9c336fdc33501358676c6dee197 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Thu, 29 Jun 2017 20:00:33 +0800 Subject: [PATCH 172/189] Change memory block status to enum --- clguetzli/cumem_pool.cpp | 10 +++++----- clguetzli/cumem_pool.h | 10 ++++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp index 50d96eb7..4fe4964d 100644 --- a/clguetzli/cumem_pool.cpp +++ b/clguetzli/cumem_pool.cpp @@ -27,14 +27,14 @@ cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init) for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) { cu_mem_block_t *block = &(*iter); - if (block->status == 0 && block->size >= s) { + if (block->status == MBS_IDLE && block->size >= s) { block_candidate = block; break; } } cu_mem mem = NULL; if (block_candidate != NULL) { - block_candidate->status = 1; + block_candidate->status = MBS_BUSY; block_candidate->used = s; mem = block_candidate->mem; @@ -46,7 +46,7 @@ cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init) mem_block.size = s; mem_block.used = s; mem_block.mem = new_mem; - mem_block.status = 1; + mem_block.status = MBS_BUSY; mem_pool.push_back(mem_block); mem_pool.sort(compare_size); @@ -76,7 +76,7 @@ void cu_mem_pool_t::releaseMem(cu_mem mem) } } if (block_candidate != NULL) { - block_candidate->status = 0; + block_candidate->status = MBS_IDLE; block_candidate->used = 0; } else { @@ -92,7 +92,7 @@ void 
cu_mem_pool_t::drain() cu_mem_block_t *block_candidate = NULL; for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) { - if (iter->status == 0) { + if (iter->status == MBS_IDLE) { total_mem += iter->size; cuMemFree(iter->mem); iter = mem_pool.erase(iter); diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h index 73355e82..262f4106 100644 --- a/clguetzli/cumem_pool.h +++ b/clguetzli/cumem_pool.h @@ -6,16 +6,22 @@ #include #include "ocl.h" +enum mem_block_status +{ + MBS_IDLE, + MBS_BUSY, +}; + struct cu_mem_block_t { cu_mem_block_t() - :status(0) + :status(MBS_IDLE) , used(0) {} ~cu_mem_block_t() {} - int status; + mem_block_status status; size_t size; size_t used; cu_mem mem; From 46367ce2986a7977bcc863e7d76f5e71e46011f6 Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Wed, 5 Jul 2017 16:33:04 +0800 Subject: [PATCH 173/189] Remove tcmalloc --- guetzli.vcxproj | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 6d1153c0..fb517ca5 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -138,13 +138,13 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) MaxSpeed true false false true - __USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) + __USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions) Console @@ -152,7 +152,8 @@ true cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup - __tcmalloc + + $(CUDA_PATH)\lib\Win32 @@ -188,20 +189,24 @@ NotUsing Level3 - 
.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) EditAndContinue Disabled - __USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions) + __USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions) Console true cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup - __tcmalloc + + $(CUDA_PATH)\lib\Win32 + + 3 + From 80319852f00fb8417a649c6dd1889ce5de28d3ba Mon Sep 17 00:00:00 2001 From: zhantong Date: Fri, 7 Jul 2017 13:57:24 +0800 Subject: [PATCH 174/189] =?UTF-8?q?=E6=94=AF=E6=8C=81=E9=9D=9E=E4=B8=BB?= =?UTF-8?q?=E6=B5=81JPEG=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 使用libjpeg库 --- guetzli.vcxproj | 12 ++++----- guetzli/jpeg_data_decoder.cc | 5 ++-- guetzli/processor.cc | 47 ++++++++++++++++++++++++++++++------ guetzli/processor.h | 3 +++ 4 files changed, 51 insertions(+), 16 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index fb517ca5..ae2e8fc7 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -102,7 +102,7 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories) Full true true @@ -114,9 +114,9 @@ Console true true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) mainCRTStartup - $(CUDA_PATH)\lib\x64 + 
$(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 "$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current -bo=" " @@ -138,7 +138,7 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories) MaxSpeed true false @@ -150,11 +150,11 @@ Console true true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) mainCRTStartup - $(CUDA_PATH)\lib\Win32 + $(CUDA_PATH)\lib\Win32;C:\Users\tongzhan\GitHub\guetzli\third_party\libjpeg\x86 diff --git a/guetzli/jpeg_data_decoder.cc b/guetzli/jpeg_data_decoder.cc index 98f9f4cc..722d6663 100644 --- a/guetzli/jpeg_data_decoder.cc +++ b/guetzli/jpeg_data_decoder.cc @@ -43,9 +43,8 @@ bool HasYCbCrColorSpace(const JPEGData& jpg) { } std::vector DecodeJpegToRGB(const JPEGData& jpg) { - if (jpg.components.size() == 1 || - (jpg.components.size() == 3 && - HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444()))) { + if (jpg.components.size() == 3 && + HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444())) { OutputImage img(jpg.width, jpg.height); img.CopyFromJpegData(jpg); return img.ToSRGB(); diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 3d39da02..f0a0bf48 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -33,6 +33,8 @@ #include "guetzli/quantize.h" #include "clguetzli/clguetzli.h" +#include "third_party/libjpeg/jpeglib.h" + namespace guetzli { namespace { @@ -1033,10 +1035,7 @@ bool Process(const Params& params, ProcessStats* stats, } std::vector rgb = DecodeJpegToRGB(jpg); if (rgb.empty()) { - fprintf(stderr, "Unsupported input JPEG file (e.g. 
unsupported " - "downsampling mode).\nPlease provide the input image as " - "a PNG file.\n"); - return false; + return ProcessUnsupportedJpegData(params,stats,data,jpg_out); } GuetzliOutput out; ProcessStats dummy_stats; @@ -1050,9 +1049,9 @@ bool Process(const Params& params, ProcessStats* stats, new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); #else - comparator.reset( - new ButteraugliComparator(jpg.width, jpg.height, &rgb, - params.butteraugli_target, stats)); + comparator.reset( + new ButteraugliComparator(jpg.width, jpg.height, &rgb, + params.butteraugli_target, stats)); #endif } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); @@ -1060,6 +1059,40 @@ bool Process(const Params& params, ProcessStats* stats, return ok; } +bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats, + const std::string& data, + std::string* jpg_out) { + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + jpeg_mem_src(&cinfo, (unsigned char*)data.c_str(), data.length()); + + int rc = jpeg_read_header(&cinfo, TRUE); + if (rc != 1) { + fprintf(stderr, "File does not seem to be a normal JPEG\n"); + exit(EXIT_FAILURE); + } + + cinfo.out_color_space = JCS_RGB; //force RGB output + jpeg_start_decompress(&cinfo); + int xsize = cinfo.output_width; + int ysize = cinfo.output_height; + int pixel_size = cinfo.output_components; + unsigned long bmp_size = xsize * ysize * pixel_size; + unsigned char *bmp_buffer = (unsigned char*)malloc(bmp_size); + int row_stride = cinfo.output_width * cinfo.output_components; + JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray) + ((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, 1); + while (cinfo.output_scanline < cinfo.output_height) { + unsigned char *buffer_array[1]; + buffer_array[0] = bmp_buffer + (cinfo.output_scanline) * row_stride; + jpeg_read_scanlines(&cinfo, buffer_array, 1); + } + 
std::vector temp_rgb(bmp_buffer, bmp_buffer + bmp_size); + return Process(params, stats, temp_rgb, xsize, ysize, jpg_out); +} + bool Process(const Params& params, ProcessStats* stats, const std::vector& rgb, int w, int h, std::string* jpg_out) { diff --git a/guetzli/processor.h b/guetzli/processor.h index 924ba0fa..e6cf4ba8 100644 --- a/guetzli/processor.h +++ b/guetzli/processor.h @@ -53,6 +53,9 @@ struct GuetzliOutput { bool ProcessJpegData(const Params& params, const JPEGData& jpg_in, Comparator* comparator, GuetzliOutput* out, ProcessStats* stats); +bool ProcessUnsupportedJpegData(const Params& params, + ProcessStats* stats, const std::string& data, + std::string* jpg_out); // Sets *out to a jpeg encoded string that will decode to an image that is // visually indistinguishable from the input rgb image. From eda913fedda3754a4c7891654c36fe6dd13ca714 Mon Sep 17 00:00:00 2001 From: strongtu Date: Sun, 9 Jul 2017 23:30:16 +0800 Subject: [PATCH 175/189] Mofidy makefile --- guetzli.make | 4 ++-- premake5.lua | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/guetzli.make b/guetzli.make index b40f6f4b..a458eb09 100644 --- a/guetzli.make +++ b/guetzli.make @@ -22,7 +22,7 @@ ifeq ($(config),release) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL -lcuda -lprofiler -lunwind + LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg LDDEPS += ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) @@ -49,7 +49,7 @@ ifeq ($(config),debug) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags 
libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL -lcuda -lprofiler -lunwind + LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg LDDEPS += ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) diff --git a/premake5.lua b/premake5.lua index 099df66f..1c5ef6c6 100644 --- a/premake5.lua +++ b/premake5.lua @@ -31,6 +31,7 @@ workspace "guetzli" "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", "third_party/butteraugli/butteraugli/butteraugli.h", + "third_party/libjpeg/*.h", "clguetzli/*.cpp", "clguetzli/*.h" } @@ -45,7 +46,7 @@ workspace "guetzli" defines { "__USE_OPENCL__", "__USE_CUDA__", "__USE_GPERFTOOLS__" } linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } - links { "OpenCL", "cuda", "profiler", "unwind" } + links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" } filter "action:vs*" links { "shlwapi" } filter {} @@ -55,6 +56,7 @@ workspace "guetzli" "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", "third_party/butteraugli/butteraugli/butteraugli.h", + "third_party/libjpeg/*.h", "clguetzli/*.cpp", "clguetzli/*.h" } From 4058d6ed5889da49a7e467bd403498ca26d8cccf Mon Sep 17 00:00:00 2001 From: zhantong Date: Mon, 10 Jul 2017 17:54:01 +0800 Subject: [PATCH 176/189] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dlibjpeg=E5=BA=93?= =?UTF-8?q?=E5=9C=A8debug=E5=92=8C32=E4=BD=8D=E4=B8=8B=E7=BC=96=E8=AF=91?= =?UTF-8?q?=E4=B8=8D=E6=88=90=E5=8A=9F=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- guetzli.vcxproj | 17 ++++++++++------- guetzli.vcxproj.filters | 12 ++++++++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index ae2e8fc7..32cc12c7 100644 --- 
a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -154,7 +154,7 @@ mainCRTStartup - $(CUDA_PATH)\lib\Win32;C:\Users\tongzhan\GitHub\guetzli\third_party\libjpeg\x86 + $(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86 @@ -171,7 +171,7 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories) EditAndContinue Disabled __USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions) @@ -179,9 +179,9 @@ Console true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) mainCRTStartup - $(CUDA_PATH)\lib\x64 + $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 @@ -189,7 +189,7 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories) EditAndContinue Disabled __USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions) @@ -197,11 +197,11 @@ Console true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) mainCRTStartup - $(CUDA_PATH)\lib\Win32 + $(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86 @@ -244,6 +244,9 @@ + + + diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 768a8128..785c7382 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -22,6 +22,9 @@ {64847a89-ca39-4556-ba0e-d6875c4d39ca} + + {1ac67559-7330-41c7-9a6d-10c3abee000e} + @@ -171,6 +174,15 @@ clguetzli + + third_party\libjpeg + + + 
third_party\libjpeg + + + third_party\libjpeg + From c100839e9975bb0ade5b6cff52a8feaede0198cb Mon Sep 17 00:00:00 2001 From: ianhuang-777 <306168910@qq.com> Date: Tue, 11 Jul 2017 10:36:23 +0800 Subject: [PATCH 177/189] Translate the comment. --- clguetzli/clguetzli.cl | 53 +++++++++++------------------------- clguetzli/clguetzli.cl.cpp | 29 -------------------- clguetzli/clguetzli.cu | 21 -------------- clguetzli/clguetzli_test.cpp | 10 ------- clguetzli/cumem_pool.h | 2 ++ clguetzli/ocu.cpp | 30 -------------------- guetzli/processor.cc | 5 ++-- 7 files changed, 21 insertions(+), 129 deletions(-) diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index f0e16db0..b4d11a92 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -13,7 +13,7 @@ #define kBlockHalf (kBlockEdge * kBlockEdgeHalf) #define kComputeBlockSize (kBlockSize * 3) -// IntFloatPairÊÇΪÁËÄ£Äâoutput_order input_orderµÄvector +// IntFloatPair: opencl version of output_order/input_order typedef struct __IntFloatPair { int idx; @@ -734,21 +734,20 @@ __kernel void clAddBorderEx(__global float *out, const int xsize, const int ysiz } -// batchÊÇÖ¸ÒѾ­¶þά¿éÕ¹¿ªÎªÁËһά¿é __kernel void clComputeBlockZeroingOrderEx( - __global const coeff_t *orig_batch_0, // ԭʼͼÏñϵÊý - __global const coeff_t *orig_batch_1, // ԭʼͼÏñϵÊý - __global const coeff_t *orig_batch_2, // ԭʼͼÏñϵÊý - __global const float *orig_image_batch, // ԭʼͼÏñpregamma - __global const float *mask_scale, // ԭʼͼÏñµÄij¸öÉñÃØ²ÎÊý + __global const coeff_t *orig_batch_0, // Coeffs of Original image. + __global const coeff_t *orig_batch_1, // Coeffs of Original image. + __global const coeff_t *orig_batch_2, // Coeffs of Original image. + __global const float *orig_image_batch, // pregamma of Original image.. + __global const float *mask_scale, // mask_scale of Original image.. 
const int block_xsize, const int block_ysize, const int image_width, const int image_height, - __global const coeff_t *mayout_batch_0, // Êä³ö±¸Ñ¡Í¼µÄϵÊý - __global const coeff_t *mayout_batch_1, // Êä³ö±¸Ñ¡Í¼µÄϵÊý - __global const coeff_t *mayout_batch_2, // Êä³ö±¸Ñ¡Í¼µÄϵÊý + __global const coeff_t *mayout_batch_0, // Coeffs of output image. + __global const coeff_t *mayout_batch_1, // Coeffs of output image. + __global const coeff_t *mayout_batch_2, // Coeffs of output image. __global const ushort *mayout_pixel_0, __global const ushort *mayout_pixel_1, __global const ushort *mayout_pixel_2, @@ -756,8 +755,8 @@ __kernel void clComputeBlockZeroingOrderEx( const channel_info mayout_channel_0, const channel_info mayout_channel_1, const channel_info mayout_channel_2, - const int factor, // µ±Ç°²ÎÓëÔËËãµÄfactor - const int comp_mask, // µ±Ç°²ÎÓëÔËËãµÄchannel + const int factor, // Current factor in computing. + const int comp_mask, // Current channel in computing. const float BlockErrorLimit, __global CoeffData *output_order_list/*out*/) { @@ -779,7 +778,7 @@ __kernel void clComputeBlockZeroingOrderEx( mayout_channel[1].pixel = mayout_pixel_1; mayout_channel[2].pixel = mayout_pixel_2; - int block_idx = 0; // ¸ù¾ÝÏÂÃæmaskÃüÖеÄchannelÀ´¼ÆËãindx + int block_idx = 0; coeff_t mayout_block[kComputeBlockSize] = { 0 }; coeff_t orig_block[kComputeBlockSize] = { 0 }; @@ -833,7 +832,7 @@ __kernel void clComputeBlockZeroingOrderEx( } if (best_err >= BlockErrorLimit) - { // err¶ÓÁÐÊÇÖð½¥Ôö´óµÄ£¬Èç¹ûÕâÀïÒѾ­³¬¹ýErrorLimit£¬ºóÐøµÄ¼ÆËã¾ÍÊÇÈßÓàµÄÁË + { // The input_order is an ascent vector, break when best_err exceed the error limit. 
break; } int idx = input_order.pData[best_i].idx; @@ -843,7 +842,6 @@ __kernel void clComputeBlockZeroingOrderEx( list_push_back(&output_order, idx, best_err); } - // ×¢Òâoutput_orderÕâÀïµÄresize¾ÍÊǰÑβ²¿µÄÖÃλ0 float min_err = 1e10; for (int i = output_order.size - 1; i >= 0; --i) { min_err = min(min_err, output_order.pData[i].err); @@ -855,7 +853,7 @@ __kernel void clComputeBlockZeroingOrderEx( int out_count = 0; for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) { - // ¹ýÂ˽ϴóµÄerr£¬Õⲿ·Ö½øÈëºó¶Ë¼ÆËãûÓÐÒâÒå + // err exceeding the limit is no need to continue. if (output_order.pData[i].err <= BlockErrorLimit) { output_block[out_count].idx = output_order.pData[i].idx; @@ -1573,8 +1571,6 @@ __device__ void RgbToXyb(double r, double g, double b, double *valx, double *val *valz = b; } -// chrisk todo -// return size __device__ int list_push_back(IntFloatPairList* list, int i, float f) { list->pData[list->size].idx = i; @@ -1582,8 +1578,6 @@ __device__ int list_push_back(IntFloatPairList* list, int i, float f) return ++list->size; } -// chrisk todo -// remove idx and return size __device__ int list_erase(IntFloatPairList* list, int idx) { for (int i = idx; i < list->size - 1; i++) @@ -1594,7 +1588,6 @@ __device__ int list_erase(IntFloatPairList* list, int idx) return --list->size; } -// chrisk todo __device__ int SortInputOrder(DCTScoreData* input_order, int size) { int i, j; @@ -2010,8 +2003,6 @@ __device__ coeff_t _abs(coeff_t val) return val >= 0 ? 
val : -val; } -// chrisk todo -// return the count of Non-zero item __device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size) { int size = 0; @@ -2763,7 +2754,6 @@ __device__ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, f #undef lut } -// chrisk todo __device__ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) { uchar idct[3][8 * 8]; @@ -2927,11 +2917,8 @@ __device__ void Convolution(size_t xsize, size_t ysize, } } -// ian todo -// ¼ÆËã½á¹ûÊä³öµ½output __device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) { - // ²Î¿¼clBlurEx2µÄʵÏÖ£¬sigma = 1.1£¬Õâʱstep¡¢diff¶¼½«ÌØ»¯Îª¹Ì¶¨Öµ const double sigma = 1.1; double m = 2.25; // Accuracy increases when m is increased. const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772 @@ -2953,7 +2940,6 @@ __device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, doub border_ratio, output); } -// ian todo __device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b, __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred, int size) @@ -2983,7 +2969,6 @@ __device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g, } } -// chrisk todo __device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, float *xyb1_x, float *xyb1_y, float *xyb1_b, const float *c0_x, const float *c0_y, const float *c0_b, @@ -3079,10 +3064,7 @@ __device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) { -// return 0; // 126ms -// CalcOpsinDynamicsImage(rgb0_c); -- 
calc in cpu one time CalcOpsinDynamicsImage(rgb1_c); -// return 0; // 425ms float rgb0[3][kDCTBlockSize]; float rgb1[3][kDCTBlockSize]; @@ -3095,9 +3077,8 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], rgb0_c[0], rgb0_c[1], rgb0_c[2], rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8); -// return 0; // 544ms - // ÕâÀïΪɶҪ°Ñfloatת³Édouble²ÅÄܼÌÐø×ö¼ÆË㣿 - double b0[3 * kDCTBlockSize]; // + + double b0[3 * kDCTBlockSize]; double b1[3 * kDCTBlockSize]; for (int c = 0; c < 3; ++c) { for (int ix = 0; ix < kDCTBlockSize; ++ix) { @@ -3111,7 +3092,6 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], double diff_xyz_edge_dc[3] = { 0.0 }; ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); -// return 0; // 735ms double diff = 0.0; double diff_edge = 0.0; @@ -3123,7 +3103,6 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], const double kEdgeWeight = 0.05; return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); -// 750ms } // return the count of Non-zero item diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index f29a283c..45533f60 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -212,35 +212,6 @@ namespace guetzli double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const { double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); -/* - if (g_checkOpenCL) - { - channel_info mayout_channel[3]; - for (int c = 0; c < 3; c++) - { - mayout_channel[c].block_height = img.component(c).height_in_blocks(); - mayout_channel[c].block_width = img.component(c).width_in_blocks(); - mayout_channel[c].factor = img.component(c).factor_x(); - mayout_channel[c].pixel = img.component(c).pixels(); - mayout_channel[c].coeff = img.component(c).coeffs(); - } - - double err2 = 
CompareBlockFactor(mayout_channel, - candidate_block, - block_x_, - block_y_, - imgOpsinDynamicsBlockList.data(), - imgMaskXyzScaleBlockList.data(), - width_, - height_, - factor_x_); - - if (fabs(err - err2) > 0.001) - { - LogError("CompareBlock miss %s(%d) \r\n", __FUNCTION__, __LINE__); - } - } -*/ return err; } } diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu index 351bed47..974be98e 100644 --- a/clguetzli/clguetzli.cu +++ b/clguetzli/clguetzli.cu @@ -1,22 +1 @@ #include "clguetzli/clguetzli.cl" -/* -__device__ int get_global_id(int dim) -{ - switch (dim) - { - case 0: - return threadIdx.x; - case 1: - return threadIdx.y; - case 2: - return threadIdx.z; - default: - return threadIdx.x; - } -} - -__device__ int get_global_size(int dim) -{ - return 0; -} -*/ diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index b5fa50c5..967a6652 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -69,7 +69,6 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, ocl.releaseMemChannels(xyb1); } -// strong to void tclEdgeDetectorMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, @@ -101,7 +100,6 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b, clReleaseMemObject(edge); } -// strong todo void tclBlockDiffMap(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, @@ -140,7 +138,6 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b, clReleaseMemObject(block_diff_dc); } -// strong to void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, const float* r2, const float* g2, const float* b2, size_t xsize, size_t ysize, size_t step, @@ -258,7 +255,6 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const 
clReleaseMemObject(cl_result); } -// ian todo void tclCalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, const float *diffmap, size_t org_len, @@ -278,7 +274,6 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize, clReleaseMemObject(mem_diffmap); } -// chrisk todo void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result) { size_t channel_size = xsize * ysize * sizeof(float); @@ -299,7 +294,6 @@ void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, dou clReleaseMemObject(r); } -// chrisk todo void tclConvolution(size_t xsize, size_t ysize, size_t xstep, size_t len, size_t offset, @@ -333,7 +327,6 @@ void tclConvolution(size_t xsize, size_t ysize, clReleaseMemObject(m); } -// ian todo void tclDiffPrecompute( const std::vector > &xyb0, const std::vector > &xyb1, @@ -366,7 +359,6 @@ void tclDiffPrecompute( ocl.releaseMemChannels(cl_mask); } -// ian todo void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, const std::vector &diffs_cmp) { cl_int err = 0; @@ -382,7 +374,6 @@ void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, co clReleaseMemObject(mem_diff); } -// chrisk todo void tclMinSquareVal(const float *img, size_t square_size, size_t offset, size_t xsize, size_t ysize, const float *result) @@ -422,7 +413,6 @@ void tclScaleImage(double scale, const float *result_org, const float *result_cm clReleaseMemObject(mem_result_org); } -// strong todo void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, const float* result_r, const float* result_g, const float* result_b) { diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h index 262f4106..d2ceec04 100644 --- a/clguetzli/cumem_pool.h +++ b/clguetzli/cumem_pool.h @@ -6,6 +6,8 @@ #include #include "ocl.h" +/*Simple memory pool for CUDA, aiming to reduce the memory allocation count, because it's time consuming.*/ 
+ enum mem_block_status { MBS_IDLE, diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 7ebc0ac1..2afe793d 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -32,35 +32,6 @@ ocu_args_d_t& getOcu(void) cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); LogError("CUDA Adapter:%s Ver%d.%d MP %d MaxThread Per MP %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); -/* - char* source = nullptr; - size_t src_size = 0; - ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); - - nvrtcProgram prog; - const char *opts[] = { "-arch=compute_30", "-default-device", "-G", "-I\"./\"", "--fmad=false" }; - nvrtcCreateProgram(&prog, source, "clguetzli.cl", 0, NULL, NULL); - nvrtcResult compile_result;// = nvrtcCompileProgram(prog, 3, opts); - if (NVRTC_SUCCESS != compile_result) - { - // Obtain compilation log from the program. - size_t logSize = 0; - nvrtcGetProgramLogSize(prog, &logSize); - char *log = new char[logSize]; - nvrtcGetProgramLog(prog, log); - - LogError("BuildInfo:\r\n%s\r\n", log); - - delete[] log; - } - - delete[] source; - // Obtain PTX from the program. 
- size_t ptxSize = 0; - nvrtcGetPTXSize(prog, &ptxSize); - char *ptx = new char[ptxSize]; - nvrtcGetPTX(prog, ptx); -*/ char* ptx = nullptr; size_t src_size = 0; @@ -127,7 +98,6 @@ ocu_args_d_t::~ocu_args_d_t() cuModuleUnload(mod); cuCtxDestroy(ctxt); mem_pool.drain(); -// cuStreamDestroy(commandQueue); } cu_mem ocu_args_d_t::allocMem(size_t s, const void *init) diff --git a/guetzli/processor.cc b/guetzli/processor.cc index f0a0bf48..ffbf6f24 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -455,7 +455,8 @@ void Processor::ComputeBlockZeroingOrder( if (MODE_CPU_OPT == g_mathMode) { if (best_err >= comparator_->BlockErrorLimit()) - { // err¶ÓÁÐÊÇÖð½¥Ôö´óµÄ£¬Èç¹ûÕâÀïÒѾ­³¬¹ýErrorLimit£¬ºóÐøµÄ¼ÆËã¾ÍÊÇÈßÓàµÄÁË + { + // The input_order is an ascent vector, break when best_err exceed the error limit. break; } } @@ -567,7 +568,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co const int num_blocks = block_width * block_height; - comparator_->StartBlockComparisons(); // ³õʼ»¯Ò»Ð©²ÎÊý£¬Ö÷ÒªÊǶÔԭͼ½øÐÐһЩ´¦Àí + comparator_->StartBlockComparisons(); std::vector output_order_gpu; std::vector output_order_cpu; From 5f309e7bc922e1dcca3d304feb801eeebee8fb75 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 11 Jul 2017 11:10:52 +0800 Subject: [PATCH 178/189] Remove some redundant files --- guetzli.vcxproj | 68 ++------------ guetzli.vcxproj.filters | 167 --------------------------------- guetzli/processor.cc | 2 +- guetzli_static.vcxproj | 51 ---------- guetzli_static.vcxproj.filters | 155 ------------------------------ 5 files changed, 9 insertions(+), 434 deletions(-) diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 32cc12c7..c4eb7a8f 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -102,7 +102,7 @@ NotUsing Level3 - 
.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) Full true true @@ -114,7 +114,7 @@ Console true true - cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 @@ -138,7 +138,7 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) MaxSpeed true false @@ -150,7 +150,7 @@ Console true true - cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup @@ -171,7 +171,7 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled __USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions) @@ -179,7 +179,7 @@ Console true - cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 @@ -189,7 +189,7 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled 
__USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions) @@ -197,7 +197,7 @@ Console true - cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies) + cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) mainCRTStartup @@ -244,23 +244,6 @@ - - - - - - - - - - - - - - - - - @@ -293,36 +276,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -351,11 +304,6 @@ false false - - - - - diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index 785c7382..7e005105 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -13,18 +13,9 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} - - {40be58d6-6dfc-45a3-8ca1-7d1b14051ddc} - - - {cb89c1ac-8399-4814-88f2-4b69576bc9f9} - {64847a89-ca39-4556-ba0e-d6875c4d39ca} - - {1ac67559-7330-41c7-9a6d-10c3abee000e} - @@ -105,48 +96,6 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - clguetzli @@ -174,15 +123,6 @@ clguetzli - - third_party\libjpeg - - - third_party\libjpeg - - - third_party\libjpeg - @@ -248,96 +188,6 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - 
third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - clguetzli @@ -366,23 +216,6 @@ clguetzli - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - clguetzli diff --git a/guetzli/processor.cc b/guetzli/processor.cc index ffbf6f24..d1cdb32a 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -33,7 +33,7 @@ #include "guetzli/quantize.h" #include "clguetzli/clguetzli.h" -#include "third_party/libjpeg/jpeglib.h" +#include "jpeglib.h" namespace guetzli { diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj index 05a75f9a..3c3bd850 100644 --- a/guetzli_static.vcxproj +++ b/guetzli_static.vcxproj @@ -176,20 +176,6 @@ - - - - - - - - - - - - - - @@ -212,43 +198,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters index 37876e3d..94654c91 100644 --- a/guetzli_static.vcxproj.filters +++ b/guetzli_static.vcxproj.filters @@ -13,12 +13,6 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} - - {61f0e3eb-c213-49c5-883a-060bdaf927bb} - - - {ba7b6163-a7d1-4f14-b4b3-3d35f296563a} - @@ -99,48 +93,6 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - @@ -203,112 +155,5 @@ third_party\butteraugli\butteraugli - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - third_party\libpng - - - 
third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - - - third_party\libpng - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - - - third_party\zlib - \ No newline at end of file From 5aa73ae39a460ba7f5f8a8fd52c47e7b8bf53c8b Mon Sep 17 00:00:00 2001 From: strongtu Date: Tue, 11 Jul 2017 11:15:25 +0800 Subject: [PATCH 179/189] Modify makefile --- guetzli.make | 4 ++-- premake5.lua | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/guetzli.make b/guetzli.make index a458eb09..52dbff8f 100644 --- a/guetzli.make +++ b/guetzli.make @@ -15,7 +15,7 @@ ifeq ($(config),release) TARGETDIR = bin/Release TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli - DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__ + DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) @@ -42,7 +42,7 @@ ifeq ($(config),debug) TARGETDIR = bin/Debug TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Debug/guetzli - DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__ + DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) diff --git a/premake5.lua b/premake5.lua index 1c5ef6c6..7f2cc3e3 100644 --- a/premake5.lua +++ b/premake5.lua @@ -31,7 +31,6 @@ workspace "guetzli" "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", "third_party/butteraugli/butteraugli/butteraugli.h", - "third_party/libjpeg/*.h", "clguetzli/*.cpp", "clguetzli/*.h" } @@ -43,7 +42,7 @@ workspace "guetzli" project "guetzli" kind "ConsoleApp" filter "action:gmake" - defines { "__USE_OPENCL__", "__USE_CUDA__", "__USE_GPERFTOOLS__" } + defines { "__USE_OPENCL__", "__USE_CUDA__" } linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" } @@ -56,7 +55,6 @@ workspace "guetzli" "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", "third_party/butteraugli/butteraugli/butteraugli.h", - "third_party/libjpeg/*.h", "clguetzli/*.cpp", "clguetzli/*.h" } From c525adf38cb99ae64d4de5a5954e992b38ed1714 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 00:41:55 +0800 Subject: [PATCH 180/189] Disable CUDA & OpenCL by default --- clguetzli/cl.hpp | 4 ++++ clguetzli/clguetzli.cl | 3 +++ clguetzli/clguetzli.cl.h | 6 +++++- clguetzli/clguetzli_test.cpp | 4 ++-- clguetzli/ocl.h | 4 ++-- clguetzli/ocu.cpp | 4 ++-- clguetzli/utils.cpp | 5 ++++- guetzli.make | 16 ++++++++-------- guetzli.vcxproj | 24 ++++++++++++------------ guetzli/processor.cc | 11 +++++++++++ guetzli_static.make | 8 ++++---- premake5.lua | 8 ++++---- 12 files changed, 61 insertions(+), 36 deletions(-) diff --git a/clguetzli/cl.hpp b/clguetzli/cl.hpp index 8be6313e..a7043b50 100644 --- a/clguetzli/cl.hpp +++ b/clguetzli/cl.hpp @@ -1,5 +1,7 @@ #pragma once +#ifdef __USE_OPENCL__ + template inline void clSetKernelArgK(cl_kernel k, int idx, T* t) { @@ -316,3 
+318,5 @@ inline void clSetKernelArgEx(cl_kernel k, clSetKernelArgK(k, 24, t24); clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23); } + +#endif // __USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index b4d11a92..c2e67e80 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1,3 +1,4 @@ +#ifdef __USE_OPENCL__ #pragma OPENCL EXTENSION cl_khr_fp64 : enable #include "clguetzli/clguetzli.cl.h" @@ -3408,3 +3409,5 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], #ifdef __USE_DOUBLE_AS_FLOAT__ #undef double #endif + +#endif __USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 102f3ac9..761ed634 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -1,6 +1,8 @@ #ifndef __CLGUETZLI_CL_H__ #define __CLGUETZLI_CL_H__ +#ifdef __USE_OPENCL__ + #ifdef __cplusplus #ifndef __CUDACC__ #include "CL/cl.h" @@ -148,4 +150,6 @@ __global const ushort *pixel; }channel_info; -#endif /*__CLGUETZLI_CL_H__*/ \ No newline at end of file +#endif /*__CLGUETZLI_CL_H__*/ + +#endif // __USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 967a6652..6e6fece8 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -1,3 +1,5 @@ +#ifdef __USE_OPENCL__ + #include #include #include @@ -7,8 +9,6 @@ #include "ocl.h" #include "ocu.h" -#ifdef __USE_OPENCL__ - #define FLOAT_COMPARE(a, b, c) floatCompare((a), (b), (c), __FUNCTION__, __LINE__ ) int floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line) diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index f182bb88..f3056dd8 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -1,11 +1,11 @@ #pragma once +#ifdef __USE_OPENCL__ + #include "CL/cl.h" #include "utils.h" #include 
"clguetzli.cl.h" -#ifdef __USE_OPENCL__ - // Macros for OpenCL versions #define OPENCL_VERSION_1_2 1.2f #define OPENCL_VERSION_2_0 2.0f diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index 2afe793d..ea66be55 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -1,8 +1,8 @@ #include "ocu.h" -#include -#include #ifdef __USE_CUDA__ +#include +#include ocu_args_d_t& getOcu(void) { diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp index 4fc8dbc2..da699406 100644 --- a/clguetzli/utils.cpp +++ b/clguetzli/utils.cpp @@ -19,6 +19,7 @@ * Intel Corporation is the author of the Materials, and requests that all * problem reports or change requests be submitted to it directly *****************************************************************************/ +#ifdef __USE_OPENCL__ #include #include @@ -96,4 +97,6 @@ int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize) } return errorCode; } -#pragma warning( pop ) \ No newline at end of file +#pragma warning( pop ) + +#endif \ No newline at end of file diff --git a/guetzli.make b/guetzli.make index 52dbff8f..e16aa99b 100644 --- a/guetzli.make +++ b/guetzli.make @@ -15,16 +15,16 @@ ifeq ($(config),release) TARGETDIR = bin/Release TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli - DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ - INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" + DEFINES += + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg + LIBS += LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` + ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) define PREBUILDCMDS endef @@ -42,16 +42,16 @@ ifeq ($(config),debug) TARGETDIR = bin/Debug TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Debug/guetzli - DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ - INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" + DEFINES += + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags` ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags` ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) - LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg + LIBS += LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags` + ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags` LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS) define PREBUILDCMDS endef diff --git a/guetzli.vcxproj b/guetzli.vcxproj index c4eb7a8f..3a0eb72c 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -102,19 +102,19 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;%(AdditionalIncludeDirectories) Full true true false true - __USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions) + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console true true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 @@ -138,19 +138,19 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;%(AdditionalIncludeDirectories) MaxSpeed true false false true - __USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions) + %(PreprocessorDefinitions) Console true true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + shlwapi.lib;%(AdditionalDependencies) mainCRTStartup @@ -171,15 +171,15 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled - 
__USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions) + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + shlwapi.lib;%(AdditionalDependencies) mainCRTStartup $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 @@ -189,15 +189,15 @@ NotUsing Level3 - .;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled - __USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions) + %(PreprocessorDefinitions) Console true - cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies) + shlwapi.lib;%(AdditionalDependencies) mainCRTStartup diff --git a/guetzli/processor.cc b/guetzli/processor.cc index d1cdb32a..2e8837dc 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -33,7 +33,9 @@ #include "guetzli/quantize.h" #include "clguetzli/clguetzli.h" +#ifdef __SUPPORT_FULL_JPEG__ #include "jpeglib.h" +#endif namespace guetzli { @@ -668,6 +670,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co } } +#ifdef __USE_OPENCL__ if (MODE_CHECKCL == g_mathMode) { int count = 0; @@ -685,6 +688,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co LogError("CHK %s(%d) %d:%d\r\n", "SelectFrequencyMasking", __LINE__, count, check_size); } } +#endif std::vector candidate_coeff_offsets(num_blocks + 1); std::vector candidate_coeffs; @@ -1063,6 +1067,7 @@ bool Process(const Params& params, ProcessStats* stats, bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats, const std::string& data, std::string* jpg_out) { +#ifdef __SUPPORT_FULL_JPEG__ struct jpeg_decompress_struct cinfo; struct jpeg_error_mgr jerr; cinfo.err = jpeg_std_error(&jerr); @@ -1092,6 +1097,12 @@ bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats, } std::vector temp_rgb(bmp_buffer, bmp_buffer + bmp_size); return Process(params, stats, 
temp_rgb, xsize, ysize, jpg_out); +#else + fprintf(stderr, "Unsupported input JPEG file (e.g. unsupported " + "downsampling mode).\nPlease provide the input image as " + "a PNG file.\n"); + return false; +#endif } bool Process(const Params& params, ProcessStats* stats, diff --git a/guetzli_static.make b/guetzli_static.make index 2d648c04..9fe7bf05 100644 --- a/guetzli_static.make +++ b/guetzli_static.make @@ -16,7 +16,7 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Release/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" + INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -24,7 +24,7 @@ ifeq ($(config),release) ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) LIBS += LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags` + ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags` LINKCMD = $(AR) -rcs "$@" $(OBJECTS) define PREBUILDCMDS endef @@ -43,7 +43,7 @@ ifeq ($(config),debug) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Debug/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)" + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -51,7 +51,7 @@ ifeq ($(config),debug) ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES) LIBS += LDDEPS += - ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags` + ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags` LINKCMD = $(AR) -rcs "$@" $(OBJECTS) define PREBUILDCMDS endef diff --git a/premake5.lua b/premake5.lua index 7f2cc3e3..cc41301b 100644 --- a/premake5.lua +++ b/premake5.lua @@ -2,8 +2,8 @@ workspace "guetzli" configurations { "Release", "Debug" } language "C++" flags { "C++11" } - includedirs { ".", "third_party/butteraugli", "clguetzli", "$(OPENCL_INC)" } - libdirs { "$(OPENCL_LIB)" } + includedirs { ".", "third_party/butteraugli", "clguetzli" } + libdirs {} filter "action:vs*" platforms { "x86_64", "x86" } @@ -42,10 +42,10 @@ workspace "guetzli" project "guetzli" kind "ConsoleApp" filter "action:gmake" - defines { "__USE_OPENCL__", "__USE_CUDA__" } + --defines { "__USE_OPENCL__", "__USE_CUDA__", "__SUPPORT_FULL_JPEG__" } linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } - links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" } + --links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" } filter "action:vs*" links { "shlwapi" } filter {} From ba219439ca41f95b8bd678966a8631735db6fbe1 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 10:19:40 +0800 Subject: [PATCH 181/189] Add netpbm According to the CI fail log. no pngtopnm command in the test environment, so add netpbm package and try again. 
--- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 39e1caaa..657e3e7b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ matrix: packages: - wget - libjpeg-progs + - netpbm - os: osx env: BUILD_SYSTEM=bazel @@ -29,6 +30,7 @@ matrix: - libpng-dev - pkg-config - libjpeg-progs + - netpbm - os: osx env: BUILD_SYSTEM=make From 93fd3f3abed6d064f39f8932d1c58f05ab99d382 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 10:39:13 +0800 Subject: [PATCH 182/189] Fix type cast error on Mac --- clguetzli/clbutter_comparator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index e39966b1..91e599b5 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -730,9 +730,9 @@ void MaskHighIntensityChangeOpt( for (size_t x = 0; x < xsize; ++x) { size_t ix = y * xsize + x; const float ave[3] = { - (c0[0][ix] + c1[0][ix]) * 0.5, - (c0[1][ix] + c1[1][ix]) * 0.5, - (c0[2][ix] + c1[2][ix]) * 0.5, + static_cast((c0[0][ix] + c1[0][ix]) * 0.5), + static_cast((c0[1][ix] + c1[1][ix]) * 0.5), + static_cast((c0[2][ix] + c1[2][ix]) * 0.5), }; float sqr_max_diff = -1; { From 1c1d7e641b8d482381385aaa3bfd5bde8340ebf9 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 11:25:39 +0800 Subject: [PATCH 183/189] Update bazel version to 0.5.2 --- .travis.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.sh b/.travis.sh index a30f38e5..b7197e7c 100755 --- a/.travis.sh +++ b/.travis.sh @@ -14,9 +14,9 @@ case "$1" in "bazel") case "${TRAVIS_OS_NAME}" in "linux") - wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel_0.4.5-linux-x86_64.deb - echo 'b494d0a413e4703b6cd5312403bea4d92246d6425b3be68c9bfbeb8cc4db8a55 bazel_0.4.5-linux-x86_64.deb' | sha256sum -c --strict || exit 1 - sudo dpkg -i bazel_0.4.5-linux-x86_64.deb + wget 
https://github.com/bazelbuild/bazel/releases/download/0.5.2/bazel_0.5.2-linux-x86_64.deb + echo 'b14c8773dab078d3422fe4082f3ab4d9e14f02313c3b3eb4b5b40c44ce29ed59 bazel_0.5.2-linux-x86_64.deb' | sha256sum -c --strict || exit 1 + sudo dpkg -i bazel_0.5.2-linux-x86_64.deb ;; "osx") brew install bazel From 1cb26c7cc9373e0d4828f3bf265e6b25e0d45143 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 12:52:05 +0800 Subject: [PATCH 184/189] Add oracle-java8-installer --- .travis.sh | 7 ++++--- .travis.yml | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.sh b/.travis.sh index b7197e7c..905889ff 100755 --- a/.travis.sh +++ b/.travis.sh @@ -14,9 +14,10 @@ case "$1" in "bazel") case "${TRAVIS_OS_NAME}" in "linux") - wget https://github.com/bazelbuild/bazel/releases/download/0.5.2/bazel_0.5.2-linux-x86_64.deb - echo 'b14c8773dab078d3422fe4082f3ab4d9e14f02313c3b3eb4b5b40c44ce29ed59 bazel_0.5.2-linux-x86_64.deb' | sha256sum -c --strict || exit 1 - sudo dpkg -i bazel_0.5.2-linux-x86_64.deb + sudo apt-get remove oracle-java9-installer + wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel_0.4.5-linux-x86_64.deb + echo 'b494d0a413e4703b6cd5312403bea4d92246d6425b3be68c9bfbeb8cc4db8a55 bazel_0.4.5-linux-x86_64.deb' | sha256sum -c --strict || exit 1 + sudo dpkg -i bazel_0.4.5-linux-x86_64.deb ;; "osx") brew install bazel diff --git a/.travis.yml b/.travis.yml index 657e3e7b..9f297c16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ matrix: - wget - libjpeg-progs - netpbm + - oracle-java8-installer - os: osx env: BUILD_SYSTEM=bazel @@ -37,6 +38,7 @@ matrix: install: +- jdk_switcher use oraclejdk8 - ./.travis.sh install script: - ./.travis.sh script From 40665e219e344e25a005af1078bd7f4deab21136 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 15:11:12 +0800 Subject: [PATCH 185/189] Try to fix Bazel build --- .travis.yml | 1 - BUILD | 4 ++ clguetzli/clbutter_comparator.cpp | 3 +- 
.../butteraugli/butteraugli/butteraugli.cc | 50 +++++++++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9f297c16..85db2b53 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,7 +38,6 @@ matrix: install: -- jdk_switcher use oraclejdk8 - ./.travis.sh install script: - ./.travis.sh script diff --git a/BUILD b/BUILD index 05bfc0da..8f2e28f1 100644 --- a/BUILD +++ b/BUILD @@ -8,6 +8,10 @@ cc_library( "guetzli/*.h", "guetzli/*.cc", "guetzli/*.inc", + "clguetzli/*.cpp", + "clguetzli/*.h", + "clguetzli/*.hpp" + ], exclude = ["guetzli/guetzli.cc"], ), diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 91e599b5..3d4eb7dd 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -1735,9 +1735,8 @@ namespace butteraugli float border_ratio, float* __restrict__ result) { - _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); - #ifdef __USE_OPENCL__ + _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) { tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index b62e1578..c32f226c 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -40,9 +40,11 @@ #include #include +#ifdef __USE_OPENCL__ #include "clguetzli/clbutter_comparator.h" #include "clguetzli/clguetzli.h" #include "clguetzli/clguetzli_test.h" +#endif // Restricted pointers speed up Convolution(); MSVC uses a different keyword. 
#ifdef _MSC_VER @@ -112,17 +114,28 @@ void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, int dxsize = (xsize + xstep - 1) / xstep; int dysize = (ysize + ystep - 1) / ystep; std::vector tmp(dxsize * ysize); +#ifdef __USE_OPENCL__ Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, border_ratio, tmp.data()); +#else + _Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, + border_ratio, + tmp.data()); +#endif float* output = channel; std::vector downsampled_output; if (xstep > 1) { downsampled_output.resize(dxsize * dysize); output = downsampled_output.data(); } +#ifdef __USE_OPENCL__ Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), border_ratio, output); +#else + _Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + border_ratio, output); +#endif if (xstep > 1) { for (size_t y = 0; y < ysize; y++) { for (size_t x = 0; x < xsize; x++) { @@ -1022,7 +1035,11 @@ void _CalculateDiffmap(const size_t xsize, const size_t ysize, += static_cast(mul1) * blurred[y * (xsize - s) + x]; } } +#ifdef __USE_OPENCL__ ScaleImage(scale, diffmap); +#else + _ScaleImage(scale, diffmap); +#endif } } @@ -1054,7 +1071,11 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, &result); } +#ifdef __USE_OPENCL__ CalculateDiffmap(xsize_, ysize_, step_, &result); +#else + _CalculateDiffmap(xsize_, ysize_, step_, &result); +#endif } void ButteraugliComparator::BlockDiffMap( @@ -1366,7 +1387,11 @@ void _Average5x5(int xsize, int ysize, std::vector* diffs) { std::vector result = *diffs; std::vector tmp0 = *diffs; std::vector tmp1 = *diffs; +#ifdef __USE_OPENCL__ ScaleImage(w, &tmp1); +#else + _ScaleImage(w, &tmp1); +#endif for (int y = 0; y < ysize; y++) { const int row0 = y * xsize; result[row0 + 1] += tmp0[row0]; @@ -1405,7 +1430,11 @@ void _Average5x5(int xsize, int ysize, std::vector* diffs) { } } 
*diffs = result; +#ifdef __USE_OPENCL__ ScaleImage(scale, diffs); +#else + _ScaleImage(scale, diffs); +#endif } void _DiffPrecompute( @@ -1473,6 +1502,7 @@ void _Mask(const std::vector > &xyb0, for (int i = 0; i < 3; ++i) { (*mask)[i].resize(xsize * ysize); } +#ifdef __USE_OPENCL__ DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); for (int i = 0; i < 3; ++i) { Average5x5(xsize, ysize, &(*mask)[i]); @@ -1484,6 +1514,19 @@ void _Mask(const std::vector > &xyb0, }; Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); } +#else + _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + for (int i = 0; i < 3; ++i) { + _Average5x5(xsize, ysize, &(*mask)[i]); + _MinSquareVal(4, 0, xsize, ysize, (*mask)[i].data()); + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + _Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); + } +#endif static const double w00 = 232.206464018; static const double w11 = 22.9455222245; static const double w22 = 503.962310606; @@ -1510,10 +1553,17 @@ void _Mask(const std::vector > &xyb0, (*mask_dc)[2][idx] = static_cast(MaskDcB(p2)); } } +#ifdef __USE_OPENCL__ for (int i = 0; i < 3; ++i) { ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); } +#else + for (int i = 0; i < 3; ++i) { + _ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); + _ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); + } +#endif } } // namespace butteraugli From 05ee2f8ef6d6acaba129ca0070ac7f151a305a69 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 19:38:49 +0800 Subject: [PATCH 186/189] Add author information --- BUILD | 3 +-- clguetzli/clbutter_comparator.cpp | 7 +++++++ clguetzli/clbutter_comparator.h | 7 +++++++ clguetzli/clguetzli.cl | 7 +++++++ clguetzli/clguetzli.cl.cpp | 7 +++++++ clguetzli/clguetzli.cl.h | 7 +++++++ clguetzli/clguetzli.cpp | 7 +++++++ clguetzli/clguetzli.cu | 7 +++++++ clguetzli/clguetzli.h | 7 +++++++ clguetzli/clguetzli_test.cpp | 7 
+++++++ clguetzli/clguetzli_test.h | 7 +++++++ clguetzli/cuguetzli.cpp | 7 +++++++ clguetzli/cuguetzli.h | 7 +++++++ clguetzli/cumem_pool.cpp | 6 ++++++ clguetzli/cumem_pool.h | 5 +++++ clguetzli/ocl.cpp | 6 ++++++ clguetzli/ocl.h | 6 ++++++ clguetzli/ocu.cpp | 5 +++++ clguetzli/ocu.h | 5 +++++ 19 files changed, 118 insertions(+), 2 deletions(-) diff --git a/BUILD b/BUILD index 8f2e28f1..c88d3890 100644 --- a/BUILD +++ b/BUILD @@ -8,10 +8,9 @@ cc_library( "guetzli/*.h", "guetzli/*.cc", "guetzli/*.inc", - "clguetzli/*.cpp", + "clguetzli/*.cpp", "clguetzli/*.h", "clguetzli/*.hpp" - ], exclude = ["guetzli/guetzli.cc"], ), diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp index 3d4eb7dd..d91055d5 100644 --- a/clguetzli/clbutter_comparator.cpp +++ b/clguetzli/clbutter_comparator.cpp @@ -1,3 +1,10 @@ +/* +* OpenCL/CUDA edition implementation of butter_comparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #include "clbutter_comparator.h" #include "clguetzli.h" #include "clguetzli_test.h" diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h index c26de1de..76380785 100644 --- a/clguetzli/clbutter_comparator.h +++ b/clguetzli/clbutter_comparator.h @@ -1,3 +1,10 @@ +/* +* OpenCL/CUDA edition implementation of butter_comparator. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #pragma once #include #include "butteraugli/butteraugli.h" diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index c2e67e80..2d18e8bd 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -1,3 +1,10 @@ +/* +* OpenCL Kernels +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #ifdef __USE_OPENCL__ #pragma OPENCL EXTENSION cl_khr_fp64 : enable diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp index 45533f60..619c0cfd 100644 --- a/clguetzli/clguetzli.cl.cpp +++ b/clguetzli/clguetzli.cl.cpp @@ -1,3 +1,10 @@ +/* +* OpenCL/CUDA edition implementation of ButteraugliComparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #include #include #include diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h index 761ed634..12543e42 100644 --- a/clguetzli/clguetzli.cl.h +++ b/clguetzli/clguetzli.cl.h @@ -1,3 +1,10 @@ +/* +* OpenCL/CUDA edition implementation of ButteraugliComparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #ifndef __CLGUETZLI_CL_H__ #define __CLGUETZLI_CL_H__ diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp index be8e8c10..52129927 100644 --- a/clguetzli/clguetzli.cpp +++ b/clguetzli/clguetzli.cpp @@ -1,3 +1,10 @@ +/* +* OpenCL edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #include "clguetzli.h" #include #include diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu index 974be98e..2b7a71c4 100644 --- a/clguetzli/clguetzli.cu +++ b/clguetzli/clguetzli.cu @@ -1 +1,8 @@ +/* +* CUDA Kernels +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #include "clguetzli/clguetzli.cl" diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h index c01da7a4..c4f3961c 100644 --- a/clguetzli/clguetzli.h +++ b/clguetzli/clguetzli.h @@ -1,3 +1,10 @@ +/* +* OpenCL edition implementation of guetzli. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #pragma once #include #include "guetzli/processor.h" diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp index 6e6fece8..2e5af412 100644 --- a/clguetzli/clguetzli_test.cpp +++ b/clguetzli/clguetzli_test.cpp @@ -1,3 +1,10 @@ +/* +* OpenCL test cases +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #ifdef __USE_OPENCL__ #include diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h index 94c0a2c6..dbc3c47a 100644 --- a/clguetzli/clguetzli_test.h +++ b/clguetzli/clguetzli_test.h @@ -1,3 +1,10 @@ +/* +* OpenCL test cases +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #pragma once #include "ocl.h" diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp index 1903c6eb..f348edb7 100644 --- a/clguetzli/cuguetzli.cpp +++ b/clguetzli/cuguetzli.cpp @@ -1,3 +1,10 @@ +/* +* CUDA edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #include "cuguetzli.h" #include #include "ocu.h" diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h index a75dcc46..8c3e3444 100644 --- a/clguetzli/cuguetzli.h +++ b/clguetzli/cuguetzli.h @@ -1,3 +1,10 @@ +/* +* CUDA edition implementation of guetzli. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ #pragma once #include "guetzli/processor.h" #include "clguetzli.cl.h" diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp index 4fe4964d..8252d3e7 100644 --- a/clguetzli/cumem_pool.cpp +++ b/clguetzli/cumem_pool.cpp @@ -1,3 +1,9 @@ +/* + * Memory Pool for CUDA + * + * Author: ianhuang@tencent.com + */ + #include "cumem_pool.h" #ifdef __USE_CUDA__ diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h index d2ceec04..b878d92f 100644 --- a/clguetzli/cumem_pool.h +++ b/clguetzli/cumem_pool.h @@ -1,3 +1,8 @@ +/* +* Memory Pool for CUDA +* +* Author: ianhuang@tencent.com +*/ #pragma once #ifdef __USE_CUDA__ diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp index f4427fff..851ab943 100644 --- a/clguetzli/ocl.cpp +++ b/clguetzli/ocl.cpp @@ -1,3 +1,9 @@ +/* +* OpenCL Manager +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +*/ #include "ocl.h" #include #include diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h index f3056dd8..7ccee2d8 100644 --- a/clguetzli/ocl.h +++ b/clguetzli/ocl.h @@ -1,3 +1,9 @@ +/* +* OpenCL Manager +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +*/ #pragma once #ifdef __USE_OPENCL__ diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp index ea66be55..b7395ed1 100644 --- a/clguetzli/ocu.cpp +++ b/clguetzli/ocu.cpp @@ -1,3 +1,8 @@ +/* +* CUDA Manager +* +* Author: strongtu@tencent.com +*/ #include "ocu.h" #ifdef __USE_CUDA__ diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h index 1c13e86e..93f675a3 100644 --- a/clguetzli/ocu.h +++ b/clguetzli/ocu.h @@ -1,3 +1,8 @@ +/* +* CUDA Manager +* 
+* Author: strongtu@tencent.com +*/ #pragma once #ifdef __USE_CUDA__ From 808e624565f3f2165be947271256071d1d596354 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 13 Jul 2017 23:14:05 +0800 Subject: [PATCH 187/189] Update ReadMe --- README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/README.md b/README.md index 2ecd1072..b316a904 100644 --- a/README.md +++ b/README.md @@ -99,3 +99,56 @@ attempts made. Please note that JPEG images do not support alpha channel (transparency). If the input is a PNG with an alpha channel, it will be overlaid on black background before encoding. + +# Extra features + +**Note:** Please make sure that you can build guetzli successfully before adding the following features. + +## Enable CUDA/OpenCL support + +**Note:** Before adding [CUDA](https://developer.nvidia.com/cuda-zone) support, please [check](http://developer.nvidia.com/cuda-gpus) whether your GPU support CUDA or not. + +**Note:** If you don't have an NVIDIA card that support CUDA, you can try [OpenCL](https://www.khronos.org/opencl/) instead. You can install any of the OpenCL SDKs, such as [Intel OpenCL SDK](https://software.intel.com/en-us/intel-opencl), [AMD OpenCL SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/), etc. + +**Note:** The steps for adding OpenCL support is very similar with adding CUDA support, so the following introduction will be only for CUDA. + +### On POSIX systems +1. Follow the [Installation Guide for Linux ](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Linux-pdf) to setup [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). +2. Edit `premake5.lua`, add `defines { "__USE_OPENCL__", "__USE_CUDA__" }` and `links { "OpenCL", "cuda" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile. +3. Run `make` and expect the binary to be created in `bin/Release/guetzli`. +4. 
Run `./compile.sh 64` or `./compile.sh 32` to build the 64 or 32 bits [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file, and the ptx file will be copied to `bin/Release/clguetzli`. + +### On Windows +1. Follow the [Installation Guide for Microsoft Windows](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Windows-pdf) to setup `CUDA Toolkit`. +2. Open the Visual Studio project and edit the project `Property Pages` as follows: + * Add `__USE_OPENCL__` and `__USE_CUDA__` to preprocessor definitions. + * Add `OpenCL.lib` and `cuda.lib` to additional dependencies. + * Add `$(CUDA_PATH)\include` to include directories. + * Add `$(CUDA_PATH)\lib\Win32` or `$(CUDA_PATH)\lib\x64` to library directories. +3. Build it. + +### Usage +```bash +guetzli [--c|--cuda|--opencl] [other options] original.png output.jpg +guetzli [--c|--cuda|--opencl] [other options] original.jpg output.jpg +``` +You can pass a `--c` parameter to enable the procedure optimization or `--cuda` parameter to use the CUDA acceleration or `--opencl` to use the OpenCL acceleration. + +If you have any question about CUDA/OpenCL support, please contact strongtu@tencent.com, ianhuang@tencent.com or chriskzhou@tencent.com. + +## Enable full JPEG format support +### On POSIX systems +1. Install [libjpeg](http://libjpeg.sourceforge.net/). + If using your operating system + package manager, install development versions of the packages if the + distinction exists. + * On Ubuntu, do `apt-get install libjpeg8-dev`. + * On Fedora, do `dnf install libjpeg-devel`. + * On Arch Linux, do `pacman -S libjpeg`. + * On Alpine Linux, do `apk add libjpeg`. +2. Edit `premake5.lua`, add `defines {"__SUPPORT_FULL_JPEG__"}` and `links { "jpeg" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile. +3. Run `make` and expect the binary to be created in `bin/Release/guetzli` +### On Windows +1. 
Install `libjpeg-turbo` using vcpkg: `.\vcpkg install libjpeg-turbo` +2. Open the Visual Studio project and add `__SUPPORT_FULL_JPEG__` to preprocessor definitions in the project `Property Pages`. +3. Build it. \ No newline at end of file From af12f124f345849de30a4fb19fc8bd72f7f66fcf Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Tue, 18 Jul 2017 10:46:57 +0800 Subject: [PATCH 188/189] Update ReadMe & fix some mistakes --- README.md | 11 +++++++---- clguetzli/clguetzli.cl | 2 +- compile.sh | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b316a904..37fa4267 100644 --- a/README.md +++ b/README.md @@ -115,17 +115,20 @@ before encoding. ### On POSIX systems 1. Follow the [Installation Guide for Linux ](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Linux-pdf) to setup [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). 2. Edit `premake5.lua`, add `defines { "__USE_OPENCL__", "__USE_CUDA__" }` and `links { "OpenCL", "cuda" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile. -3. Run `make` and expect the binary to be created in `bin/Release/guetzli`. -4. Run `./compile.sh 64` or `./compile.sh 32` to build the 64 or 32 bits [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file, and the ptx file will be copied to `bin/Release/clguetzli`. +3. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` at first line. +4. Run `make` and expect the binary to be created in `bin/Release/guetzli`. +5. Run `./compile.sh 64` or `./compile.sh 32` to build the 64 or 32 bits [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file, and the ptx file will be copied to `bin/Release/clguetzli`. ### On Windows 1. Follow the [Installation Guide for Microsoft Windows](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Windows-pdf) to setup `CUDA Toolkit`. -2. 
Open the Visual Studio project and edit the project `Property Pages` as follows: +2. Copy `\VC\bin\amd64\vcvars64.bat` as `\vcvars64.bat` +3. Open the Visual Studio project and edit the project `Property Pages` as follows: * Add `__USE_OPENCL__` and `__USE_CUDA__` to preprocessor definitions. * Add `OpenCL.lib` and `cuda.lib` to additional dependencies. * Add `$(CUDA_PATH)\include` to include directories. * Add `$(CUDA_PATH)\lib\Win32` or `$(CUDA_PATH)\lib\x64` to library directories. -3. Build it. +4. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` at first line. +5. Build it. ### Usage ```bash diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl index 2d18e8bd..2a8eb527 100644 --- a/clguetzli/clguetzli.cl +++ b/clguetzli/clguetzli.cl @@ -3417,4 +3417,4 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3], #undef double #endif -#endif __USE_OPENCL__ \ No newline at end of file +#endif //__USE_OPENCL__ \ No newline at end of file diff --git a/compile.sh b/compile.sh index 0b13d464..eabb6473 100755 --- a/compile.sh +++ b/compile.sh @@ -4,7 +4,7 @@ echo $1 --machine 64 or 32 echo $2 -G -nvcc -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu +nvcc -D__USE_OPENCL__ -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu #copy to ./bin/Release cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1 From 14ef86d66f9caf642fa6adf2ef6c6697e17c5b25 Mon Sep 17 00:00:00 2001 From: Zhou Ke Date: Thu, 20 Jul 2017 00:00:59 +0800 Subject: [PATCH 189/189] Update appveyor.xml --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 061ab6d0..97acb3ac 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -15,7 +15,7 @@ install: 
- premake5.exe %TOOLSET% - git clone https://github.com/Microsoft/vcpkg - md vcpkg\downloads\nuget-3.5.0 - - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe + - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/v3.5.0/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe - appveyor DownloadFile https://cmake.org/files/v3.8/cmake-3.8.0-rc1-win32-x86.zip -FileName %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - 7z x %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - cd vcpkg